Outline:
1. Problem background
2. Data acquisition
3. Data preprocessing and exploration
4. Decision tree pattern recognition
5. Remaining issues
The R package used is rpart (decision trees).
1. Problem background
Once a forum opens it always attracts many bogus registrations:
accounts created by machines to farm credits and gifts and to spread advertisements.
We want to find patterns that identify likely zombie registration ids.
2. Data acquisition
The data is taken from the spammer ("垃圾虫") contest on 老虎's dataguru forum.
3. Data preprocessing and exploration
Each record consists of a forum id and an email string.
Convert the data to CSV, then load it into an Oracle database with sqlldr
(any database would do; only the loading step differs).
We do the initial processing and exploration in the database,
deriving six features that discriminate between the records.
create table org
(
num number,
id varchar2(300),
email varchar2(300)
);
--table holding the raw data
create table add_org
(
num number,
id varchar2(300),
email varchar2(300),
eff_id varchar2(1), --label: '1' = spammer, '0' = normal user
f1 varchar2(100), --number of upper/lower-case alternations in the email
f2 varchar2(100), --number of letter/digit alternations in the email
f3 varchar2(100), --number of upper/lower-case alternations in the id
f4 varchar2(100), --number of letter/digit alternations in the id
f5 varchar2(100), --whether the email is all digits
f6 varchar2(100) --whether the forum id contains Chinese characters
);
--feature-extraction functions
create or replace package v_add
as
function test_up_low(v_str in varchar2) return varchar2; --count upper/lower-case alternations
function test_nm_str(v_str in varchar2) return varchar2; --count letter/digit alternations
function test_fullnum(v_str in varchar2) return varchar2; --'1' if the string is all digits
function test_hav_han(v_str in varchar2) return varchar2; --'1' if the string contains Chinese characters
end v_add;
/
create or replace package body v_add
as
function test_up_low(v_str in varchar2) return varchar2
as
v_cnt varchar2(100);
v_num number;
begin
--split the string into characters, classify each as lower case (1),
--upper case (2) or other (3), then count adjacent pairs whose case flips
select
nvl(sum(case when hv=1 and hg=2 then 1
             when hv=2 and hg=1 then 1
             else 0
        end),0) into v_num
from
(
select h,v,g,
case when ascii(v) between ascii('a') and ascii('z') then 1
     when ascii(v) between ascii('A') and ascii('Z') then 2
     else 3
end as hv,
case when ascii(g) between ascii('a') and ascii('z') then 1
     when ascii(g) between ascii('A') and ascii('Z') then 2
     else 3
end as hg
from
(
--pair every character with its predecessor
select h,v,lag(v) over(order by h) g
from
(
select level h , substr(v_str,level,1) v
from dual
connect by level<=length(v_str)
)
)
) ;
v_cnt := to_char(v_num);
return v_cnt;
end test_up_low;
function test_nm_str(v_str in varchar2) return varchar2
as
v_cnt varchar2(100);
v_num number;
begin
--classify each character as letter (1), digit (2) or other (3),
--then count adjacent pairs that switch between letter and digit
select
nvl(sum(case when hv=1 and hg=2 then 1
             when hv=2 and hg=1 then 1
             else 0
        end),0) into v_num
from
(
select h,v,g,
case when ascii(lower(v)) between ascii('a') and ascii('z') then 1
     when v between '0' and '9' then 2
     else 3
end as hv,
case when ascii(lower(g)) between ascii('a') and ascii('z') then 1
     when g between '0' and '9' then 2
     else 3
end as hg
from
(
--pair every character with its predecessor
select h,v,lag(v) over(order by h) g
from
(
select level h , substr(v_str,level,1) v
from dual
connect by level<=length(v_str)
)
)
) ;
v_cnt := to_char(v_num);
return v_cnt;
end test_nm_str;
function test_fullnum(v_str in varchar2) return varchar2
as
v_cnt varchar2(100);
v_num number;
begin
--count the characters that are not digits
select
sum(case when v between '0' and '9' then 0
         else 1
    end) into v_num
from
(
select level h , substr(v_str,level,1) v
from dual
connect by level<=length(v_str)
);
if v_num=0
then v_cnt:='1' ; --all digits
else v_cnt:='0' ;
end if;
return v_cnt;
end test_fullnum;
function test_hav_han(v_str in varchar2) return varchar2
as
v_cnt varchar2(100);
v_num1 number;
v_num2 number;
begin
v_num1 := lengthb(v_str); --length in bytes
v_num2 := length(v_str);  --length in characters
if v_num1>v_num2
then v_cnt:='1' ; --multibyte characters present, i.e. contains Chinese
else v_cnt:='0' ;
end if;
return v_cnt;
end test_hav_han;
end v_add;
/
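As an aside, the same features can be reproduced outside the database. Below is a rough R equivalent of the four functions (my sketch, not part of the original workflow; it assumes single-byte ASCII input for the alternation counts and a multibyte locale for the Chinese-character test):
#count transitions between two character classes along a string
count_trans <- function(s, class_a, class_b) {
  ch  <- strsplit(s, "")[[1]]
  cls <- ifelse(grepl(class_a, ch), 1L, ifelse(grepl(class_b, ch), 2L, 3L))
  if (length(cls) < 2) return(0L)
  prev <- cls[-length(cls)]; cur <- cls[-1]
  sum((prev == 1L & cur == 2L) | (prev == 2L & cur == 1L))
}
test_up_low  <- function(s) count_trans(s, "[a-z]", "[A-Z]")          #f1/f3
test_nm_str  <- function(s) count_trans(s, "[a-zA-Z]", "[0-9]")       #f2/f4
test_fullnum <- function(s) as.integer(grepl("^[0-9]+$", s))          #f5
test_hav_han <- function(s) as.integer(nchar(s, "bytes") > nchar(s))  #f6
test_up_low("aBcD")    #3 case alternations
test_nm_str("abc123x") #2 letter/digit alternations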
--build the feature data set
insert into add_org
select
num,id,email,case when num between 1 and 170 then '1' else '0' end, --rows 1..170 are the labelled spammers
v_add.test_up_low(email),
v_add.test_nm_str(email),
v_add.test_up_low(id),
v_add.test_nm_str(id),
v_add.test_fullnum(email),
v_add.test_hav_han(id)
from org;
--determine which patterns the string features can and cannot identify
select eff_id,f1,f2,f3,f4,f5,f6,count(1)
from add_org
group by eff_id,f1,f2,f3,f4,f5,f6
order by eff_id,f1,f2,f3,f4,f5,f6;
--note that the patterns 000000 and 000001 account for many ids among both spammers and normal users
--we call 000000 and 000001 the classes that cannot be identified from these string features; they are set aside to be judged by other rules
--drop those rows
select eff_id,f1,f2,f3,f4,f5,f6
from add_org
where f1||f2||f3||f4||f5||f6 not in ('000000','000001');
--export the result set for R, producing data.csv
4. Decision tree pattern recognition
We load the data into R and analyse it with rpart's decision tree.
The R code is as follows:
#load the rpart package
> library(rpart);
#load the data
> rc<-read.csv("data.csv");
#fit a classification tree
> fit<-rpart(EFF_ID~.,method="class",data=rc)
#examine the complexity-parameter (cp) table
> printcp(fit);
Classification tree:
rpart(formula = EFF_ID ~ ., data = rc, method = "class")
Variables actually used in tree construction:
[1] F1 F2 F3 F4
Root node error: 93/191 = 0.48691
n= 191
CP nsplit rel error xerror xstd
1 0.279570 0 1.00000 1.00000 0.074277
2 0.086022 1 0.72043 0.75269 0.071605
3 0.016129 3 0.54839 0.61290 0.067997
4 0.010000 5 0.51613 0.67742 0.069868
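In the cp table, xerror is the cross-validated relative error; it is lowest at nsplit = 3 (cp ~ 0.016), so the tree could arguably be pruned there. A minimal sketch of that step (my addition, not part of the original session):
#prune at the cp value with the lowest cross-validated error
best_cp <- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
fit_pruned <- prune(fit, cp = best_cp)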
> plotcp(fit);
> summary(fit);
Call:
rpart(formula = EFF_ID ~ ., data = rc, method = "class")
n= 191
CP nsplit rel error xerror xstd
1 0.27956989 0 1.0000000 1.0000000 0.07427709
2 0.08602151 1 0.7204301 0.7526882 0.07160479
3 0.01612903 3 0.5483871 0.6129032 0.06799707
4 0.01000000 5 0.5161290 0.6774194 0.06986757
Node number 1: 191 observations, complexity param=0.2795699
predicted class=0 expected loss=0.486911
class counts: 98 93
probabilities: 0.513 0.487
left son=2 (161 obs) right son=3 (30 obs)
Primary splits:
F2 < 1.5 to the left, improve=14.1856900, (0 missing)
F4 < 0.5 to the right, improve=12.0456300, (0 missing)
F1 < 0.5 to the left, improve= 6.4681680, (0 missing)
F6 < 0.5 to the left, improve= 3.3873020, (0 missing)
F3 < 2.5 to the left, improve= 0.4604841, (0 missing)
Node number 2: 161 observations, complexity param=0.08602151
predicted class=0 expected loss=0.4037267
class counts: 96 65
probabilities: 0.596 0.404
left son=4 (82 obs) right son=5 (79 obs)
Primary splits:
F4 < 0.5 to the right, improve=6.1305330, (0 missing)
F1 < 1.5 to the left, improve=4.7862210, (0 missing)
F6 < 0.5 to the left, improve=2.3143120, (0 missing)
F3 < 2.5 to the left, improve=1.2670870, (0 missing)
F2 < 0.5 to the left, improve=0.8315453, (0 missing)
Surrogate splits:
F3 < 0.5 to the left, agree=0.677, adj=0.342, (0 split)
F2 < 0.5 to the left, agree=0.634, adj=0.253, (0 split)
F6 < 0.5 to the left, agree=0.627, adj=0.241, (0 split)
F1 < 0.5 to the left, agree=0.578, adj=0.139, (0 split)
Node number 3: 30 observations
predicted class=1 expected loss=0.06666667
class counts: 2 28
probabilities: 0.067 0.933
Node number 4: 82 observations, complexity param=0.08602151
predicted class=0 expected loss=0.2682927
class counts: 60 22
probabilities: 0.732 0.268
left son=8 (67 obs) right son=9 (15 obs)
Primary splits:
F4 < 1.5 to the left, improve=10.380200, (0 missing)
F2 < 0.5 to the left, improve= 0.205122, (0 missing)
Node number 5: 79 observations, complexity param=0.01612903
predicted class=1 expected loss=0.4556962
class counts: 36 43
probabilities: 0.456 0.544
left son=10 (68 obs) right son=11 (11 obs)
Primary splits:
F1 < 1.5 to the left, improve=1.91714600, (0 missing)
F3 < 0.5 to the right, improve=0.34711480, (0 missing)
F6 < 0.5 to the right, improve=0.03304112, (0 missing)
F2 < 0.5 to the right, improve=0.01038624, (0 missing)
Node number 8: 67 observations
predicted class=0 expected loss=0.1492537
class counts: 57 10
probabilities: 0.851 0.149
Node number 9: 15 observations
predicted class=1 expected loss=0.2
class counts: 3 12
probabilities: 0.200 0.800
Node number 10: 68 observations, complexity param=0.01612903
predicted class=0 expected loss=0.5
class counts: 34 34
probabilities: 0.500 0.500
left son=20 (57 obs) right son=21 (11 obs)
Primary splits:
F3 < 2.5 to the left, improve=0.48803830, (0 missing)
F2 < 0.5 to the left, improve=0.15111110, (0 missing)
F6 < 0.5 to the right, improve=0.03651987, (0 missing)
Node number 11: 11 observations
predicted class=1 expected loss=0.1818182
class counts: 2 9
probabilities: 0.182 0.818
Node number 20: 57 observations
predicted class=0 expected loss=0.4736842
class counts: 30 27
probabilities: 0.526 0.474
Node number 21: 11 observations
predicted class=1 expected loss=0.3636364
class counts: 4 7
probabilities: 0.364 0.636
#plot the tree
> plot(fit,uniform=T);
#display the decision rules on the plot
> text(fit,use.n=T,fancy=T,col="blue")
#print the fitted model:
> fit
n= 191
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 191 93 0 (0.51308901 0.48691099)
2) F2< 1.5 161 65 0 (0.59627329 0.40372671)
4) F4>=0.5 82 22 0 (0.73170732 0.26829268)
8) F4< 1.5 67 10 0 (0.85074627 0.14925373) *
9) F4>=1.5 15 3 1 (0.20000000 0.80000000) *
5) F4< 0.5 79 36 1 (0.45569620 0.54430380)
10) F1< 1.5 68 34 0 (0.50000000 0.50000000)
20) F3< 2.5 57 27 0 (0.52631579 0.47368421) *
21) F3>=2.5 11 4 1 (0.36363636 0.63636364) *
11) F1>=1.5 11 2 1 (0.18181818 0.81818182) *
3) F2>=1.5 30 2 1 (0.06666667 0.93333333) *
Rule summary:
1. F2>=1.5: classify as spammer (30 cases, 2 misclassified)
2. F2<1.5 and F4>=0.5 and F4<1.5: classify as normal user (67 cases, 10 misclassified)
3. F2<1.5 and F4>=1.5: classify as spammer (15 cases, 3 misclassified)
4. F2<1.5 and F4<0.5 and F1<1.5 and F3<2.5: undecidable (57 cases: 30 normal, 27 spammers)
5. F2<1.5 and F4<0.5 and F1<1.5 and F3>=2.5: classify as spammer (11 cases, 4 misclassified)
6. F2<1.5 and F4<0.5 and F1>=1.5: classify as spammer (11 cases, 2 misclassified)
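With the tree in hand, new registrations can be scored directly; a small example (the new_user values are made up for illustration):
#score a new registration against the fitted tree
new_user <- data.frame(F1 = 0, F2 = 2, F3 = 0, F4 = 0, F5 = 0, F6 = 0)
predict(fit, new_user, type = "class") #F2 >= 1.5, so rule 1 fires: spammer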
5. Remaining issues
Modelling on names alone leaves the data rather thin.
A score built from several dimensions, behaviour as well as names, should improve the predictive power.
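As one way to build such a score, a logistic regression over the same six features would give each user a spam probability rather than a hard label (an illustrative sketch only; behavioural features would enter as extra columns):
#a simple scoring model on the same features
score_fit <- glm(EFF_ID ~ ., family = binomial, data = rc)
rc$score  <- predict(score_fit, type = "response") #estimated spam probability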
data.csv is attached.