2014-05-19
Outline:
1. Problem background
2. Data acquisition
3. Data preprocessing and analysis
4. Decision-tree pattern recognition
5. Remaining issues
The R package used is rpart, for decision trees.
1. Problem background
Once a forum opens, it always attracts a flood of registrations from bots: machine-registered accounts that farm credits and gifts and spread ads. The goal is to find patterns that identify likely zombie registration IDs.
2. Data acquisition
The data come from the spammer-identification contest on 老虎's dataguru forum.
3. Data preprocessing and analysis
The raw data consist of forum IDs and email strings. Convert them to CSV, then load them into an Oracle database with sqlldr (any database would do; only the loading step differs). Initial processing and exploration in the database yields six features for separating the records.
-- table for the raw data
create table org
(
  num   number,
  id    varchar2(300),
  email varchar2(300)
);
-- table for the engineered features
create table add_org
(
  num    number,
  id     varchar2(300),
  email  varchar2(300),
  eff_id varchar2(1),
  f1     varchar2(100),  -- upper/lowercase alternations in the email
  f2     varchar2(100),  -- letter/digit alternations in the email
  f3     varchar2(100),  -- upper/lowercase alternations in the id
  f4     varchar2(100),  -- letter/digit alternations in the id
  f5     varchar2(100),  -- whether the email is all digits
  f6     varchar2(100)   -- whether the forum id contains Chinese characters
);
-- functions that compute the features
create or replace package v_add
as
function test_up_low(v_str in varchar2) return varchar2;  -- count case alternations
function test_nm_str(v_str in varchar2) return varchar2;  -- count letter/digit alternations
function test_fullnum(v_str in varchar2) return varchar2; -- '1' if the string is all digits
function test_hav_han(v_str in varchar2) return varchar2; -- '1' if the string contains Chinese characters
end v_add;
/
create or replace package body v_add
as
function test_up_low(v_str in varchar2) return varchar2
as
v_cnt varchar2(100);
v_num number;
begin
-- classify each character (1 = lowercase, 2 = uppercase, 3 = other)
-- and count adjacent pairs whose classes flip between 1 and 2
select nvl(sum(case when hv = 1 and hg = 2 then 1
                    when hv = 2 and hg = 1 then 1
                    else 0 end), 0)
  into v_num
from
(
  select h, v, g,
         case when v between 'a' and 'z' then 1
              when v between 'A' and 'Z' then 2
              else 3 end as hv,
         case when g between 'a' and 'z' then 1
              when g between 'A' and 'Z' then 2
              else 3 end as hg
  from
  (
    -- one row per character; g is the preceding character
    select h, v, lag(v) over (order by h) g
    from
    (
      select level h, substr(v_str, level, 1) v
      from dual
      connect by level <= length(v_str)
    )
  )
);
v_cnt := to_char(v_num);
return v_cnt;
end test_up_low;
function test_nm_str(v_str in varchar2) return varchar2
as
v_cnt varchar2(100);
v_num number;
begin
-- classify each character (1 = letter, 2 = digit, 3 = other)
-- and count adjacent pairs whose classes flip between 1 and 2
select nvl(sum(case when hv = 1 and hg = 2 then 1
                    when hv = 2 and hg = 1 then 1
                    else 0 end), 0)
  into v_num
from
(
  select h, v, g,
         case when lower(v) between 'a' and 'z' then 1
              when v between '0' and '9' then 2
              else 3 end as hv,
         case when lower(g) between 'a' and 'z' then 1
              when g between '0' and '9' then 2
              else 3 end as hg
  from
  (
    -- one row per character; g is the preceding character
    select h, v, lag(v) over (order by h) g
    from
    (
      select level h, substr(v_str, level, 1) v
      from dual
      connect by level <= length(v_str)
    )
  )
);
v_cnt := to_char(v_num);
return v_cnt;
end test_nm_str;
function test_fullnum(v_str in varchar2) return varchar2
as
v_cnt varchar2(100);
v_num number;
begin
-- count the characters that are not digits
select sum(case when v between '0' and '9' then 0 else 1 end)
  into v_num
from
(
  select level h, substr(v_str, level, 1) v
  from dual
  connect by level <= length(v_str)
);
if v_num = 0
then v_cnt := '1';  -- all digits
else v_cnt := '0';
end if;
return v_cnt;
end test_fullnum;
function test_hav_han(v_str in varchar2) return varchar2
as
v_cnt varchar2(100);
v_num1 number;
v_num2 number;
begin
-- lengthb counts bytes, length counts characters; a difference
-- means the string contains multibyte (here, Chinese) characters
v_num1 := lengthb(v_str);
v_num2 := length(v_str);
if v_num1 > v_num2
then v_cnt := '1';  -- contains Chinese characters
else v_cnt := '0';
end if;
return v_cnt;
end test_hav_han;
end v_add;
/
-- build the feature data set (rows 1-170 are the labeled spammers)
insert into add_org
select
  num, id, email,
  case when num between 1 and 170 then '1' else '0' end,  -- eff_id: 1 = spammer
  v_add.test_up_low(email),
  v_add.test_nm_str(email),
  v_add.test_up_low(id),
  v_add.test_nm_str(id),
  v_add.test_fullnum(email),
  v_add.test_hav_han(id)
from org;
-- see which feature patterns are actually separable
select eff_id, f1, f2, f3, f4, f5, f6, count(1)
from add_org
group by eff_id, f1, f2, f3, f4, f5, f6
order by eff_id, f1, f2, f3, f4, f5, f6;
-- Patterns 000000 and 000001 account for a large share of both spammers
-- and normal users. We call them the classes that cannot be identified
-- from the strings alone; they must be held back and judged by other
-- rules. Exclude them:
select eff_id, f1, f2, f3, f4, f5, f6
from add_org
where f1 || f2 || f3 || f4 || f5 || f6 not in ('000000', '000001');
-- export the result for R as data.csv
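For readers without an Oracle instance, the same six features can also be built directly in R. Below is a minimal sketch, not the original pipeline: the file name org.csv and the helper transitions() are my own inventions, while the 1-170 labeling rule and the 000000/000001 exclusion mirror the SQL above.
# character-class transition counter, mirroring test_up_low/test_nm_str
transitions <- function(s, class1, class2) {
  ch <- strsplit(s, "")[[1]]
  if (length(ch) < 2) return(0)
  cls <- ifelse(grepl(class1, ch), 1, ifelse(grepl(class2, ch), 2, 3))
  sum((head(cls, -1) == 1 & tail(cls, -1) == 2) |
      (head(cls, -1) == 2 & tail(cls, -1) == 1))
}
org <- read.csv("org.csv", stringsAsFactors = FALSE)  # assumed file name
rc <- data.frame(
  EFF_ID = as.integer(org$num >= 1 & org$num <= 170),       # rows 1-170 are the labeled spammers
  F1 = sapply(org$email, transitions, "[a-z]", "[A-Z]"),    # case alternations in email
  F2 = sapply(org$email, transitions, "[a-zA-Z]", "[0-9]"), # letter/digit alternations in email
  F3 = sapply(org$id,    transitions, "[a-z]", "[A-Z]"),    # case alternations in id
  F4 = sapply(org$id,    transitions, "[a-zA-Z]", "[0-9]"), # letter/digit alternations in id
  F5 = as.integer(grepl("^[0-9]+$", org$email)),            # email all digits
  F6 = as.integer(grepl("[\u4e00-\u9fa5]", org$id))         # id contains Chinese characters
)
# drop the 000000/000001 patterns the strings cannot separate
patt <- paste0(rc$F1, rc$F2, rc$F3, rc$F4, rc$F5, rc$F6)
write.csv(rc[!patt %in% c("000000", "000001"), ], "data.csv", row.names = FALSE)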
4. Decision-tree pattern recognition
We load the data into R and analyze it with a decision tree. The R session follows:
# load the decision-tree package
> library(rpart);
# load the data
> rc<-read.csv("data.csv");
# fit a classification tree
> fit<-rpart(EFF_ID~.,method="class",data=rc)
# inspect the fitted model's cp table
> printcp(fit);
Classification tree:
rpart(formula = EFF_ID ~ ., data = rc, method = "class")
Variables actually used in tree construction:
[1] F1 F2 F3 F4
Root node error: 93/191 = 0.48691
n= 191
        CP nsplit rel error  xerror     xstd
1 0.279570      0   1.00000 1.00000 0.074277
2 0.086022      1   0.72043 0.75269 0.071605
3 0.016129      3   0.54839 0.61290 0.067997
4 0.010000      5   0.51613 0.67742 0.069868
> plotcp(fit);
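plotcp graphs the cross-validated error against cp. In the cp table above, xerror bottoms out at 0.61290 in the third row and rises again in the fourth, which suggests pruning the tree at that cp; a minimal sketch using rpart's standard prune() (best.cp and fit.pruned are illustrative names; the rest of the post continues with the unpruned fit):
# prune at the cp value with the smallest cross-validated error
> best.cp <- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
> fit.pruned <- prune(fit, cp = best.cp)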
> summary(fit);
Call:
rpart(formula = EFF_ID ~ ., data = rc, method = "class")
  n= 191
          CP nsplit rel error    xerror       xstd
1 0.27956989      0 1.0000000 1.0000000 0.07427709
2 0.08602151      1 0.7204301 0.7526882 0.07160479
3 0.01612903      3 0.5483871 0.6129032 0.06799707
4 0.01000000      5 0.5161290 0.6774194 0.06986757
Node number 1: 191 observations,    complexity param=0.2795699
  predicted class=0  expected loss=0.486911
    class counts:    98    93
   probabilities: 0.513 0.487
  left son=2 (161 obs) right son=3 (30 obs)
  Primary splits:
      F2 < 1.5 to the left,  improve=14.1856900, (0 missing)
      F4 < 0.5 to the right, improve=12.0456300, (0 missing)
      F1 < 0.5 to the left,  improve= 6.4681680, (0 missing)
      F6 < 0.5 to the left,  improve= 3.3873020, (0 missing)
      F3 < 2.5 to the left,  improve= 0.4604841, (0 missing)
Node number 2: 161 observations,    complexity param=0.08602151
  predicted class=0  expected loss=0.4037267
    class counts:    96    65
   probabilities: 0.596 0.404
  left son=4 (82 obs) right son=5 (79 obs)
  Primary splits:
      F4 < 0.5 to the right, improve=6.1305330, (0 missing)
      F1 < 1.5 to the left,  improve=4.7862210, (0 missing)
      F6 < 0.5 to the left,  improve=2.3143120, (0 missing)
      F3 < 2.5 to the left,  improve=1.2670870, (0 missing)
      F2 < 0.5 to the left,  improve=0.8315453, (0 missing)
  Surrogate splits:
      F3 < 0.5 to the left,  agree=0.677, adj=0.342, (0 split)
      F2 < 0.5 to the left,  agree=0.634, adj=0.253, (0 split)
      F6 < 0.5 to the left,  agree=0.627, adj=0.241, (0 split)
      F1 < 0.5 to the left,  agree=0.578, adj=0.139, (0 split)
Node number 3: 30 observations
  predicted class=1  expected loss=0.06666667
    class counts:     2    28
   probabilities: 0.067 0.933
Node number 4: 82 observations,    complexity param=0.08602151
  predicted class=0  expected loss=0.2682927
    class counts:    60    22
   probabilities: 0.732 0.268
  left son=8 (67 obs) right son=9 (15 obs)
  Primary splits:
      F4 < 1.5 to the left,  improve=10.380200, (0 missing)
      F2 < 0.5 to the left,  improve= 0.205122, (0 missing)
Node number 5: 79 observations,    complexity param=0.01612903
  predicted class=1  expected loss=0.4556962
    class counts:    36    43
   probabilities: 0.456 0.544
  left son=10 (68 obs) right son=11 (11 obs)
  Primary splits:
      F1 < 1.5 to the left,  improve=1.91714600, (0 missing)
      F3 < 0.5 to the right, improve=0.34711480, (0 missing)
      F6 < 0.5 to the right, improve=0.03304112, (0 missing)
      F2 < 0.5 to the right, improve=0.01038624, (0 missing)
Node number 8: 67 observations
  predicted class=0  expected loss=0.1492537
    class counts:    57    10
   probabilities: 0.851 0.149
Node number 9: 15 observations
  predicted class=1  expected loss=0.2
    class counts:     3    12
   probabilities: 0.200 0.800
Node number 10: 68 observations,    complexity param=0.01612903
  predicted class=0  expected loss=0.5
    class counts:    34    34
   probabilities: 0.500 0.500
  left son=20 (57 obs) right son=21 (11 obs)
  Primary splits:
      F3 < 2.5 to the left,  improve=0.48803830, (0 missing)
      F2 < 0.5 to the left,  improve=0.15111110, (0 missing)
      F6 < 0.5 to the right, improve=0.03651987, (0 missing)
Node number 11: 11 observations
  predicted class=1  expected loss=0.1818182
    class counts:     2     9
   probabilities: 0.182 0.818
Node number 20: 57 observations
  predicted class=0  expected loss=0.4736842
    class counts:    30    27
   probabilities: 0.526 0.474
Node number 21: 11 observations
  predicted class=1  expected loss=0.3636364
    class counts:     4     7
   probabilities: 0.364 0.636
# plot the tree
> plot(fit,uniform=T);
# overlay the decision rules
> text(fit,use.n=T,fancy=T,col="blue")
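The base-graphics plot can be hard to read. If the separate rpart.plot package is available (an extra dependency, not used in this post), it draws a cleaner tree:
# optional: prettier rendering with the rpart.plot package
> library(rpart.plot);
> rpart.plot(fit);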
# print the fitted model:
> fit
n= 191
node), split, n, loss, yval, (yprob)
      * denotes terminal node
1) root 191 93 0 (0.51308901 0.48691099)  
   2) F2< 1.5 161 65 0 (0.59627329 0.40372671)  
     4) F4>=0.5 82 22 0 (0.73170732 0.26829268)  
       8) F4< 1.5 67 10 0 (0.85074627 0.14925373) *
       9) F4>=1.5 15  3 1 (0.20000000 0.80000000) *
     5) F4< 0.5 79 36 1 (0.45569620 0.54430380)  
      10) F1< 1.5 68 34 0 (0.50000000 0.50000000)  
        20) F3< 2.5 57 27 0 (0.52631579 0.47368421) *
        21) F3>=2.5 11  4 1 (0.36363636 0.63636364) *
      11) F1>=1.5 11  2 1 (0.18181818 0.81818182) *
   3) F2>=1.5 30  2 1 (0.06666667 0.93333333) *
Rule summary:
1. F2 >= 1.5: classify as spammer (30 total, 2 misclassified)
2. F2 < 1.5 and F4 >= 0.5 and F4 < 1.5: classify as normal user (67 total, 10 misclassified)
3. F2 < 1.5 and F4 >= 1.5: classify as spammer (15 total, 3 misclassified)
4. F2 < 1.5 and F4 < 0.5 and F1 < 1.5 and F3 < 2.5: undecidable (57 total: 30 normal, 27 spammers)
5. F2 < 1.5 and F4 < 0.5 and F1 < 1.5 and F3 >= 2.5: classify as spammer (11 total, 4 misclassified)
6. F2 < 1.5 and F4 < 0.5 and F1 >= 1.5: classify as spammer (11 total, 2 misclassified)
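To see how these rules perform in aggregate, the tree can be scored against the data it was fit on; a short sketch (resubstitution only, so the error estimate is optimistic):
# confusion matrix of the fitted tree on the training data
> pred <- predict(fit, rc, type="class");
> table(predicted=pred, actual=rc$EFF_ID);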
5. Remaining issues
Modeling on names alone leaves the data rather thin. A score built from several dimensions, behavior as well as names, should improve predictive power.
data.csv is attached.

Replies
2014-5-19 17:39:15
I have no idea what this means, but it sounds impressive = =