Basic Data Mining Operations and Algorithm Application Cases: Logistic Regression Modeling
Code
###Logistic Regression Modeling
####Case study: can a player make the All-Star team?
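##Background: logistic regression models the probability p of a binary outcome
##(here, making the All-Star team) through the log-odds, log(p/(1-p)) =
##b0 + b1*x1 + ... + bk*xk. glm() with family=binomial fits the coefficients by
##maximum likelihood, and predict(...,type="response") converts back to probabilities.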
##Read and inspect the dataset
library(xlsx)
nba=read.xlsx("nba2013.xlsx",sheetIndex=1,header=T)
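##Note: the xlsx package depends on Java (via rJava). If that is an obstacle, the
##readxl package reads the same file without Java (a sketch, same file assumed):
#nba=as.data.frame(readxl::read_excel("nba2013.xlsx",sheet=1))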
names(nba) #column names
n=dim(nba);n #481 rows by 32 columns
str(nba) #inspect the internal structure of the dataset
nba$allstar<-as.factor(nba$allstar) #treat the 0/1 response as a factor
nba1<-nba[,c(2,3,8,11,14,24,25,26,29,32)] #hand-pick the variables of interest
#names(nba1)
##Next: create indicator variables for the categorical variable pos (player position) and build a new dataset
m1=rep(1,n[1]) #a vector of 481 ones
m0=rep(0,n[1]) #a vector of 481 zeros
table(nba1$pos) #levels: C (center), PF (power forward), PG (point guard),
#SF (small forward), SG (shooting guard), F (not an actual position)
nba1$posC=ifelse(nba1$pos=="C",m1,m0) #m1 (1) where the condition holds, m0 (0) otherwise
nba1$posPF=ifelse(nba1$pos=="PF",m1,m0)
nba1$posPG=ifelse(nba1$pos=="PG",m1,m0)
nba1$posSF=ifelse(nba1$pos=="SF",m1,m0)
nba1$posSG=ifelse(nba1$pos=="SG",m1,m0)
#names(nba1)
nba2<-nba1[,-1] #drop the original pos variable
#str(nba2)
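##Aside: glm() expands factor predictors into dummy variables automatically, so the
##manual indicators above are mainly illustrative. Two equivalent sketches
##(posDummies is an illustrative name):
#glm(allstar~.,family=binomial,nba1)       #pos enters as treatment-coded dummies
#posDummies=model.matrix(~pos-1,data=nba1) #one 0/1 column per level of pos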
##Next: fit a logistic regression on the full data
glm_model=glm(allstar~.,family=binomial,nba2)
summary(glm_model)
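##summary() reports "(1 not defined because of singularities)": x2p is dropped,
##most likely because fg, x2p and x3p are linearly dependent (in box-score data,
##field goals made = 2-pointers made + 3-pointers made). A quick check (sketch):
#all(nba2$fg==nba2$x2p+nba2$x3p) #TRUE if the three columns are exactly collinear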
##The fit is not ideal: one coefficient is undefined and the position dummies have
##enormous standard errors. Next: screen variables by stepwise regression on AIC
step(glm_model) #resulting formula: allstar~age+fg+x3p+ast+blk+pts
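##Note: step() defaults to direction="both", but with no scope supplied there is
##nothing to add, so this run amounts to backward elimination by AIC.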
names(nba2)
nba3<-nba2[,c(1,2,3,5,7,8,9)] #keep the variables selected above, plus allstar
##Build the training and test sets
set.seed(123) #fix the random seed for reproducibility
n1=sample(dim(nba3)[1],floor(dim(nba3)[1]*0.69));n1 #row indices of a 69% sample
trainNBA<-nba3[n1,] #training set
testNBA<-nba3[-n1,] #test set
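##Because all-stars are rare, a plain random split can leave few positive cases in
##either set. A stratified split samples within each class instead (a sketch;
##i1, i0, idx, trainS, testS are illustrative names):
#set.seed(123)
#i1=which(nba3$allstar=="1"); i0=which(nba3$allstar=="0")
#idx=c(sample(i1,floor(length(i1)*0.69)),sample(i0,floor(length(i0)*0.69)))
#trainS=nba3[idx,]; testS=nba3[-idx,] #stratified training/test sets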
##Fit the model on the training set
ModelFit<-glm(allstar~.,family=binomial,trainNBA)
summary(ModelFit)
trainPre<-predict(ModelFit,trainNBA,type="response") #predicted probabilities on the training set
trainPre1<-rep(0,dim(trainNBA)[1])
trainPre1[trainPre>0.5]=1 #classify as 1 wherever the predicted probability exceeds 0.5
table(trainNBA$allstar,trainPre1)
paste(round(mean(trainNBA$allstar==trainPre1)*100,2),"%",sep="")
#training-set accuracy
##Use the test set for prediction
testPre<-predict(ModelFit,testNBA,type="response") #predicted probabilities on the test set
testPre1<-rep(0,dim(testNBA)[1])
testPre1[testPre>0.5]=1
table(testNBA$allstar,testPre1)
paste(round(mean(testNBA$allstar==testPre1)*100,2),"%",sep="")
#test-set accuracy
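##With so few all-stars, overall accuracy mostly reflects the majority class; the
##confusion tables above are more informative. Class-wise rates (a sketch, assuming
##both 0 and 1 occur among the predictions so the table has both columns):
#tab=table(testNBA$allstar,testPre1)
#tab["1","1"]/sum(tab["1",]) #sensitivity: recall on actual all-stars
#tab["0","0"]/sum(tab["0",]) #specificity: recall on non-all-stars
#pROC::auc(pROC::roc(testNBA$allstar,testPre)) #threshold-free AUC, if pROC is installed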
Output (partial)
> summary(glm_model)
Call:
glm(formula = allstar ~ ., family = binomial, data = nba2)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6821 -0.1099 -0.0373 -0.0163 3.6923
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.162e+01 1.683e+03 -0.013 0.98975
age 1.733e-01 9.174e-02 1.889 0.05885 .
fg -1.960e-02 1.180e-02 -1.661 0.09673 .
x3p -9.500e-03 7.581e-03 -1.253 0.21020
x2p NA NA NA NA
ast 3.411e-03 3.675e-03 0.928 0.35333
stl 2.387e-03 1.275e-02 0.187 0.85151
blk 1.767e-02 9.600e-03 1.840 0.06575 .
pts 1.237e-02 4.675e-03 2.646 0.00814 **
posC 7.810e+00 1.683e+03 0.005 0.99630
posPF 7.672e+00 1.683e+03 0.005 0.99636
posPG 7.685e+00 1.683e+03 0.005 0.99636
posSF 6.607e+00 1.683e+03 0.004 0.99687
posSG 8.183e+00 1.683e+03 0.005 0.99612
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 196.527 on 480 degrees of freedom
Residual deviance: 79.076 on 468 degrees of freedom
AIC: 105.08
Number of Fisher Scoring iterations: 15
> ##The fit is not ideal: one coefficient is undefined and the position dummies have
> ##enormous standard errors. Next: screen variables by stepwise regression on AIC
> step(glm_model) #resulting formula: allstar~age+fg+x3p+ast+blk+pts
Start: AIC=105.08
allstar ~ age + fg + x3p + x2p + ast + stl + blk + pts + posC +
posPF + posPG + posSF + posSG
Step: AIC=105.08
allstar ~ age + fg + x3p + ast + stl + blk + pts + posC + posPF +
posPG + posSF + posSG
Df Deviance AIC
- posSF 1 79.076 103.08
- posPF 1 79.076 103.08
- posPG 1 79.076 103.08
- posC 1 79.076 103.08
- posSG 1 79.077 103.08
- stl 1 79.110 103.11
- ast 1 79.939 103.94
- x3p 1 80.741 104.74
<none> 79.076 105.08
- fg 1 82.141 106.14
- blk 1 82.213 106.21
- age 1 82.984 106.98
- pts 1 87.816 111.82
Step: AIC=103.08
allstar ~ age + fg + x3p + ast + stl + blk + pts + posC + posPF +
posPG + posSG
Df Deviance AIC
- stl 1 79.111 101.11
- posPG 1 79.579 101.58
- posPF 1 79.644 101.64
- posC 1 79.674 101.67
- ast 1 79.940 101.94
- posSG 1 80.675 102.67
- x3p 1 80.741 102.74
<none> 79.076 103.08
- fg 1 82.141 104.14
- blk 1 82.213 104.21
- age 1 82.984 104.98
- pts 1 87.816 109.82
Step: AIC=101.11
allstar ~ age + fg + x3p + ast + blk + pts + posC + posPF + posPG +
posSG
Df Deviance AIC
- posPG 1 79.581 99.581
- posPF 1 79.645 99.645
- posC 1 79.675 99.675
- ast 1 80.505 100.505
- posSG 1 80.698 100.698
- x3p 1 80.814 100.814
<none> 79.111 101.111
- fg 1 82.194 102.194
- blk 1 82.274 102.274
- age 1 83.088 103.088
- pts 1 87.935 107.935
Step: AIC=99.58
allstar ~ age + fg + x3p + ast + blk + pts + posC + posPF + posSG
Df Deviance AIC
- posPF 1 79.865 97.865
- posC 1 79.923 97.923
- posSG 1 80.699 98.699
- x3p 1 81.275 99.275
<none> 79.581 99.581
- blk 1 82.453 100.453
- fg 1 82.579 100.579
- age 1 83.239 101.239
- ast 1 83.534 101.534
- pts 1 88.103 106.103
Step: AIC=97.87
allstar ~ age + fg + x3p + ast + blk + pts + posC + posSG
Df Deviance AIC
- posC 1 79.960 95.960
- posSG 1 80.728 96.728
<none> 79.865 97.865
- x3p 1 82.133 98.133
- fg 1 82.707 98.707
- blk 1 83.275 99.275
- ast 1 83.600 99.600
- age 1 84.405 100.405
- pts 1 88.493 104.493
Step: AIC=95.96
allstar ~ age + fg + x3p + ast + blk + pts + posSG
Df Deviance AIC
- posSG 1 80.800 94.800
<none> 79.960 95.960
- x3p 1 82.633 96.633
- fg 1 83.013 97.013
- ast 1 83.682 97.682
- age 1 84.773 98.773
- blk 1 85.815 99.815
- pts 1 89.001 103.001
Step: AIC=94.8
allstar ~ age + fg + x3p + ast + blk + pts
Df Deviance AIC
<none> 80.800 94.800
- x3p 1 82.942 94.942
- ast 1 83.758 95.758
- fg 1 84.305 96.305
- blk 1 85.943 97.943
- age 1 86.048 98.048
- pts 1 90.692 102.692
Call: glm(formula = allstar ~ age + fg + x3p + ast + blk + pts, family = binomial,
data = nba2)
Coefficients:
(Intercept) age fg x3p ast blk
-14.052311 0.182476 -0.019873 -0.009723 0.003714 0.017605
pts
0.012472
Degrees of Freedom: 480 Total (i.e. Null); 474 Residual
Null Deviance: 196.5
Residual Deviance: 80.8 AIC: 94.8
> names(nba2)
[1] "age" "fg" "x3p" "x2p" "ast" "stl" "blk" "pts"
[9] "allstar" "posC" "posPF" "posPG" "posSF" "posSG"
> nba3<-nba2[,c(1,2,3,5,7,8,9)] #keep the variables selected above, plus allstar
> paste(round(mean(trainNBA$allstar==trainPre1)*100,2),"%",sep="")
[1] "97.28%"
> #training-set accuracy
> paste(round(mean(testNBA$allstar==testPre1)*100,2),"%",sep="")
[1] "95.33%"
###Notes on the results: the training-set and test-set accuracies are 97.28% and
###95.33% respectively; the test accuracy is slightly lower than the training
###accuracy, as expected. Interested readers can tune the training sampling
###proportion to look for an even better split (spoiler: 0.69 worked best for me).
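###A minimal sketch for scanning the training proportion (illustrative only):
#for(p in seq(0.5,0.9,by=0.05)){
#  set.seed(123)
#  idx=sample(nrow(nba3),floor(nrow(nba3)*p))
#  fit=glm(allstar~.,family=binomial,nba3[idx,])
#  pred=as.numeric(predict(fit,nba3[-idx,],type="response")>0.5)
#  cat(p,paste(round(mean(nba3$allstar[-idx]==pred)*100,2),"%",sep=""),"\n")
#}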
Afterword: this short write-up is only meant to start the discussion. The case shows that feature selection matters a great deal to the model: interested readers can try other choices of feature variables and see how they change the results, or try other algorithms to obtain a better model. It would also be worth collecting data from more seasons and analyzing the problem more deeply from the basketball side.