library(class) #加载KNN算法的包
library(gtools)
library(gdata)
library(gmodels)
# Read the breast cancer data set, keeping text columns as character vectors.
wbcd <- read.csv(file = "wisc_bc_data.csv", stringsAsFactors = FALSE)
str(wbcd)

# Drop the first column.
wbcd <- wbcd[-1]

# Count the records under each diagnosis value.
table(wbcd$diagnosis)

# Recode diagnosis as a factor with readable labels ("B" -> "Benign",
# "M" -> "Malignant").
wbcd$diagnosis <- factor(
  wbcd$diagnosis,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)

# Share of each diagnosis as a percentage, rounded to one decimal place.
round(100 * prop.table(table(wbcd$diagnosis)), digits = 1)

# Summarize three of the measurement columns.
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])
# Min-max normalization: linearly rescale a numeric vector onto [0, 1].
#
# Args:
#   x:     Numeric vector to rescale.
#   na.rm: If TRUE, missing values are ignored when finding the minimum and
#          maximum and stay NA in the result. With the default FALSE, any NA
#          in x makes the whole result NA.
#
# Returns:
#   A numeric vector the same length as x. The smallest value maps to 0 and
#   the largest to 1. A vector whose values are all equal maps to 0, and an
#   empty input returns numeric(0).
normalize <- function(x, na.rm = FALSE) {
  # min()/max() warn on empty input, so answer that case directly.
  if (length(x) == 0L) {
    return(numeric(0))
  }

  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  span <- hi - lo

  # A constant vector has zero range; dividing by it would yield NaN for every
  # element. Map such input to 0 instead (x - lo keeps any NA in place).
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }

  (x - lo) / span
}
# Quick check of normalize() on two small vectors.
normalize(c(1, 2, 3, 4, 5))
normalize(c(10, 20, 30, 40, 50))

# Apply normalize() to columns 2 through 31 of wbcd and reassemble the
# results into a data frame.
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))

# Compare area_mean after and before rescaling.
summary(wbcd_n$area_mean)
summary(wbcd$area_mean)

# Rows 1-469 form the training set and rows 470-569 the test set.
wbcd_train <- wbcd_n[seq_len(469), ]
wbcd_test <- wbcd_n[seq(470, 569), ]

# Class labels come from column 1 of wbcd, using the same row ranges.
wbcd_train_labels <- wbcd[seq_len(469), 1]
wbcd_test_labels <- wbcd[seq(470, 569), 1]
# k-nearest neighbours (k = 21) on the current train/test feature sets.
wbcd_test_pred <- knn(
  train = wbcd_train,
  test = wbcd_test,
  cl = wbcd_train_labels,
  k = 21
)
# Cross-tabulate actual labels against predictions, without the chi-square
# contribution of each cell.
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)

# Second run: standardize the data with scale() instead, applied to every
# column of wbcd except the first.
wbcd_z <- as.data.frame(scale(wbcd[-1]))
summary(wbcd_z$area_mean)

# Rebuild the feature sets from the standardized data with the same row
# ranges. The label vectors wbcd_train_labels and wbcd_test_labels are not
# recomputed here; the existing ones are reused.
wbcd_train <- wbcd_z[seq_len(469), ]
wbcd_test <- wbcd_z[seq(470, 569), ]

wbcd_test_pred <- knn(
  train = wbcd_train,
  test = wbcd_test,
  cl = wbcd_train_labels,
  k = 21
)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)