悬赏 1 个论坛币 未解决
各位好,本人是r的初学者。我想要取training set 和test set,但是在取test set时遇到了问题:test set取出来的值都为na,可是training set表现很正常。拜托各位大神了!谢谢!!
下面是我的code:(有问题部分已标红)
pima=read.table('Pima_Indians_Data.txt',sep = ",", header = TRUE)
#View(pima)
summary(pima)
## What to do with NAs?
sum(is.na(pima))
##there is no NAs in the dataset, but there are lots of unreasonable 0.
##change class to factor variable
pima$Class <- as.factor(pima$Class)
##change all unreasonable 0 to NA and then remove all NAs
pima$Plasma.glucose.concentration[pima$Plasma.glucose.concentration==0] =NA
pima$Diastolic.Blood.Pressure[pima$Diastolic.Blood.Pressure==0] =NA
pima$Triceps.skin.fold.thickness[pima$Triceps.skin.fold.thickness==0] =NA
pima$Two.hour.serum.insulin[pima$Two.hour.serum.insulin==0] =NA
pima$BMI [pima$BMI ==0] =NA
library(dplyr)
pima <- pima %>% group_by(Age) %>% mutate(BMI=ifelse(is.na(BMI),mean(BMI,na.rm=T),BMI))
pima <- as.data.frame(pima %>% group_by(Age,BMI) %>% mutate(Plasma.glucose.concentration=ifelse(is.na(Plasma.glucose.concentration),mean(Plasma.glucose.concentration,na.rm=T),Plasma.glucose.concentration),
Diastolic.Blood.Pressure=ifelse(is.na(Diastolic.Blood.Pressure),mean(Diastolic.Blood.Pressure,na.rm=T),Diastolic.Blood.Pressure),
Triceps.skin.fold.thickness=ifelse(is.na(Triceps.skin.fold.thickness),mean(Triceps.skin.fold.thickness,na.rm=T),Triceps.skin.fold.thickness),
Two.hour.serum.insulin=ifelse(is.na(Two.hour.serum.insulin),mean(Two.hour.serum.insulin,na.rm=T),Two.hour.serum.insulin)))
pima <- as.data.frame(pima %>% group_by(Age) %>% mutate(Plasma.glucose.concentration=ifelse(is.na(Plasma.glucose.concentration),mean(Plasma.glucose.concentration,na.rm=T),Plasma.glucose.concentration),
Diastolic.Blood.Pressure=ifelse(is.na(Diastolic.Blood.Pressure),mean(Diastolic.Blood.Pressure,na.rm=T),Diastolic.Blood.Pressure),
Triceps.skin.fold.thickness=ifelse(is.na(Triceps.skin.fold.thickness),mean(Triceps.skin.fold.thickness,na.rm=T),Triceps.skin.fold.thickness),
Two.hour.serum.insulin=ifelse(is.na(Two.hour.serum.insulin),mean(Two.hour.serum.insulin,na.rm=T),Two.hour.serum.insulin)))
pima <- as.data.frame(pima %>% group_by(BMI) %>% mutate(Plasma.glucose.concentration=ifelse(is.na(Plasma.glucose.concentration),mean(Plasma.glucose.concentration,na.rm=T),Plasma.glucose.concentration),
Diastolic.Blood.Pressure=ifelse(is.na(Diastolic.Blood.Pressure),mean(Diastolic.Blood.Pressure,na.rm=T),Diastolic.Blood.Pressure),
Triceps.skin.fold.thickness=ifelse(is.na(Triceps.skin.fold.thickness),mean(Triceps.skin.fold.thickness,na.rm=T),Triceps.skin.fold.thickness),
Two.hour.serum.insulin=ifelse(is.na(Two.hour.serum.insulin),mean(Two.hour.serum.insulin,na.rm=T),Two.hour.serum.insulin)))
sum(is.na(pima))
pima <- pima[complete.cases(pima), ]
str(pima)
##create a training set and a test set
str(pima)
set.seed(12345)
pima_rand <- pima[order(runif(700)), ]
pima_train <- pima_rand[1:700, ]
pima_test <- pima_rand[701:766, ]
str(pima_train)
##'data.frame': 700 obs. of 9 variables:
$ Num.times.pregnant : int 1 0 4 3 5 7 4 7 4 1 ...
$ Plasma.glucose.concentration: num 189 95 127 158 158 83 97 150 117 91 ...
$ Diastolic.Blood.Pressure : num 60 80 88 76 84 78 60 78 64 54 ...
$ Triceps.skin.fold.thickness : num 23 45 11 36 41 26 23 29 27 25 ...
$ Two.hour.serum.insulin : num 846 92 155 245 210 ...
$ BMI : num 30.1 36.5 34.5 31.6 39.4 29.3 28.2 35.2 33.2 25.2 ...
$ Diabetes.pedigree.function : num 0.398 0.33 0.598 0.851 0.395 0.767 0.443 0.692 0.23 0.234 ...
$ Age : int 59 26 28 28 29 36 22 54 24 23 ...
$ Class : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 2 1 1 ...
str(pima_test)
##'data.frame': 66 obs. of 9 variables:
$ Num.times.pregnant : int NA NA NA NA NA NA NA NA NA NA ...
$ Plasma.glucose.concentration: num NA NA NA NA NA NA NA NA NA NA ...
$ Diastolic.Blood.Pressure : num NA NA NA NA NA NA NA NA NA NA ...
$ Triceps.skin.fold.thickness : num NA NA NA NA NA NA NA NA NA NA ...
$ Two.hour.serum.insulin : num NA NA NA NA NA NA NA NA NA NA ...
$ BMI : num NA NA NA NA NA NA NA NA NA NA ...
$ Diabetes.pedigree.function : num NA NA NA NA NA NA NA NA NA NA ...
$ Age : int NA NA NA NA NA NA NA NA NA NA ...
$ Class : Factor w/ 2 levels "0","1": NA NA NA NA NA NA NA NA NA NA ...