## Chapter 11: Predicting Seabed Hardness Using Random Forest in R
# set the working directory and read in the seabed hardness point data
setwd("C:/temp/")
hard <- read.table("hardness.csv", header = TRUE, sep = ",")
class(hard)
# recode the missing-data flag 9999 as NA
hard[hard == 9999] <- NA
str(hard)
# derive a binary indicator of the response: 1 = "hard", 0 = otherwise
hard$hardness2 <- 0
hard$hardness2[hard$hardness == "hard"] <- 1
# remove samples with missing values
hard2 <- na.omit(hard)
dim(hard2)
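# Note (added, not in the original code): a quick check of the class balance
# of the response before modelling.
table(hard2$hardness)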
library(randomForest)
# search for the optimal mtry (number of predictors tried at each split) based on the OOB error
tuneRF(hard2[, -c(1, 17, 18)], hard2[, 17], ntreeTry = 100)
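# Note (added, not part of the original code): a minimal sketch of how the
# tuneRF() search could be captured and the mtry value with the lowest OOB
# error extracted (the returned matrix holds mtry in column 1 and the OOB
# error in column 2); 'mtry.res' and 'best.mtry' are illustrative names, and
# this repeats the search above.
mtry.res <- tuneRF(hard2[, -c(1, 17, 18)], hard2[, 17], ntreeTry = 100)
best.mtry <- mtry.res[which.min(mtry.res[, 2]), 1]
best.mtry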
set.seed(123)
# fit the random forest with variable importance and proximity measures
rf.1 <- randomForest(hard2[, -c(1, 17, 18)], hard2[, 17], importance = TRUE,
    ntree = 500, proximity = TRUE)
# variable importance plot and the components of the fitted model
varImpPlot(rf.1)
names(rf.1)
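# Note (added, not in the original code): the numeric importance scores that
# varImpPlot() displays can also be inspected directly.
round(importance(rf.1), 2)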
# predict back onto the development (training) data and tabulate against the observed classes
dev.rf1 <- predict(rf.1, hard2)
table(hard2[, 17], dev.rf1)
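# Note (added, not in the original code): the apparent (resubstitution)
# correct classification rate implied by the table above; 'conf.tab' is an
# illustrative name.
conf.tab <- table(hard2[, 17], dev.rf1)
sum(diag(conf.tab)) / sum(conf.tab) * 100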
# assess how the cross-validated error changes with the number of predictors
result1 <- replicate(100, rfcv(hard2[, -c(1, 17, 18)], hard2[, 17], scale = "non.log",
    cv.fold = 10, step = -1), simplify = FALSE)
error.cv <- sapply(result1, "[[", "error.cv")
matplot(result1[[1]]$n.var, cbind(rowMeans(error.cv), error.cv), type = "l",
    lwd = c(2, rep(1, ncol(error.cv))), col = c(2, rep(1, ncol(error.cv))), lty = 1,
    xlab = "Number of variables", ylab = "CV Error")
plot(result1[[1]]$n.var, (1 - rowMeans(error.cv)) * 100, type = "l", lwd = 2, col = 2,
    lty = 1, xlab = "Number of variables", ylab = "Correct classification rate (%)")
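# Note (added, not in the original code): an illustrative way of reading off
# the number of predictors with the lowest mean cross-validated error.
result1[[1]]$n.var[which.min(rowMeans(error.cv))]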
# stratified 10-fold (by default) cross-validation for a random forest
# classification, returning the correct classification rate and Cohen's kappa
rf.cv <- function (trainx, trainy, cv.fold = 10,
                   mtry = function(p) max(1, floor(sqrt(p))), ntree = 500, ...) {
  classRF <- is.factor(trainy)
  n <- nrow(trainx)
  p <- ncol(trainx)
  cv.pred <- NULL
  if (classRF) {
    f <- trainy
  } else {
    stop("This function is only for a categorical response variable")
  }
  # stratified assignment of samples to the cross-validation folds
  nlvl <- table(f)
  idx <- numeric(n)
  for (i in 1:length(nlvl)) {
    idx[which(f == levels(f)[i])] <- sample(rep(1:cv.fold, length = nlvl[i]))
  }
  # fit a forest on each training fold and predict the held-out fold
  for (i in 1:cv.fold) {
    all.rf <- randomForest(trainx[idx != i, , drop = FALSE],
                           trainy[idx != i], trainx[idx == i, , drop = FALSE],
                           trainy[idx == i], mtry = mtry(p), ntree = ntree)
    cv.pred[idx == i] <- all.rf$test$predicted
  }
  # Cohen's kappa (psy package) and correct classification rate of the
  # cross-validated predictions
  require(psy)
  data1 <- as.data.frame(cbind(cv.pred, trainy))
  kappa.cv <- ckappa(data1)$kappa
  ccr.cv <- sum(diag(table(data1))) / sum(table(data1)) * 100
  list(kappa.cv = kappa.cv, ccr.cv = ccr.cv, predicted = cv.pred)
}
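# Note (added, not part of the original chapter): rf.cv() relies on the psy
# package for ckappa(), so install.packages("psy") may be needed first. A
# single trial run before the 100-iteration loop below; 'rfcv.test' is an
# illustrative name.
rfcv.test <- rf.cv(hard2[, -c(1, 17, 18)], hard2[, 17], cv.fold = 10, ntree = 500)
rfcv.test$ccr.cv; rfcv.test$kappa.cv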
# repeat the 10-fold cross-validation 100 times to assess the stability of the accuracy measures
ccr.cv.1 <- NULL; kappa.cv.1 <- NULL; mccr.cv.1 <- NULL; mkappa.cv.1 <- NULL
for (i in 1:100) {
  rfcv.1 <- rf.cv(hard2[, -c(1, 17, 18)], hard2[, 17], cv.fold = 10, ntree = 500)
  ccr.cv.1[i] <- rfcv.1$ccr.cv
  kappa.cv.1[i] <- rfcv.1$kappa.cv
  mccr.cv.1[i] <- mean(ccr.cv.1)
  mkappa.cv.1[i] <- mean(kappa.cv.1)
}
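# Note (added, not in the original code): summary statistics of the 100
# cross-validation runs.
mean(ccr.cv.1); sd(ccr.cv.1)
mean(kappa.cv.1); sd(kappa.cv.1)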
# plot the correct classification rate and kappa for each iteration, with their running means in red
x <- c(1:100)
par(mfrow = c(2, 1), font.axis = 2, font.lab = 2)
plot(ccr.cv.1 ~ x, xlab = "Iteration", ylab = "Correct classification rate")
points(mccr.cv.1 ~ x, col = 2)
plot(kappa.cv.1 ~ x, xlab = "Iteration", ylab = "Kappa")
points(mkappa.cv.1 ~ x, col = 2)