原本数据只有1764条,我想对数据做层次聚类,再用cutree显示分类,可是显示分类时竟然有3549个数据。新手啊,为了做毕设才学的,好着急,以下是代码和数据在附件里,望各路大神帮帮小女子啊!!
csv <- read.table("body1.txt",header=T)
csv  <-t(csv)
mystopwords<-unlist (read.table("StopWords.txt",stringsAsFactors=F))
library(tm)
#移除数字
removeNumbers = function(x) { ret = gsub("[0-90123456789]","",x) }
#中文分词,也可以考虑使用rmmseg4j、rsmartcn
wordsegment<- function(x) {
library(Rwordseg)
segmentCN(x)
}
#去除停止词,效果比较差,可以进一步完善
removeStopWords = function(x,words) {
ret = character(0)
index <- 1
it_max <- length(x)
while (index <= it_max) {
if (length(words[words==x[index]]) <1) ret <- c(ret,x[index])
index <- index +1
}
ret
}
Sys.setenv(JAVA_HOME='C:/ProgramFiles/Java/jdk1.6.0_43/jre')
sample.words <- lapply(csv, removeNumbers)
sample.words <- lapply(sample.words, wordsegment)
#先处理中文分词,再处理stopwords,防止全局替换丢失信息
sample.words <- lapply(sample.words, removeStopWords, mystopwords)
#构建语料库
corpus = Corpus(VectorSource(sample.words))
 
sample.dtm<- DocumentTermMatrix(corpus, control = list(wordLengths = c(2, Inf)))
dtm2 = removeSparseTerms(sample.dtm,sparse=0.99)
d <- dist(dtm2, method ="euclidean")
fit <- hclust(d,method="ward.D")
plot(fit)
cutree(fit,k=8)#聚类的图显示的是8类,但数据太多特别密集,也看不出有多少数据,不知道是1700条还是3500条