下面是建立主题模型的代码:
library(slam)

# Distribution of the per-term totals (column sums of the matrix).
summary(col_sums(sample.dtm))

# Mean tf-idf score per term:
#   tf  - each stored count divided by the total of its row, then averaged
#         over the stored entries sharing the same column index;
#   idf - log2(nDocs / number of positive entries in the column).
# NOTE(review): tapply() only produces groups for column indices that occur in
# sample.dtm$j, so this presumably relies on every column having at least one
# stored entry for the product (and the column filter below) to line up with
# the matrix columns -- confirm against how sample.dtm is built.
term_tfidf <- local({
  entry_tf <- sample.dtm$v / row_sums(sample.dtm)[sample.dtm$i]
  mean_tf <- tapply(entry_tf, sample.dtm$j, mean)
  doc_freq <- col_sums(sample.dtm > 0)
  mean_tf * log2(nDocs(sample.dtm) / doc_freq)
})
summary(term_tfidf)

# Keep only the terms scoring at least 0.1 ...
sample.dtm <- sample.dtm[, term_tfidf >= 0.1]
# ... then drop the rows that no longer contain any retained term.
sample.dtm <- sample.dtm[row_sums(sample.dtm) > 0, ]
# Number of topics: one per distinct value found in csv$type.
k <- length(unique(csv$type))
library(topicmodels)
# Seed handed to every fit below through its control list.
SEED <- 2012

# Fit four models on the same matrix with the same k:
#   VEM       - LDA with default method and controls (apart from the seed)
#   VEM_fixed - as VEM, but with estimate.alpha = FALSE
#   Gibbs     - LDA with method = "Gibbs"
#   CTM       - correlated topic model with explicit var/em tolerances
sample_TM <- list(
  VEM = LDA(
    sample.dtm,
    k = k,
    control = list(seed = SEED)
  ),
  VEM_fixed = LDA(
    sample.dtm,
    k = k,
    control = list(estimate.alpha = FALSE, seed = SEED)
  ),
  Gibbs = LDA(
    sample.dtm,
    k = k,
    method = "Gibbs",
    control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)
  ),
  CTM = CTM(
    sample.dtm,
    k = k,
    control = list(
      seed = SEED,
      var = list(tol = 10^-4),
      em = list(tol = 10^-3)
    )
  )
)
# alpha slot of the two VEM fits, selected by name so the lookup does not
# depend on the position of the models inside sample_TM.
sapply(sample_TM[c("VEM", "VEM_fixed")], slot, "alpha")

# For each fitted model: entropy -sum(p * log(p)) of every row of
# posterior(model)$topics, averaged over the rows.
vapply(
  sample_TM,
  function(model) {
    topic_probs <- posterior(model)$topics
    mean(apply(topic_probs, 1, function(p) -sum(p * log(p))))
  },
  numeric(1)
)

# topics(..., 1) and terms(..., 5) on the VEM fit: the single top topic and
# the five top terms, as returned by topicmodels.
Topic <- topics(sample_TM[["VEM"]], 1)
Terms <- terms(sample_TM[["VEM"]], 5)

# Show at most the first six columns. k is derived from the data, so a
# hard-coded 1:6 fails with "subscript out of bounds" whenever k < 6.
Terms[, seq_len(min(6L, ncol(Terms))), drop = FALSE]
这一部分运行完成后,是否就可以用新文本进行测试了?