TF-IDF 与 LSA 转换
一般我们会使用内置的 text2vec::movie_review
数据集。让我们先清理一下文本,然后生成 DTM:
library(stringr)
library(text2vec)
data("movie_review")
# Split the data: first 500 rows for training, next 500 for testing
# (small subsets keep the running time short).
movie_review_train = movie_review[1:500, ]
movie_review_test = movie_review[501:1000, ]
# Text normalisation: lower-case, strip non-alphanumeric symbols,
# then squeeze runs of whitespace into single spaces.
prep_fun = function(x) {
  cleaned = str_to_lower(x)
  cleaned = str_replace_all(cleaned, "[^[:alnum:]]", " ")
  str_replace_all(cleaned, "\\s+", " ")
}
movie_review_train$review = prep_fun(movie_review_train$review)
it = itoken(movie_review_train$review, progressbar = FALSE)
# Build the vocabulary and prune very rare / overly common terms.
v = create_vocabulary(it) %>%
  prune_vocabulary(doc_proportion_max = 0.1, term_count_min = 5)
vectorizer = vocab_vectorizer(v)
dtm = create_dtm(it, vectorizer)
接着我们进行 tf-idf 转换,拟合模型,并进行 LSA 建模。
# Fit TF-IDF weighting and a 10-topic LSA decomposition on the training DTM.
tfidf = TfIdf$new()
lsa = LSA$new(n_topics = 10)
# fit_transform both fits each model and returns the transformed matrix.
dtm_tfidf_lsa = fit_transform(dtm, tfidf) %>%
  fit_transform(lsa)
我们可以优雅地在新数据上进行同样的转换。
new_data = movie_review_test
# Project unseen reviews with the already-fitted vectorizer, tfidf and lsa
# models; transform() applies a fitted model without re-fitting it.
new_it = itoken(new_data$review, preprocessor = prep_fun, progressbar = FALSE)
new_dtm = create_dtm(new_it, vectorizer)
new_data_dtm_tfidf_lsa = transform(transform(new_dtm, tfidf), lsa)
head(new_data_dtm_tfidf_lsa)
## [,1] [,2] [,3] [,4] [,5]
## 1 0.0010757498 -0.07883588 0.012792492 0.001803077 -0.016498829
## 2 0.0010309045 -0.07827933 0.002480813 0.007431587 -0.013045284
## 3 0.0008995570 -0.09534925 0.004880675 -0.007976679 -0.033347235
## 4 0.0009023917 -0.09403420 -0.012338197 0.007145261 -0.007353982
## 5 0.0010801880 -0.07638067 0.010959332 0.011642504 -0.017630506
## 6 0.0009748510 -0.08042964 0.008732528 0.007173264 -0.014004544
## [,6] [,7] [,8] [,9] [,10]
## 1 0.0103262362 -0.009852323 -0.0064973268 0.002295156 0.003112725
## 2 -0.0001511818 -0.007605538 -0.0076489837 0.005334299 0.003352278
## 3 -0.0028268780 -0.016753258 -0.0423168722 -0.521279542 0.038697373
## 4 0.0171269927 -0.005365014 -0.0002391057 -0.007157377 0.012386326
## 5 -0.0034931519 -0.005086364 0.0020006889 -0.001886155 -0.003575192
## 6 -0.0124146267 -0.003978910 -0.0209399126 0.008283099 0.011479659
LDA 主题模型
# Tokenise the full corpus for LDA topic modelling.
tokens = movie_review$review %>%
  tolower %>%
  word_tokenizer
# turn off progressbar because it won't look nice in rmd
it = itoken(tokens, ids = movie_review$id, progressbar = FALSE)
v = create_vocabulary(it) %>%
  prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.2)
vectorizer = vocab_vectorizer(v)
# NOTE(review): type = "lda_c" was removed from create_dtm() in text2vec >= 0.5;
# the default sparse matrix is what the current LDA implementation expects.
dtm = create_dtm(it, vectorizer)
# LDA$new() no longer accepts a `vocabulary` argument in text2vec >= 0.5 —
# the vocabulary is implied by the DTM's columns.
lda_model =
  LDA$new(n_topics = 10,
          doc_topic_prior = 0.1, topic_word_prior = 0.01)
# Fit the model and return the documents-by-topics distribution.
# The convergence-check interval argument is named `n_check_convergence`
# (was `check_convergence_every_n` in very old releases).
doc_topic_distr =
  lda_model$fit_transform(dtm, n_iter = 1000, convergence_tol = 0.01,
                          n_check_convergence = 10)
lda_model$plot()