TF-IDF 与 LSA 转换
一般我们会使用内置的 text2vec::movie_review
数据集。让我们先清理一下文本,然后生成 DTM:
library(stringr)
library(text2vec)
data("movie_review")
# Split the data: first 500 rows for training, next 500 for testing
# (small subsets keep the running time short).
movie_review_train = movie_review[1:500, ]
movie_review_test = movie_review[501:1000, ]
# Text normalisation: lower-case, strip non-alphanumeric symbols,
# then squeeze runs of whitespace into single spaces.
prep_fun = function(x) {
  cleaned = str_to_lower(x)
  cleaned = str_replace_all(cleaned, "[^[:alnum:]]", " ")
  str_replace_all(cleaned, "\\s+", " ")
}
movie_review_train$review = prep_fun(movie_review_train$review)
it = itoken(movie_review_train$review, progressbar = FALSE)
# Build the vocabulary and prune very rare / overly common terms.
v = create_vocabulary(it) %>%
  prune_vocabulary(doc_proportion_max = 0.1, term_count_min = 5)
vectorizer = vocab_vectorizer(v)
dtm = create_dtm(it, vectorizer)
接着我们进行 tf-idf 转换,拟合模型,并进行 LSA 建模。
# Fit TF-IDF weighting and a 10-topic LSA decomposition on the training DTM.
tfidf = TfIdf$new()
lsa = LSA$new(n_topics = 10)
# fit_transform both fits each model and returns the transformed matrix.
dtm_tfidf_lsa = fit_transform(dtm, tfidf) %>%
  fit_transform(lsa)
我们可以优雅地在新数据上进行同样的转换。
new_data = movie_review_test
# Project unseen reviews with the already-fitted vectorizer, tfidf and lsa
# models; transform() applies a fitted model without re-fitting it.
new_it = itoken(new_data$review, preprocessor = prep_fun, progressbar = FALSE)
new_dtm = create_dtm(new_it, vectorizer)
new_data_dtm_tfidf_lsa = transform(transform(new_dtm, tfidf), lsa)
head(new_data_dtm_tfidf_lsa)
## [,1] [,2] [,3] [,4] [,5]
## 1 0.0010757498 -0.07883588 0.012792492 0.001803077 -0.016498829
## 2 0.0010309045 -0.07827933 0.002480813 0.007431587 -0.013045284
## 3 0.0008995570 -0.09534925 0.004880675 -0.007976679 -0.033347235
## 4 0.0009023917 -0.09403420 -0.012338197 0.007145261 -0.007353982
## 5 0.0010801880 -0.07638067 0.010959332 0.011642504 -0.017630506
## 6 0.0009748510 -0.08042964 0.008732528 0.007173264 -0.014004544
## [,6] [,7] [,8] [,9] [,10]
## 1 0.0103262362 -0.009852323 -0.0064973268 0.002295156 0.003112725
## 2 -0.0001511818 -0.007605538 -0.0076489837 0.005334299 0.003352278
## 3 -0.0028268780 -0.016753258 -0.0423168722 -0.521279542 0.038697373
## 4 0.0171269927 -0.005365014 -0.0002391057 -0.007157377 0.012386326
## 5 -0.0034931519 -0.005086364 0.0020006889 -0.001886155 -0.003575192
## 6 -0.0124146267 -0.003978910 -0.0209399126 0.008283099 0.011479659
LDA 主题模型
# Tokenise the full corpus for LDA topic modelling.
tokens = movie_review$review %>%
  tolower %>%
  word_tokenizer
# turn off progressbar because it won't look nice in rmd
it = itoken(tokens, ids = movie_review$id, progressbar = FALSE)
v = create_vocabulary(it) %>%
  prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.2)
vectorizer = vocab_vectorizer(v)
# NOTE(review): type = "lda_c" was removed from create_dtm() in text2vec >= 0.5;
# the default sparse matrix is what the current LDA implementation expects.
dtm = create_dtm(it, vectorizer)
# LDA$new() no longer accepts a `vocabulary` argument in text2vec >= 0.5 —
# the vocabulary is implied by the DTM's columns.
lda_model =
  LDA$new(n_topics = 10,
          doc_topic_prior = 0.1, topic_word_prior = 0.01)
# Fit the model and return the documents-by-topics distribution.
# The convergence-check interval argument is named `n_check_convergence`
# (was `check_convergence_every_n` in very old releases).
doc_topic_distr =
  lda_model$fit_transform(dtm, n_iter = 1000, convergence_tol = 0.01,
                          n_check_convergence = 10)
lda_model$plot()