-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathterm_pca.R
34 lines (25 loc) · 861 Bytes
/
term_pca.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
library(tidyverse)
library(tidytext)
library(irlba)
library(quanteda)
data(stop_words)
# Build a title-by-word term-frequency matrix from the paper abstracts,
# project it onto its leading principal components, join the scores back to
# the paper metadata, and save the result as ./term_pca_df.Rda.

# Number of principal components to keep; also drives the PC1..PCk names.
n_components <- 5

# load() restores objects under their saved names; the script needs `dat`.
loaded_objects <- load("./biorxiv_data.Rda") #R dataset of paper info
stopifnot("dat" %in% loaded_objects)

papers <- dat %>%
  select(title, abstract, issued)

# One row per (title, word) pair with its occurrence count `n`.
# count() on ungrouped input returns ungrouped output, so no ungroup() needed.
word_counts <- papers %>%
  unnest_tokens(word, abstract) %>%
  count(title, word, sort = TRUE)

# Drop stop words (explicit key avoids the "Joining, by" guess), then add the
# term-frequency statistics; only the `tf` column is used below.
word_freqs <- word_counts %>%
  anti_join(stop_words, by = "word") %>%
  bind_tf_idf(word, title, n)

# Rows are titles, columns are words, cells hold `tf`.
# NOTE(review): as.matrix() produces a dense matrix; presumably fine for the
# current corpus size, but confirm memory use if the dataset grows.
term_mat <- word_freqs %>%
  cast_dfm(title, word, tf) %>%
  as.matrix()

# Truncated PCA via irlba instead of a full
# prcomp(term_mat, center = TRUE, scale. = TRUE).
# `center` makes irlba work on the column-centered matrix without forming it.
col_means <- colMeans(term_mat)
rotation <- irlba(
  term_mat,
  nv = n_components,
  nu = 0,
  center = col_means,
  right_only = TRUE
)$v

# The scores must come from the *centered* data to match the rotation:
# (X - 1 mu') V == X V - 1 (mu' V). Subtracting the projected means from the
# small score matrix avoids building a centered copy of term_mat.
term_pca <- sweep(
  term_mat %*% rotation,
  2,
  drop(crossprod(col_means, rotation))
)
colnames(term_pca) <- paste0("PC", seq_len(n_components))

# Naming the columns on the matrix replaces the deprecated rename_() call and
# avoids as_tibble()'s warning about unnamed matrix columns.
term_pca_df <- as_tibble(term_pca) %>%
  mutate(title = rownames(term_pca)) %>%
  left_join(dat, by = "title")

save(term_pca_df, file = "./term_pca_df.Rda")