-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBECCA_stm.rmd
364 lines (271 loc) · 11.6 KB
/
BECCA_stm.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
---
title: "New Layout"
output:
flexdashboard::flex_dashboard:
theme: cerulean
logo: undp48.png
orientation: rows
runtime: shiny
vertical_layout: fill
runtime: shiny
---
```{r Load Packages, include=FALSE}
require(flexdashboard)
require(tm)
require(shiny)
require(shinyBS)
require(quanteda)
require(RYandexTranslate)
require(stm)
require(wordcloud)
require(htmlwidgets)
require(stmBrowser)
require(devtools)
require(jsonlite)
require(SnowballC)
require(ggplot2)
require(cluster)
require(fpc)
require(skmeans)
require(knitr)
require(xlsx) # for reading original xls files
require(plotly) # for interactive map
require(likert) # for Likert plots
require(RColorBrewer) # for the Brewer color palette used in Triad plots
require(grid) # required by Triad and likert plots
require(gridExtra) # for arrangin the Triad plots
require(mixtools) # for computing ellipsoids from clustering
require(sp)
require(magrittr)
require(plotly)
require(data.table)
require(xtable)
devtools::install_github("timelyportfolio/stmBrowser@htmlwidget") # This needs to be installed once
```
```{r Load All Datasets}
knitr::opts_knit$set(root.dir = normalizePath('./'))
# print(opts_knit$get("root.dir"))
## Load All Datasets
wd <- getwd()
dataset <- c("kyrgyzstan", "moldova", "unicef", "serbia", "tajikistan", "yemen")
# print(dataset)
for (d in 1:length(dataset)){
load(paste0(wd,"/clean_data/",dataset[d],"_clean.RData"))
dataset <- c("kyrgyzstan", "moldova", "unicef", "serbia", "tajikistan", "yemen")
assign(paste0(dataset[d]),clean)
}
```
Understanding Topics {data-navmenu="STM"}
=====================================
Understanding Topics {.sidebar}
-------------------------------------
```{r, echo=FALSE}
## Set Working Directory and create environment
stm.env <<- new.env()
stm.values <- reactiveValues()
if (length(ls(stm.env)) != 0) rm(ls(stm.env))
test_stm <<- 0 # Use test_model instead of running stm (for faster debugging).
## Choose Country
selectInput("STMCountry", label = "Select Country",
choices = c("Moldova","Kyrgyzstan UNDP","Kyrgyzstan Unicef", "Serbia","Tajikistan","Yemen"))
## Choose Language
radioButtons(inputId = "stm_lang",
label = "Select Language",
choices = c("Original" = "texts_org",
"English" = "texts_eng"))
## Choose K Topics
sliderInput("select_k", "Select number of topics", min = 5, max = 50, value = 25)
## Choose Prevalence Covariates
p("Choose Prevalence Covariates")
## Choose signifer type to drop-down
selectInput(inputId = "stm_sigs",
label = "Signifier Type",
choices = c("Select","Triads","Dyads","Stones","Questions","Descriptors"),
selected = "Select")
renderUI({
## Source covariate names from dataset
stm_dataset <<- switch(input$STMCountry,
"Moldova" = moldova,
"Kyrgyzstan UNDP" = kyrgyzstan,
"Kyrgyzstan Unicef" = unicef,
"Serbia" = serbia,
"Tajikistan" = tajikistan,
"Yemen" = yemen)
## Sort them by signifier type
source("sort_signifiers.R")
sigtypes(stm_dataset)
cov_choices <<- reactive({switch(input$stm_sigs,
"Select" = "Choose Signifier Type",
"Triads" = names(triads),
"Dyads" = names(dyads),
"Stones" = names(stones),
"Questions" = names(questions),
"Descriptors" = names(descriptors))})
## Choose covariates drop-down
conditionalPanel(condition = "input.stm_sigs != 'Select'",
checkboxGroupInput(inputId = "stm_cov",
label = "Select Covariates",
choices = cov_choices()))
})
## Action Buttons - Include chosen covariates, Reset them
actionButton("stm_include", "Include")
actionButton("reset_cov", "Reset")
## Object containing covariate selection
prev_cov <<- NULL
stm.env$covariates <- eventReactive(input$stm_include,{
c(prev_cov, input$stm_cov)
prev_cov <<- c(prev_cov, input$stm_cov)
})
hr()
## Run STM Model
actionButton("stm_go", "Run STM")
## Table displaying covariate selection
hr()
renderUI({
req(stm.env$covariates)
strong("Prevalence Covariates")
})
output$cov <- renderTable({
req(stm.env$covariates)
input$reset_cov
matrix(data = stm.env$covariates(), byrow = TRUE)},include.rownames=FALSE,include.colnames=FALSE)
## Reset Button function
observeEvent(input$reset_cov,{
prev_cov <<- NULL
updateSelectInput(session,
inputId = "stm_cov",
label = "Select Covariates",
choices = cov_choices(),
selected = "Select")
})
tableOutput("cov")
hr()
## Select Topic to view (Word clouds & Most Rep. Stories)
renderUI({
req(stm.values$topic_names)
selectInput("view_k", label = "View Topic", choices = stm.values$topic_names)
})
```
```{r}
## STM Fitting Function
observeEvent(input$stm_go,{
## K, Language
lang <- input$stm_lang
texts <- stm_dataset[[lang]]
stm.env$new_k <<- input$select_k
## Prevalence Formula
stm_formula <- NULL
if (exists("covariates()", envir = stm.env)){
for (p in 1:length(stm.env$covariates())){
stm_formula <- paste(stm_formula,stm.env$covariates()[p],"+")
}
stm_formula <- substr(stm_formula,1,nchar(stm_formula)-2)
stm_formula <- as.formula(paste("~",stm_formula))
}
stm.env$stm_formula <- stm_formula
## Text processing
set.seed(67)
temp<-textProcessor(documents=texts,metadata=stm_dataset)
meta<-temp$meta
vocab<-temp$vocab
docs<-temp$documents
out <- prepDocuments(docs, vocab, meta)
docs<- out$documents
vocab<-out$vocab
meta <-out$meta
meta$EntryDate <- as.numeric(meta$EntryDate, format="%m/%d/%Y")
meta$DQ2.Gender <- as.factor(meta$DQ2.Gender)
meta$DQ3.Education <- as.factor(meta$DQ3.Education)
meta$DQ1.Age <- as.factor(meta$DQ1.Age)
## Reset Sidebar Instruments
updateSelectInput(session,
inputId = "stm_sigs",
label = "Signifier Type",
choices = c("Select","Triads","Dyads","Stones","Questions","Descriptors"),
selected = "Select")
if(test_stm == 0){ ## Load test_model instead (faster debugging)
## Fit Model
if (!is.null(stm_formula)) {
## With covariates
withProgress(message = "Fitting STM...", detail = "This may take a while.",{
stm_model <- stm(docs, vocab, stm.env$new_k, prevalence = stm_formula, data = meta, init.type = "Spectral", max.em.its = 700)
})
} else {
## No covariates
withProgress(message = "Fitting STM...", detail = "This may take a while.",{
stm_model <- stm(docs, vocab, stm.env$new_k, data = meta, init.type = "Spectral", max.em.its = 700)
})
}
} else {load("test_stm.RData")}
## Retrieve model spec and export to environment
topic_words <- labelTopics(stm_model, n = 10)
topic_names <- topic_words$frex[,1:5]
topic_names <- apply(topic_names,1, function(topic_names) paste(topic_names, collapse = ", "))
stm.values$fitted_k <- stm_model$settings$dim$K
stm.values$topic_words <- topic_words
stm.values$topic_names <- topic_names
stm.values$stm_model <- stm_model
stm.values$meta <- meta
})
```
Row {.tabset}
-------------------------------------
### Topic Wordclouds and Representative Stories
```{r, echo=FALSE}
## Main Display Panel
fluidRow(
## Word Cloud
column(strong(align = "center", "Word Cloud"),hr(),
tags$img(align = "top",renderPlot({
req(stm.values$stm_model)
stm_model <- stm.values$stm_model
cloud(stm_model, topic = match(input$view_k, stm.values$topic_names), type=c("model", "documents"), fin = c(1,1), plt = c(0,1,1,0))
}, width = 550, height = 550)
), width = 6),
## Most Representative Stories
column(tags$div(strong(align = "center", "Most Representative Stories"),hr(),
renderPlot({
req(stm.values$stm_model)
stm_model <- stm.values$stm_model
plotQuote(findThoughts(stm_model, texts = as.character(stm.values$meta$texts_eng), topics = match(input$view_k, stm.values$topic_names), n=3)$docs[[1]], width = 55, text.cex = 1.2)
}, width = 450, height = 600))
, width = 6)
)
```
### STM Explained
**UNDERSTANDING TOPICS**
Structural topic modelling is a form of topic modelling, which itself is a statistical model from machine learning and natural language processing. It discovers underlying topics in textual data. STM assigns the different micro narratives to a number of abstract topics. On the "Understand topics" page you can attempt to understand what the different topics are about and whether they are useful in your analysis."Higest prob" shows words with highest probability of being within a topic. "Frex" shows frequent and exclusive words within a topic. Therefore, while two topics might share high probability words, the likelihood for sharing "frex" words is less. "Lift" and "score" are other measures that are not relevant, unless you have experience with topic modeling.
The word cloud graphically visualize a topic. The larger the displayed word are, the more common they are in the text.The text on the right show three documents that the model assumes to be representative of the topic. You can find more information about Structural Topic Modeling and read academic papers utilizing it here: [http://structuraltopicmodel.com/](http://structuraltopicmodel.com/)
**EXPLORE TOPICS**
STM Brower is an interactive D3 visualisation created by [Freeman, Chuang, Roberts, Stewart and Tingley (2015)](https://github.com/mroberts/stmBrowser) , that helps you explore the topics in the text and metadata covariate relationships (for example gender, education, etc.). Narratives will be displayed on the right side if you click on them. Narratives can be placed within multiple topics, so they are rarely fully within a single topic.
### Debugging
```{r}
## Temporary Panel For Debugging
inputPanel(textInput(inputId = "debugger","Print Console"),actionButton("run", "Execute"))
observeEvent(input$run,{
output$debug <- renderPrint(eval(parse(text=isolate(input$debugger)), envir = parent.frame(1)))
})
htmlOutput("debug")
conditionalPanel("updateBusy() || $('html').hasClass('shiny-busy')",
id='progressIndicator',
h1("Fitting STM..."),
div(id='progress',
includeHTML("timer.js"))
)
```
Topic Exploration {data-navmenu="STM"}
=====================================
Topic Exploration {.sidebar}
-------------------------------------
```{r, eval=FALSE}
selectInput("BrowserCountry", label = "Select Country",
choices = c("Moldova" = "moldova", "Kyrgyzstan UNDP" = "kyrgyzstan", "Kyrgyzstan Unicef" = "unicef", "Serbia" = "serbia",
"Tajikistan" = "tajikistan", "Yemen" = "yemen"),
selected = "moldova")
```
STM Browser {data-width=1000}
-------------------------------------
### Topic modelling
```{r, echo=FALSE, eval = FALSE}
stmBrowser_widget(moldovaSTM, data=moldovameta, c("EntryDate","DQ1.Age","DQ3.Education", "DQ2.Gender", "Q7.Score"),text="texts_eng", labeltype='frex')
```