-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreateModel.R
35 lines (26 loc) · 1.04 KB
/
createModel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# Load shared definitions. NOTE(review): dbInit, cleanDbName and the helper
# functions used further down are not defined in this file -- presumably they
# come from common.R or a package it loads; confirm.
source("common.R")
# Open the database identified by cleanDbName.
cleanDb <- dbInit(cleanDbName)
# Coerce the database to a list so that processType() can look up the data
# for one text type with cleanData[[typeName]].
cleanData <- as(cleanDb, "list")
# Build the n-gram model for one text type and store it in that type's
# model database.
#
# Steps: resolve the database name via modelName(), create the database file
# if it does not exist yet, compute percentages from cleanData[[typeName]],
# derive the n-gram map, store the resulting model under the "model" key and
# finally reorganize the database.
#
# Arguments:
#   typeName         - name of the text type; used to index cleanData and to
#                      derive the model database name.
#   removeSingletons - NOTE(review): accepted but never read in this body;
#                      singletonsThreshold is always passed as 1. Confirm
#                      whether FALSE was meant to change that threshold.
#
# Returns whatever dbReorganize() returns for the model database.
processType <- function(typeName, removeSingletons = TRUE) {
  # Emit one progress line: "<stage> <typeName> at <current date/time>".
  announce <- function(stage) {
    print(paste(stage, typeName, "at", date()))
  }

  dbPath <- modelName(typeName)
  if (!file.exists(dbPath)) {
    dbCreate(dbPath)
  }
  store <- dbInit(dbPath)

  announce("Calculating percentages for")
  typePercentages <- calculatePercentages(
    cleanData[[typeName]],
    singletonsThreshold = 1
  )

  announce("Creating maps for")
  gramLookup <- mapGramsWrapper(typePercentages)

  announce("Creating model for")
  store$model <- makeNgramModel2(typePercentages, gramLookup)

  dbReorganize(store)
}
# Build one model per text type; the calls run in sequence.
processType('tweets')
processType('blogs')
processType('news')
# NOTE(review): FALSE is passed as removeSingletons here, but processType()
# does not appear to read that argument -- confirm whether 'geah' was meant
# to be processed differently from the other types.
processType('geah', FALSE)
# Perplexity calculation
# The cross-entropy is the average of the negative log2 of the word
# probabilities; the perplexity is 2 raised to the cross-entropy.
# In Figure 7.2, next to each probability you can find its negative log2.
# Example sentence: I would like to commend the rapporteur on his work.
# Print a final timestamp once every statement above has finished.
print(paste("Completed at", date()))