#!/usr/bin/Rscript
suppressMessages(library(tm))
suppressMessages(library(wordcloud))
suppressMessages(library(slam))
suppressMessages(library(lubridate))
suppressMessages(library(parallel))
suppressMessages(library(proxy))
suppressMessages(library(docopt))
doc<-"This script does an initial cleaning and analysis of a set of
documents (the corpus). It outputs a series of plots that describe
the vocabulary and creates a corpus that can be used for topic
modeling by topic_modeling.R.
Usage: text_analysis.R -x=<pubmed> [-r=<nih>] -d=<dir> [-s=<stopwords>] [-c=<cores>] [--reset]
text_analysis.R [-x=<pubmed>] -r=<nih> -d=<dir> [-s=<stopwords>] [-c=<cores>] [--reset]
Options:
-x --xml=<pubmed> Pubmed results in XML format
-r --reporter=<nih> NIH Reporter export in CSV format
-s --stopwords=<stopwords> Stop word list, one word per line, plain text [default: stopwords.txt]
-d --dir=<dir> Directory to write Corpus and meta data outputs
-c --cores=<cores> Number of cores to use for Corpus processing [default: 16]
--reset                     Force reprocessing of the Corpus; by default an existing corpus is not reprocessed
-h --help This helpful message"
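##Example invocation (file names here are hypothetical; substitute your own exports):
##  ./text_analysis.R --xml=pubmed_results.xml --dir=myCorpus --stopwords=stopwords.txt --cores=8
##  ./text_analysis.R --reporter=nih_reporter_export.csv --dir=myCorpus --reset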
my_opts<-docopt(doc)
print(my_opts) ##This is for testing purposes
extraFunFile<-"textMine_funcs.R"
if (file.exists(extraFunFile)) {
source(extraFunFile, keep.source=TRUE);
}
source("makeCorpus.R")
dir.create("results/",showWarnings = F)
resultsPath<-paste0("results/",getDate())
dir.create(resultsPath)
corpusPath<-paste0("data/",my_opts$dir)
dir.create(corpusPath, recursive = T, showWarnings = F)
dir.create(paste0(corpusPath,"/Corpus"), showWarnings = F)
print(c("XML file provided:", !is.null(my_opts$xml)))
if(!is.null(my_opts$xml)){
print("Processing Corpus....")
file.copy(from=my_opts$xml, to=corpusPath)
pubmed.df<-pubmedParse(my_opts$xml)
metaData<-pubmed.df[,1:5]
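##The 91-day shift below presumably aligns calendar quarters with the federal
##fiscal year (which begins 1 October), so FY.Q and FY are fiscal, not calendar.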
metaData[,"FY.Q"]<-quarter(pubmed.df[,"pubdate.df"]+91, with_year=T)
metaData[,"FY"]<-floor(metaData[,"FY.Q"])
abstrCorpus<-makeCorpus(abstr.df = pubmed.df, stopwordsList = my_opts$stopwords, cores = as.integer(my_opts$cores))
writeCorpus(abstrCorpus, paste0(corpusPath,"/Corpus"))
write.csv(metaData, file=paste0(corpusPath,"/CorpusMetaData.txt"),
row.names=F)
} else {
##read in corpus docs.
print("Loading previous corpus...")
abstrCorpus<-Corpus(DirSource(paste0(corpusPath,"/Corpus")),
readerControl = list(language="english"))
metaData<-read.csv(paste0(corpusPath, "/CorpusMetaData.txt"), colClasses=c('character','character','Date','character','numeric','numeric','numeric'))
for (x in c("PMID","GrantID","Date", "FY", "FY.Q")) {
meta(abstrCorpus, x)<-metaData[,x]
}
}
####Extra Corpus cleaning
###Descriptives of Corpus
png(paste0(resultsPath, "/Abstracts_per_FY.png"), height=1000, width=1200, units="px")
par(mfrow=c(2,1), cex=2)
barplot(tapply(meta(abstrCorpus)[,"FY"], meta(abstrCorpus)[,"FY.Q"],length), main="Abstracts per FY.Q", las=2)
barplot(tapply(meta(abstrCorpus)[,"FY"], meta(abstrCorpus)[,"FY"],length ), main="Abstracts per FY", las=2)
dev.off()
################
##Term Document Matrix Creation
################
#This is the basic data structure to mine for term usage trends, clustering, association rule mining, etc.
tdm.monogram.tfidf<-TermDocumentMatrix(abstrCorpus,
control = list(weighting=weightTfIdf))
tdm.monogram<-TermDocumentMatrix(abstrCorpus)
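##Illustrative only (not part of the original pipeline): findAssocs() from tm
##reports terms correlated with a query term; "cancer" is a hypothetical example.
#findAssocs(tdm.monogram, "cancer", corlimit = 0.3)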
#################
##Ngram Analysis
#################
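##NgramTokenizer is used below but not defined in this file; it is presumably
##provided by textMine_funcs.R. As a fallback, a minimal whitespace-based bigram
##tokenizer might look like this (an assumption, not the original implementation):
if (!exists("NgramTokenizer")) {
  NgramTokenizer <- function(x) {
    words <- unlist(strsplit(as.character(x), "\\s+"))
    if (length(words) < 2) return(character(0))
    ##paste adjacent words together to form bigrams
    paste(head(words, -1), tail(words, -1))
  }
}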
##NgramTokenizer is expected to come from textMine_funcs.R (or the fallback sketch above);
##the bigram matrix is needed for the Zipf's law comparison further down.
tdm.bigram <- TermDocumentMatrix(abstrCorpus, control = list(tokenize = NgramTokenizer))
##For tf-idf weighting instead, add: weighting = function(x) weightTfIdf(x, normalize=F)
##Choose the term-document matrix to use for the plots below:
##tdm.monogram holds single-term frequencies; tdm.bigram holds bigram frequencies.
tdm<-tdm.monogram
#tdm<-tdm.monogram.tfidf
#tdm<-tdm.bigram
##spCorpus is not created in this script; it is assumed to already exist in the
##session (e.g. built by one of the sourced helper files).
tdm.sp<-TermDocumentMatrix(spCorpus)
###########
##TermFreq exploration and visualization
###########
tfidfHisto(tdm.monogram.tfidf, fact = "FY", "mean")
tfHisto(tdm,"FY")
tf<-row_sums(tdm)
tf<-tf[order(-tf)]
tf.bi<-row_sums(tdm.bigram)
tf.bi<-tf.bi[order(-tf.bi)]
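##Optional sanity check: peek at the most frequent unigrams/bigrams and at terms
##above an arbitrary cutoff (the lowfreq of 100 is illustrative only).
print(head(tf, 20))
print(head(tf.bi, 20))
print(findFreqTerms(tdm, lowfreq = 100))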
png(paste0(resultsPath,"/Zipfs_plots.png"), height=3000, width=3000, units="px")
par(mfrow=c(2,1), cex=4)
plot(tf.bi, ylim=c(0,12500),lwd=2,type="l", xlab="Rank Ordered Terms", ylab="Term Count", main="Zipf's Law plot")
lines(tf, col="blue", lwd=2)
legend("topright",lty=c(1,1), col=c("black","blue"), legend=c("Bigrams", "Unigrams"), bty="n")
plot(tf.bi, xlim=c(0,10000), lwd=2, ylim=c(0,1000), type="l", xlab="Rank Ordered Terms", ylab="Term Count", main="Zipf's Law plot (zoomed)")
lines(tf, col="blue",lwd=2)
dev.off()
##This is probably an inappropriate graphic, as tf-idf does not summarize well across the corpus;
##tf-idf is really a measure of a word's importance within a single document.
#wordCloud(tdm.monogram.tfidf,fact="FY", 75, "mean","tfidf")
#wordCloud(tdm.monogram.tfidf,fact="FY.Q", 75, "mean","tfidf")
wordCloud(tdm,fact="FY", 75, pre="tf")
wordCloud(tdm,fact="FY.Q", 75, pre="tf")
wordCloudMontage(tdm = tdm.sp,file = "SP_TF_wordcloud.png", path = resultsPath)
#wordCloudMontage(tdm = tdm.sp.tfidf,f=0.001,file = "SP_TfIdf_wordcloud.png", path = resultsPath)