-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathTextMiningBasics.R
98 lines (80 loc) · 3.45 KB
/
TextMiningBasics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#Text Mining Basics in R:
#Using the speech "I have a dream", given by Martin Luther King Jr., as the corpus.
#One-time package installation (uncomment on first run):
#install.packages("tm") # for text mining
#install.packages("SnowballC") # for text stemming
#install.packages("wordcloud") # word-cloud generator
#install.packages("RColorBrewer") # color palettes
# Load the packages used throughout the script
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
#Text acquisition----
#1. Read the speech from a plain-text file chosen interactively.
#2. Wrap the character vector in a tm Corpus via VectorSource(), so every
#   element of `text` (each line of the file) becomes one document.
text <- readLines(file.choose())
data <- as.data.frame(text)
View(data)  # eyeball the raw lines before any processing
#Build the corpus from the raw text
docs <- Corpus(VectorSource(text))
#Print a summary of what the corpus contains
inspect(docs)
#Text transformation:----
#tm_map() applies a transformation to every document in the corpus.
#toSpace is a reusable content transformer that replaces every match of a
#regular expression `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
#Replace "/", "@" and "|" with spaces in ONE pass over the corpus instead of
#three separate traversals; inside a character class "|" needs no escaping.
docs <- tm_map(docs, toSpace, "[/@|]")
#Cleaning the text----
#Each tm_map() step rewrites every document in place. Step order matters:
#lower-casing must happen before stopword removal, because the stopword
#list is lower case.
docs <- tm_map(docs, content_transformer(tolower))  # normalise case
docs <- tm_map(docs, removeNumbers)                 # digits carry no meaning here
#Drop the standard English stopwords ("the", "we", ...) together with our
#own custom stop terms in a single removeWords() call
docs <- tm_map(docs, removeWords, c(stopwords("english"), "blabla1", "blabla2"))
docs <- tm_map(docs, removePunctuation)             # strip punctuation marks
docs <- tm_map(docs, stripWhitespace)               # collapse runs of spaces
#Text stemming:----
#Reduce each word to its stem with the Porter stemmer, so inflected forms
#are counted together (e.g. "moving"/"moved" -> "move").
docs <- tm_map(docs, stemDocument)
#Build a term-document matrix----
#TermDocumentMatrix() tabulates how often every term occurs in each document;
#row sums therefore give the total frequency of each term in the speech.
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)                        # terms x documents count matrix
v <- sort(rowSums(m), decreasing = TRUE)   # total frequency per term, descending
d <- data.frame(word = names(v), freq = v) # tidy frequency table
head(d, 10)                                # the ten most frequent stems
#Generate the Word cloud----
#Fix the RNG seed so the cloud layout is reproducible between runs.
set.seed(1234)
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 1,                     # show even words that occur once
  max.words = 200,                  # cap the cloud at 200 words
  random.order = FALSE,             # most frequent words in the centre
  rot.per = 0.35,                   # fraction of words drawn vertically
  colors = brewer.pal(8, "Dark2")   # palette mapped to frequency bands
)
#The word cloud shows that "will", "freedom", "dream", "day" and "together"
#are among the most important words in the speech being analyzed.
#Explore frequent terms and their associations----
#List every term that appears at least four times in the term-document matrix
findFreqTerms(dtm, lowfreq = 4)
#findAssocs() reports terms whose occurrence pattern correlates with the
#given term; here: which words co-occur with "freedom" (correlation >= 0.3)
findAssocs(dtm, terms = "freedom", corlimit = 0.3)
#Frequency table of the top words
head(d, 10)
#Plot word frequencies----
#Bar chart of the ten most frequent terms
top_words <- head(d, 10)
barplot(top_words$freq,
        names.arg = top_words$word,
        las = 2,                        # vertical axis labels
        col = "lightblue",
        main = "Most frequent words",
        ylab = "Word frequencies")