-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNEWclusters&wordclouds.R
96 lines (68 loc) · 2.74 KB
/
NEWclusters&wordclouds.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
## Processing stage
# Read the survey export and turn its free-text "Your.experience" column into
# a cleaned, stemmed tm corpus (`moldovatext`) used by every later stage.
library(tm)
library(SnowballC)  # loaded before stemDocument(), as in the original script

# Keep the text column as character: under R < 4.0 read.csv() converts
# strings to factors by default.
moldova <- read.csv("Moldova_1.csv", stringsAsFactors = FALSE)
moldovatext <- VCorpus(VectorSource(moldova[["Your.experience"]]))

# Turn separator characters into spaces. This has to happen BEFORE
# removePunctuation(): once punctuation is stripped there is nothing left to
# match and tokens such as "and/or" have already been fused into "andor".
# content_transformer() keeps each document a proper text document instead of
# degrading it to a bare character vector.
to_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
moldovatext <- tm_map(moldovatext, to_space, "[/@|]")

# Lower-case first so the (lower-case) stop-word list matches, and drop stop
# words while apostrophes are still present so contractions in the list
# (e.g. "don't") are matched as well.
moldovatext <- tm_map(moldovatext, content_transformer(tolower))
moldovatext <- tm_map(moldovatext, removeWords, stopwords("english"))

moldovatext <- tm_map(moldovatext, removePunctuation)
moldovatext <- tm_map(moldovatext, removeNumbers)
moldovatext <- tm_map(moldovatext, stemDocument)
moldovatext <- tm_map(moldovatext, stripWhitespace)

# NOTE: the former `tm_map(moldovatext, PlainTextDocument)` step is gone on
# purpose. It only existed to re-wrap documents that raw tolower()/gsub()
# had turned into character vectors; with content_transformer() that can no
# longer happen.
## Stage the data
# Documents-by-terms matrix; the bare name echoes its summary.
dtm <- DocumentTermMatrix(moldovatext)
dtm
# Terms-by-documents orientation of the same corpus.
tdm <- TermDocumentMatrix(moldovatext)
tdm

# Total number of occurrences of every term across all documents.
freq <- colSums(as.matrix(dtm))
length(freq)        # number of distinct terms
ord <- order(freq)  # permutation that sorts terms by ascending frequency
findFreqTerms(dtm, lowfreq = 20)

# Word/frequency table. `word` is a factor whose levels follow ascending
# frequency, so plots lay the terms out in that order.
wf <- data.frame(
  word = factor(names(freq), levels = names(freq)[ord]),
  freq = freq
)
## Frequency plots
# Bar chart and word clouds of the most frequent terms, built from the
# `wf` data frame and the named `freq` vector created above.
library(ggplot2)
library(wordcloud)

# One cut-off for every plot. The bar chart used `freq > 20` while
# findFreqTerms(lowfreq = 20) and wordcloud(min.freq = 20) both keep terms
# with frequency >= 20, so terms occurring exactly 20 times were missing from
# the chart only.
min_freq <- 20

p <- ggplot(wf[wf$freq >= min_freq, ], aes(word, freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Explicit print() so the chart is also drawn when the script is source()d.
print(p)

# wordcloud() places words randomly; fix the seed so reruns give the same
# layout.
set.seed(142)
wordcloud(names(freq), freq, min.freq = min_freq)

# Same cloud with a wider size range and a red palette. brewer.pal() comes
# from RColorBrewer, which is attached together with wordcloud.
set.seed(142)
wordcloud(
  names(freq), freq,
  min.freq = min_freq,
  scale = c(5, 0.1),
  random.color = FALSE,
  colors = brewer.pal(6, "Reds")
)
## Hierarchical clustering of terms
library(cluster)

# Drop the sparsest terms: 0.95 is the maximal allowed sparsity, so a term is
# kept only if it is absent from fewer than 95% of the documents.
dtmss <- removeSparseTerms(dtm, 0.95)
inspect(dtmss)

# Distances are computed between terms, hence the transpose of the
# documents-by-terms matrix.
d <- dist(t(dtmss), method = "euclidean")

# "ward" was renamed "ward.D" in R 3.1.0; the old name still works but emits a
# message on every call. "ward.D" keeps the original behaviour.
# NOTE(review): "ward.D2" is the variant that implements Ward's criterion on
# unsquared distances - consider switching if that is what is intended.
fit <- hclust(d = d, method = "ward.D")
fit

# Draw the dendrogram once, then outline the clusters on top of it.
n_groups <- 5  # number of clusters to cut the tree into
plot(fit, hang = -1)
groups <- cutree(fit, k = n_groups)
rect.hclust(fit, k = n_groups, border = "red")
## K-means clustering of terms under several distance measures
library(fpc)

# For each distance measure: compute term-to-term distances, run k-means on
# the resulting distance matrix and draw a cluster plot. After the loop `d`
# and `kfit` hold the results for the last measure, as in the original
# sequence of stanzas.
n_clusters <- 3
dist_methods <- c(
  "euclidean", "maximum", "manhattan",
  "canberra", "binary",
  "minkowski"  # with dist()'s default p = 2 this equals "euclidean"
)

# kmeans() picks random starting centres; fix the seed so cluster
# assignments are reproducible between runs.
set.seed(142)

for (dist_method in dist_methods) {
  d <- dist(t(dtmss), method = dist_method)
  kfit <- kmeans(d, centers = n_clusters)
  clusplot(
    as.matrix(d), kfit$cluster,
    color = TRUE, shade = TRUE, labels = 2, lines = 0,
    # Name the measure in the title so the six plots can be told apart.
    main = paste0("CLUSPLOT (", dist_method, " distance)")
  )
}