-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path4. Hierarchical clustering in dataframe
28 lines (25 loc) · 1.1 KB
/
4. Hierarchical clustering in dataframe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Using news of different years to do hierarchical clustering
import matplotlib.pyplot as plt
import pandas as pd
import scipy.cluster.hierarchy as hac
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# (You can skip this part; I use it to reorganize the text within my dataframe.)
# Build a one-column dataframe with one row per year, where each row holds
# all of that year's news joined into a single space-separated string.
# `net` is expected to be an iterable of per-year dataframes, each exposing a
# string column named `cutted` (presumably pre-tokenized text -- defined
# outside this script; verify against the code that creates it).
# The rows are collected in a list and the frame is built once, because
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
yearly_text = [news.cutted.str.cat(sep=" ") for news in net]
allnews = pd.DataFrame({'news': yearly_text})
# One label per entry of `net`; a length mismatch raises ValueError here.
allnews.index = ['2013', '2014', '2015', '2016', '2017', '2018']
# To preview the first row, run: allnews[:1]
# Word-frequency features for each year's text.
# NOTE: CountVectorizer produces raw term counts, not TF-IDF weights; swap in
# TfidfVectorizer (same arguments) if TF-IDF weighting is what you want.
words = 100  # I choose 100 words to do clustering, you can choose the number you want
tf_vectorizer = CountVectorizer(
    strip_accents=None,
    max_features=words,  # cap the vocabulary at `words` terms
    stop_words='english',  # drop common English stop words
)
tf = tf_vectorizer.fit_transform(allnews.news)
# fit_transform returns a sparse matrix; convert it to a dense array
# (one row per entry of allnews.news) for the clustering step below.
weight = tf.toarray()
# Dendrogram
# Agglomerative clustering of the rows of `weight` using complete linkage
# on Euclidean distances.
tree = hac.linkage(weight, method='complete', metric='euclidean')
# Plot the linkage computed above (the original passed an undefined `tree1`).
# Labels are handed over as a plain list so the leaves are named after the
# dataframe's index entries without relying on pandas Index semantics.
hac.dendrogram(tree, labels=allnews.index.tolist())
plt.ylabel('distance')
plt.title('News')
# You can see the result in the graph; when running outside a notebook,
# call plt.show() afterwards to display the figure.