remove_redundancy.py
import pandas as pd
import glob
import os
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import Birch
from sklearn.decomposition import PCA
from scipy.sparse import hstack
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import unidecode
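
# NOTE: nltk.sent_tokenize / nltk.word_tokenize and stopwords.words('english') used below
# require the NLTK 'punkt' and 'stopwords' data packages; if they are missing, they can be
# fetched once with nltk.download('punkt') and nltk.download('stopwords').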

'''
This script performs redundancy removal on the RevDet dataset.
It performs stopword removal and stemming on article titles, clusters articles on title and locations,
further clusters each cluster on counts,
retains one article from each cluster,
and writes one output file per group to the output folder.
(An illustrative entry point is sketched at the end of the script.)
'''


def tokenize_and_stem(text, porter_stemmer, stop_words):
    # note: stop_words is not used here; stopword filtering is done by the caller (remove_stop_words)
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    # filtered_tokens = [unidecode.unidecode(word) for word in filtered_tokens if word[0].isupper()]
    stems = [porter_stemmer.stem(t) for t in filtered_tokens]
    return stems
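
# Illustrative example (a sketch, assuming NLTK's default Punkt tokenizer and Porter stemmer):
#   tokenize_and_stem("Floods reported across Chennai in 2015", PorterStemmer(), stop_words)
#   returns roughly ['flood', 'report', 'across', 'chennai', 'in']
#   (the numeric token '2015' is dropped here; stopwords such as 'in' are removed later by the caller)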


def one_hot_encode(df):
    mlb = MultiLabelBinarizer(sparse_output=True)
    sparse_df = mlb.fit_transform(df)
    return sparse_df
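
# Illustrative behaviour: MultiLabelBinarizer treats each element as an iterable of labels,
# e.g. [['US', 'IN'], ['IN']] -> a 2x2 sparse indicator matrix [[1, 1], [1, 0]] over classes ['IN', 'US'].
# (A plain string is also an iterable, so string headings are binarized character by character.)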


def remove_stop_words(df, stop_words):
    porter_stemmer = PorterStemmer()
    for row in df.itertuples():
        if type(row.heading) == float:
            # NaN heading: keep a placeholder so the row can still be one-hot encoded
            df.loc[row.Index, 'heading'] = '#'
            continue
        processed_data = tokenize_and_stem(
            row.heading, porter_stemmer, stop_words)
        stop_word_removed_data = []
        for word in processed_data:
            if word.lower() not in stop_words:
                stop_word_removed_data.append(word)
        df.loc[row.Index, 'heading'] = ' '.join(stop_word_removed_data)
    return df


def run(input_dir, output_dir):
    stop_words = set(stopwords.words('english'))

    # parameters, determined through experiments
    birch_thresh = 2.4
    count_thresh = 0.1
    perform_pca = False

    path = input_dir
    output_path = output_dir
    file_name = '*.csv'
    all_files = glob.glob(os.path.join(path, file_name))

    for f in all_files:
        # group name taken from the file name, without directory or extension (portable across OSes)
        file_prefix = os.path.splitext(os.path.basename(f))[0]

        df = pd.read_csv(f, header=None, encoding='latin-1')
        df.columns = ['record_id', 'date', 'url', 'counts', 'themes', 'locations', 'persons',
                      'organizations', 'tone', 'heading']

        # retaining only those articles which have non-null locations and heading
        df = df[pd.notnull(df['locations'])]
        df = df[pd.notnull(df['heading'])]

        # removing news with a wrongly scraped title, e.g. 'bloomberg' instead of the article title
        try:
            mask = (df['heading'].str.len() >= 20)
            df = df.loc[mask]
        except Exception:
            continue

        # retaining the original heading for analysis afterwards
        df['heading_original'] = df['heading']

        # stop-word removal and stemming
        df = remove_stop_words(df, stop_words)

        df_locations = pd.DataFrame(df['locations'])
        df_heading = pd.DataFrame(df['heading'])

        # dictionary that maps row number to row, helps later in forming clusters through cluster labels
        row_dict = df.copy(deep=True)
        row_dict.fillna('', inplace=True)
        row_dict.index = range(len(row_dict))
        row_dict = row_dict.to_dict('index')

        try:
            df_locations = pd.DataFrame(
                df_locations['locations'].str.split(';'))  # splitting locations
        except Exception:
            continue

        for row in df_locations.itertuples():
            for i in range(0, len(row.locations)):
                try:
                    row.locations[i] = (row.locations[i].split('#'))[
                        3]  # for retaining only ADM1 Code
                except Exception:
                    continue

        sparse_heading = one_hot_encode(df_heading['heading'])
        sparse_locations = one_hot_encode(df_locations['locations'])
        df = hstack([sparse_heading, sparse_locations])

        # reducing dimensions through principal component analysis
        if perform_pca:
            pca = PCA(n_components=None)
            # PCA does not accept sparse input, so densify the feature matrix first
            df = pd.DataFrame(pca.fit_transform(df.toarray()))

        brc = Birch(branching_factor=50, n_clusters=None,
                    threshold=birch_thresh, compute_labels=True)
        try:
            predicted_labels = brc.fit_predict(df)
        except Exception:
            continue

        clusters = {}
        n = 0
        for item in predicted_labels:
            if item in clusters:
                clusters[item].append(
                    list((row_dict[n]).values()))  # since row_dict[n] is itself a dictionary
            else:
                clusters[item] = [list((row_dict[n]).values())]
            n += 1

        # clustering within each cluster, on counts
        # dictionary which maps original_cluster_key to new clusters within that cluster
        count_clusters = {}
        for item in clusters:
            count_clusters[item] = {}
            cluster_df = pd.DataFrame(clusters[item])

            cluster_row_dict = cluster_df.copy(deep=True)
            cluster_row_dict.fillna('', inplace=True)
            cluster_row_dict.index = range(len(cluster_row_dict))
            cluster_row_dict = cluster_row_dict.to_dict('index')

            df_counts = pd.DataFrame(cluster_df[cluster_df.columns[[3]]])
            df_counts.columns = ['counts']
            df_counts = pd.DataFrame(
                df_counts['counts'].str.split(';'))  # splitting counts

            for row in df_counts.itertuples():
                for i in range(0, len(row.counts)):
                    try:
                        temp_list = row.counts[i].split('#')
                        row.counts[i] = temp_list[0] + '#' + temp_list[1] + '#' + temp_list[
                            5]  # for retaining only COUNT_TYPE and QUANTITY and LOCATION ADM1 Code
                    except Exception:
                        continue
                row.counts[:] = [x for x in row.counts if not x.startswith(
                    'CRISISLEX')]  # removing CRISISLEX entries due to elevated false positive rate
                if len(row.counts) == 1 and row.counts[0] == '':
                    # so that news with no counts are clustered together
                    row.counts.append('#')
                    row.counts.pop(0)
                if row.counts and row.counts[-1] == '':
                    row.counts.pop()

            mlb4 = MultiLabelBinarizer()
            df_counts = pd.DataFrame(mlb4.fit_transform(df_counts['counts']),
                                     columns=mlb4.classes_, index=df_counts.index)

            brc2 = Birch(branching_factor=50, n_clusters=None,
                         threshold=count_thresh, compute_labels=True)
            predicted_labels2 = brc2.fit_predict(df_counts)

            n2 = 0
            for item2 in predicted_labels2:
                if item2 in count_clusters[item]:
                    count_clusters[item][item2].append(
                        list((cluster_row_dict[n2]).values()))  # since cluster_row_dict[n2] is itself a dictionary
                else:
                    count_clusters[item][item2] = [
                        list((cluster_row_dict[n2]).values())]
                n2 += 1

        # retaining one representative article per (title/location, counts) cluster
        data = []
        for item in count_clusters:
            for item2 in count_clusters[item]:
                data.append(count_clusters[item][item2][0])

        df = pd.DataFrame(data)
        df.sort_values(by=[0], inplace=True)
        df.to_csv(os.path.join(output_path, file_prefix + '.csv'),
                  sep=',', index=False, header=False)
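

# Illustrative entry point (a minimal sketch; the original script only defines run() and does
# not invoke it). The directory names below are hypothetical defaults: pass the folder holding
# the per-group RevDet CSV files as --input_dir and an existing folder for the de-duplicated
# output as --output_dir.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Remove redundant articles from RevDet group files.')
    parser.add_argument('--input_dir', default='groups/',
                        help='folder with per-group CSV files')
    parser.add_argument('--output_dir', default='groups_deduplicated/',
                        help='folder for de-duplicated CSV files')
    args = parser.parse_args()

    run(args.input_dir, args.output_dir)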