-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorpus.py
176 lines (162 loc) · 6.2 KB
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# OneMillionPostsCorpus
# Deep Learning classification on a data set of german online discussions
# Student Project by: Jens Becker, Julius Plehn, Oliver Pola
#
# Access corpus data
import os
import shutil
import wget
import tarfile
import sqlite3
import numpy as np
def get_conn():
url = 'https://github.com/OFAI/million-post-corpus/releases/download/v1.0.0/million_post_corpus.tar.bz2'
db = 'data/million_post_corpus/corpus.sqlite3'
zip = os.path.split(url)[-1]
if not os.path.exists(db):
old_db = 'data/corpus.sqlite3'
if os.path.exists(old_db):
print('Rearrange data dir...')
os.rename('data', 'million_post_corpus')
else:
if not os.path.exists(zip):
print('Downloading corpus file...')
wget.download(url, zip)
print()
print('Extract corpus DB...')
with tarfile.open(zip) as tar:
tar.extractall()
os.makedirs('data', exist_ok=True)
shutil.move('million_post_corpus', 'data')
if os.path.exists(zip):
os.remove(zip)
conn = sqlite3.connect(db)
return conn
def get_training(conn):
cur = conn.cursor()
x = [] # posts
y = [] # categories
# Select all posts that have any annotation, treating absent values as 0
# sql = ''' SELECT ID_Post, Headline, Body FROM Posts
# WHERE EXISTS
# (SELECT 1 FROM Annotations_consolidated
# WHERE Posts.ID_Post = Annotations_consolidated.ID_Post)
# '''
# Select all posts that have annotations for each category
sql = ''' SELECT ID_Post, Headline, Body FROM Posts
WHERE EXISTS
(SELECT 1 FROM Annotations_consolidated
WHERE Posts.ID_Post = Annotations_consolidated.ID_Post
AND Category = 'SentimentNegative')
AND EXISTS
(SELECT 1 FROM Annotations_consolidated
WHERE Posts.ID_Post = Annotations_consolidated.ID_Post
AND Category = 'SentimentNeutral')
AND EXISTS
(SELECT 1 FROM Annotations_consolidated
WHERE Posts.ID_Post = Annotations_consolidated.ID_Post
AND Category = 'SentimentPositive')
AND EXISTS
(SELECT 1 FROM Annotations_consolidated
WHERE Posts.ID_Post = Annotations_consolidated.ID_Post
AND Category = 'OffTopic')
AND EXISTS
(SELECT 1 FROM Annotations_consolidated
WHERE Posts.ID_Post = Annotations_consolidated.ID_Post
AND Category = 'Inappropriate')
AND EXISTS
(SELECT 1 FROM Annotations_consolidated
WHERE Posts.ID_Post = Annotations_consolidated.ID_Post
AND Category = 'Discriminating')
AND EXISTS
(SELECT 1 FROM Annotations_consolidated
WHERE Posts.ID_Post = Annotations_consolidated.ID_Post
AND Category = 'PossiblyFeedback')
AND EXISTS
(SELECT 1 FROM Annotations_consolidated
WHERE Posts.ID_Post = Annotations_consolidated.ID_Post
AND Category = 'PersonalStories')
AND EXISTS
(SELECT 1 FROM Annotations_consolidated
WHERE Posts.ID_Post = Annotations_consolidated.ID_Post
AND Category = 'ArgumentsUsed')
'''
cur.execute(sql)
for row in cur.fetchall():
id_post = row[0]
# merge Headline, Body to one text entry
x.append(merge_text(row[1], row[2]))
# categories for one post will be a vector, so y will be a matrix
y.append(get_categorylist(conn, id_post))
return np.array(x), np.array(y)
def merge_text(a, b, separator='\r\n'):
if a == None or a == '':
return b
elif b == 'None' or b == '':
return a
else:
return f'{a}{separator}{b}'
# translate category name to vector index by: corpus.categories['OffTopic']
# translate index to category name by: list(corpus.categories)[3]
categories = {
'SentimentNegative' : 0,
'SentimentNeutral' : 1,
'SentimentPositive' : 2,
'OffTopic' : 3,
'Inappropriate' : 4,
'Discriminating' : 5,
'PossiblyFeedback' : 6,
'PersonalStories' : 7,
'ArgumentsUsed' : 8
}
def get_categorylist(conn, id_post):
## Categories content:
## Ord = 1, Name = SentimentNegative
## Ord = 2, Name = SentimentNeutral
## Ord = 3, Name = SentimentPositive
## Ord = 4, Name = OffTopic
## Ord = 5, Name = Inappropriate
## Ord = 6, Name = Discriminating
## Ord = 7, Name = PossiblyFeedback
## Ord = 8, Name = PersonalStories
## Ord = 9, Name = ArgumentsUsed
cur = conn.cursor()
categories = [0, 0, 0, 0, 0, 0, 0, 0, 0]
sql = ''' SELECT Ord, Value FROM Annotations_consolidated
JOIN Categories
ON Annotations_consolidated.Category = Categories.Name
WHERE ID_Post = ?
ORDER BY Ord
'''
param = (id_post, )
cur.execute(sql, param)
for row in cur.fetchall():
categories[row[0] - 1] = row[1]
return categories
if __name__ == '__main__':
# Test
conn = get_conn()
cur = conn.cursor()
all_tables = ['Articles', 'Posts', 'Newspaper_Staff', 'Annotations', 'Annotations_consolidated', 'CrossValSplit', 'Categories']
print('Test: Count tables...')
for table in all_tables:
cur.execute(f'SELECT COUNT(*) FROM {table}')
for row in cur.fetchall():
print(f'{table} count = {row[0]}')
print()
print('Test: List Categories...')
cur.execute(f'SELECT Name, Ord FROM Categories')
for row in cur.fetchall():
print(f'Ord = {row[1]}, Name = {row[0]}')
print()
print('Test: Training data...')
x, y = get_training(conn)
print(f'x[0] = {x[0]}')
print(f'y[0] = {y[0]}')
print(f'Training count = {x.shape[0]}')
print()
print('Test: Categories apply...')
for c in categories:
i = categories[c]
a = np.sum(y[:,i] == 1)
print(f'{c}: applies = {a} ({100 * a / y.shape[0]:.2f} %)')