-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_lists_from_db.py
executable file
·235 lines (200 loc) · 7.7 KB
/
read_lists_from_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import codecs
from collections import Counter
import csv
import string
from nltk import FreqDist
import nltk
from psycopg2._psycopg import DatabaseError
import sys
from parsers.string_parser import StringParser, latin_letters
import psycopg2
__author__ = 'Katharine'
# SECURITY NOTE(review): database credentials are hard-coded in source;
# move them to environment variables / a config file kept out of version control.
conn_string = "dbname='nlstudent' user = 'nlstudent' password ='2015pink'"
# Module-level connection and cursor shared by every function below
# (connecting happens as a side effect of importing this module).
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()
# Project text parser; f.parse(text, is_list) returns a dict (see get_list_dists).
f = StringParser()
# Scratch dict left over from the commented-out script below; it is still
# iterated (always empty) inside get_list_dists_for — looks stale, TODO confirm.
s = {}
# SQL templates; each takes a single %s parameter unless noted otherwise.

# Name + description of every list the given member belongs to.
get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec as l JOIN
list_member_rec as lm ON l.list_id = lm.list_id
where lm.member_id = %s"""
# Per-list membership counts for the given member.
get_listcount_for_member = """SELECT count FROM member_list_count_rec as lm JOIN
list_rec as l ON l.list_id = lm.list_id
where lm.member_id = %s;"""
# Name + description for a single list id.
get_listinfo_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l
where list_id = %s"""
# Id, name and description for every list (no parameters).
get_all_listinfo_for_all_lists = """SELECT l.list_id,l.list_name,l.list_description FROM list_rec as l"""
# NOTE(review): despite the "store" name this is a SELECT identical to
# get_listinfo_for_list — appears to be a copy-paste placeholder; confirm intent.
store_wordfreq_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l
where list_id = %s"""
# Name + description of every list (no parameters).
get_all_lists = """SELECT l.list_name,l.list_description FROM list_rec as l """
# Membership probe against the word table.
def word_not_in_db(word):
    """Return True when *word* has no row in list_word_rec."""
    cursor.execute("SELECT word_id FROM list_word_rec WHERE word = %s", (word,))
    row = cursor.fetchone()
    return row is None
# knowing the word is in db, get its word_id
def get_word_id(word):
    """Return the scalar word_id for *word*, or None if it is absent.

    Bug fix: the original returned the raw fetchone() 1-tuple, which is
    inconsistent with the ``fetchone()[0]`` extraction used in the insert
    path of this file and would store a tuple where an int is expected.
    """
    cursor.execute("SELECT word_id FROM list_word_rec WHERE word = %s", (word,))
    row = cursor.fetchone()
    # Guard against a missing word instead of raising TypeError on row[0].
    return row[0] if row is not None else None
def delete_word_frequency_rec(article_id):
    """Delete every word_frequency_rec row for *article_id* and commit."""
    # Local name no longer shadows the function's own name.
    sql = "DELETE FROM word_frequency_rec WHERE article_id = %s;"
    cursor.execute(sql, [article_id])
    conn.commit()
def isEmpty(self, dictionary):
    """Return True when *dictionary* (or any sized container) has no entries.

    Bug fix: the original returned True as soon as it met a truthy element,
    i.e. it answered "is non-empty" — `isEmpty({})` returned False. The stray
    ``self`` parameter (this is a module-level function, not a method) is
    kept so any existing callers keep working.
    """
    return len(dictionary) == 0
# # read member id from a file
# reader = csv.reader(open('memberid.csv', 'r'))
#
# for current_line in reader:
# curr_string = ''.join(current_line)
# member_id = int(curr_string)
#
# cursor.execute(get_listcount_for_member,[member_id])
# mlistcount = cursor.fetchall()[0]
# cursor.execute(get_listinfo_for_member,[member_id])
#
# tstrout = ''
# rows = cursor.fetchall()
# for row in rows:
# # for current_line in reader:
# c_line=str(row)
# # c_line=str(current_line.strip())
# c_line = ''.join(filter(lambda x: x in string.printable, c_line))
# # print(c_line)
# if len(c_line):
# s = f.parse(c_line,True)
# strout = ''
# if s is not None:
# for item in s.items():
# # print(1,str(item[1]))
# word = []
# for word in item[1]:
# strout = strout+' '+word
# if len(strout):
# tstrout = tstrout+' '+strout
# # print(tstrout)
# words = nltk.tokenize.word_tokenize(tstrout)
# the_list_dist = FreqDist(words)
#
# print('for',member_id,'on',mlistcount[0],'lists:',the_list_dist.most_common(20))
def get_list_dists_for(member_id):
    """Return a one-line summary of the 20 most frequent tokens across the
    names/descriptions of every list *member_id* belongs to.

    Parameters: member_id -- numeric member id bound into the SQL lookup.
    Returns: str like "<id> on <n> lists: [(word, count), ...]".
    """
    print(member_id, file=sys.stderr)
    cursor.execute(get_listinfo_for_member, [member_id])
    tstrout = ''
    rows = cursor.fetchall()
    for row in rows:
        # Keep only printable ASCII from the stringified (name, description) row.
        c_line = ''.join(filter(lambda x: x in string.printable, str(row)))
        if len(c_line):
            parsed_text = f.parse(c_line, True)
            strout = ''
            if parsed_text is not None:
                # BUG FIX: iterate the freshly parsed result, not the stale
                # module-level dict ``s`` (always empty), which made the
                # frequency distribution come out empty for every member.
                for item in parsed_text.items():
                    for word in item[1]:
                        strout = strout + ' ' + word
            if len(strout):
                tstrout = tstrout + ' ' + strout
    words = nltk.tokenize.word_tokenize(tstrout)
    the_list_dist = FreqDist(words)
    return str(member_id) + " on " + str(len(rows)) + " lists: " + str(the_list_dist.most_common(20))
def get_list_dists(list_id):
    """Build a {token: count} frequency dict for one list's name/description.

    Words from the parsed text keep their real counts; each bigram and
    entity contributes a count of 1.
    """
    cursor.execute(get_listinfo_for_list, [list_id])
    tally = Counter()
    for record in cursor:
        combined = ' '.join(record)
        parsed = f.parse(combined, is_list=True)
        if parsed is not None:
            tally.update(parsed['text'])
            tally.update(dict.fromkeys(parsed['bigrams'], 1))
            tally.update(dict.fromkeys(parsed['entities'], 1))
    return {token: count for token, count in tally.items()}
# # read list ids from a file
# byte_reader = open('testdata.csv', 'rb')
#
# byte_reader = codecs.open('testdata.csv', 'r')
# #
# read list ids from database
cursor.execute(get_all_listinfo_for_all_lists)
# Write a crude per-list word-frequency report to listinfo.txt.
with open("listinfo.txt", "wt") as out_file:
    for current_line in cursor:
        # convert to string
        # NOTE(review): current_line is a row tuple
        # (list_id, list_name, list_description); the str()+split(',') below
        # will mis-parse any name/description that itself contains a comma —
        # confirm that is acceptable, or unpack the tuple fields directly.
        current_string = current_line
        # print(len(current_line),current_line)
        if not len(current_string):
            print('empty string... go to next record')
            continue
        print(current_string)
        current_string = str(current_string)
        current_string = current_string.split(',')
        # First piece of str(tuple) looks like "(123" — strip the paren to
        # recover the list id.
        list_id = current_string[0]
        list_id = list_id.lstrip('(')
        # print(list_id)
        if not len(list_id):
            print('list_id,',list_id,' zero length... go to next record')
            continue
        if not list_id.isdigit():
            print('list_id not numeric... go to next record')
            continue
        # Remaining pieces are the name/description text to analyse.
        curr_string = ' '.join(current_string[1:])
        # print('starting with text :', curr_string)
        # print('working on list :',list_id)
        # list_dict = get_list_dists(list_id)
        # Tally word frequencies; bigrams and entities count once each.
        c = Counter()
        parsed_text = f.parse(curr_string, is_list=True)
        # print(parsed_text)
        if parsed_text is not None:
            c.update(parsed_text['text'])
            c.update(dict.fromkeys(parsed_text['bigrams'], 1))
            c.update(dict.fromkeys(parsed_text['entities'], 1))
        # else:
        #     print('parsed text is empty')
        # cycle through dictionary
        # Write a file
        if not c.items():
            # No tokens at all: record an "empty" marker line for this list.
            out_file.write('\n')
            out_file.write('for list_id = ')
            out_file.write(list_id)
            out_file.write('parsed text is empty\n')
            # print('parsed text is empty')
        else:
            out_file.write('\nlist_id: ')
            out_file.write(list_id)
            # Pad with tabs so the word columns roughly line up by id width.
            if len(list_id)>6:
                out_file.write('\t')
            elif len(list_id)>3:
                out_file.write('\t\t')
            else:
                out_file.write('\t\t\t')
            # Only the words are written; ``frequency`` is currently unused
            # except by the commented-out DB-storage code kept below.
            for word,frequency in c.items():
                out_file.write(word)
                out_file.write(' ')
                # print(word)
                # print('storing words')
                # cursor.execute("SELECT word_id FROM list_word_rec WHERE word = %s", (word,))
                #
                # if word_not_in_db(word):
                #     #it's not in database so store this as a new word
                #     store_word_rec = """INSERT INTO list_word_rec(word_id,word)
                #                         VALUES (DEFAULT,%s)
                #                         RETURNING word_id """
                #     cursor.execute(store_word_rec,[word])
                #     #store the current value of word_id
                #     word_id = cursor.fetchone()[0]
                #     conn.commit()
                #
                # else:
                #     word_id = get_word_id(word)
                #
                # store_list_wf_rec = """INSERT INTO list_wf_rec(list_id,word_id,frequency)
                #                        VALUES (%s,%s,%s) """
                # cursor.execute(store_list_wf_rec,(list_id,word_id,frequency))
                #
                # conn.commit()