# create_lj_subsample_dataset.py
import logging
from functools import partial
from multiprocessing import pool
from random import shuffle, seed

import psycopg2

from inquire_sql_backend.query.util import is_russian
from inquire_sql_backend.semantics.embeddings.util import tokenize

logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(name)-18s: %(message)s",
    level=logging.DEBUG,
)
log = logging.getLogger(__name__)

DEC2FLOAT = psycopg2.extensions.new_type(
    psycopg2.extensions.DECIMAL.values,
    'DEC2FLOAT',
    lambda value, curs: float(value) if value is not None else None,
)
psycopg2.extensions.register_type(DEC2FLOAT)
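# psycopg2 returns NUMERIC/DECIMAL columns as decimal.Decimal by default; the
# typecast registered above makes cursors in this process hand back plain
# floats instead. Illustrative example (not part of this script):
#     cur.execute("SELECT 1.5::numeric")
#     cur.fetchone()[0]  # -> 1.5 (float), not Decimal('1.5')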
conn = psycopg2.connect("host=localhost dbname=inquire_newparse user=postgres password=12344321")
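# The connection is created once at import time with hardcoded local
# credentials; only the main process uses it -- the worker pool below runs
# tokenize() on plain strings and never touches the database.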

def sample_journals_from_db(filename, word_count):
    """Sample non-Russian LiveJournal sentences user by user until at least
    `word_count` tokens have been written to `filename` (one tokenized,
    lowercased sentence per line). Sampled user ids go to
    `filename + ".usernames"`."""
    log.debug("Getting usernames..")
    all_users = get_all_users()
    log.debug("Found %s users." % len(all_users))
    log.debug("Collecting at least %s words worth of journals.." % word_count)
    curr_word_count = 0
    # Worker pool for tokenization (pool size is hardcoded to 14).
    p = pool.Pool(14)
    with open(filename, "w") as outf, open(filename + ".usernames", "w") as outfusers:
        for idx, userid in enumerate(all_users):
            if idx % 100 == 0:
                log.debug(
                    "Processed %s users, %s words so far (%.1f%%)."
                    % (idx, curr_word_count, curr_word_count / word_count * 100)
                )
            tmp_cur = conn.cursor()
            tmp_cur.execute(
                """
                SELECT
                    s.sent_text
                FROM
                    post_sents s,
                    posts p
                WHERE
                    p.userid = %s AND
                    s.post_id = p.post_id
                ORDER BY
                    s.post_id ASC,
                    s.sent_num ASC;
                """, (userid,))
            # Skip Russian sentences, strip apostrophes, and convert the first
            # pair of double quotes to Penn-Treebank-style `` and '' tokens
            # before lowercasing.
            filtered = (
                row[0].replace("'", "").replace('"', " `` ", 1).replace('"', " '' ", 1).lower()
                for row in tmp_cur if not is_russian(row[0])
            )
            tokens_it = p.imap(partial(tokenize, remove_stop=False), filtered, chunksize=1000)
            outfusers.write("%s\n" % userid)
            for sent in tokens_it:
                curr_word_count += len(sent)
                outf.write(" ".join(sent) + "\n")
            tmp_cur.close()
            # Check after each user, so the sample may overshoot `word_count`
            # by up to one user's worth of text ("at least" semantics).
            if curr_word_count >= word_count:
                return
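
# Note: the early return above leaves the worker pool alive until the process
# exits. A minimal alternative sketch (not the original flow) would scope the
# pool with a context manager, which terminates the workers on exit:
#
#     with pool.Pool(14) as p:
#         ...  # the per-user sampling loop
#
# multiprocessing.Pool has supported the context-manager protocol since
# Python 3.3.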

def get_all_users():
    """Return all user ids from the users table in a deterministic
    pseudo-random order."""
    seed(123)
    tmp_cur = conn.cursor()
    tmp_cur.execute(
        """
        SELECT
            u.userid
        FROM
            users u;
        """)
    # Sort first so that the shuffle afterwards is deterministic for our seed.
    all_users = sorted([int(row[0]) for row in tmp_cur])
    shuffle(all_users)
    return all_users
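
# Because get_all_users() seeds the RNG and sorts before shuffling, repeated
# runs visit users in the same pseudo-random order, making the subsample
# reproducible as long as the users table itself is unchanged.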

if __name__ == '__main__':
    fname = "/commuter/inquire_data_root/livejournal_sample/dataset/billion_words.txt"
    sample_journals_from_db(fname, 984845269)  # the number of words in BookCorpus
    log.debug("Wrote sampled dataset to %s" % fname)