-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_doc_trectext.py
52 lines (40 loc) · 1.07 KB
/
extract_doc_trectext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Reads a trectext file and writes a "docno\ttext" line for each document
# to <filename>.num_query.
# Example usage:
#   python3 extract_doc_trectext.py filename
import sys
import json
import re
# Path of the trectext input file, taken from the first command-line argument.
fname = sys.argv[1]
# Results are written next to the input, under the same name plus a suffix.
out_file = fname + '.num_query'
# Last '/'-separated component of the input path.
fname_pure = fname.rsplit('/', 1)[-1]
queries = []
# Running totals: documents with non-empty text vs. documents without.
count_succ = count_err = 0
# Accumulates the raw lines of the document currently being read.
doc = ''
def filter(text):
    """Normalize *text* to lower-case ASCII letters, digits and spaces.

    The text is lower-cased first; afterwards every run of characters that
    is not an ASCII digit, ASCII letter or a space is replaced by a single
    space. Existing spaces are kept as they are, so consecutive spaces can
    remain in the result.

    NOTE: within this module the name shadows the built-in ``filter``.
    """
    lowered = text.lower()
    disallowed_run = '[^0-9a-zA-Z ]+'
    return re.sub(disallowed_run, ' ', lowered)
def extract(doc):
    """Pull the document number and the cleaned text out of one document.

    Args:
        doc: Raw markup of a single document as a string.

    Returns:
        A ``(docno, text)`` tuple. ``docno`` is the content of the last
        ``<DOCNO>...</DOCNO>`` element exactly as it appears (surrounding
        whitespace is not removed here), or ``''`` when there is none.
        ``text`` is the content of the last ``<TEXT>...</TEXT>`` element
        passed through ``filter``, or ``''`` when there is none.
    """
    # DOTALL lets an element span several lines.
    texts = re.findall(r'<TEXT>(.*?)<\/TEXT>', doc, re.DOTALL)
    nums = re.findall(r'<DOCNO>(.*?)<\/DOCNO>', doc, re.DOTALL)
    # A document without <DOCNO> used to raise IndexError on num[-1];
    # fall back to an empty identifier instead of aborting the whole run.
    docno = nums[-1] if nums else ''
    body = filter(texts[-1]) if texts else ''
    return docno, body
def make_elem(num, text):
    """Build one output record: *num*, a tab, *text*, then a newline."""
    pieces = (num, '\t', text, '\n')
    return ''.join(pieces)
# Stream the input line by line, collecting lines until one containing
# '</DOC>' closes the current document, then write that document's record.
# Lines after the last '</DOC>' are never written.
doc_lines = []
# The input is opened first so that a missing or unreadable input file does
# not leave behind a freshly created, empty output file.
with open(fname, 'r') as f, open(out_file, 'w') as out:
    for line in f:
        doc_lines.append(line)
        if '</DOC>' not in line:
            continue
        doc_name, doc_text = extract(''.join(doc_lines))
        doc_lines = []
        # Strip before counting: text made only of whitespace is written as
        # an empty field, so it has to be counted as an empty document.
        doc_name = doc_name.strip()
        doc_text = doc_text.strip()
        if doc_text:
            count_succ += 1
        else:
            count_err += 1
        out.write(make_elem(doc_name, doc_text))
print('Extracted {} Docs, with {} Empty Docs.'.format(count_succ, count_err))