testReader.py
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 9 10:15:49 2018
@author: Deep
"""
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import collections
import numpy as np
from keras.preprocessing.sequence import pad_sequences
english_stopwords = stopwords.words('english')
# Expand contractions and strip characters that carry no meaning
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
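
# Illustrative only: the input sentence below is made up, but the output shows
# how clean_str expands contractions and pads punctuation with spaces.
"""
clean_str("Peter's dog isn't here, (sadly)!")
# -> "peter 's dog is n't here , ( sadly ) !"
"""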
def get_sqac_pairs_tokenizes(qa_dir, ins_dir):
    # Read the question/answer file and drop the unused second column
    data_qa = list(open(qa_dir, 'r'))
    qac_pairs = []
    qac_list = []
    for line in data_qa:
        line = line.strip()
        line = line.split("\t")
        del line[1]
        qac_list.append(line)
    d = list(open("H:/SemEval t11/data/test_qa.txt", 'r'))  # hard-coded path
    c = []
    for line in d:
        line = line.strip()
        line = line.split("\t")
        del line[1]
        c.append(line)
    # Append the first column of the second file (used below as the story index)
    # to each QA record
    for m in range(len(qac_list)):
        qac_list[m].append(c[m][0])
    # Expand each record into (story_idx, question, answer, is_correct) pairs,
    # one per candidate answer
    for item in qac_list:
        story_idx = item[6]
        question = item[1]
        answer = [item[3], item[4]]
        correct = int(item[5])
        for a in answer:
            # torf marks whether this candidate is the annotated correct answer
            if correct == 0:
                torf = (a == answer[0])
            else:
                torf = (a != answer[0])
            qac_pairs.append([story_idx, question, a, torf])
    # Read the instance file, one story record per line
    data_instance = list(open(ins_dir, 'r'))
    story = []
    sqac_pairs = []
    for line in data_instance:
        line = line.strip()
        line = line.split("\t")
        story.append(line)
    del story[0]  # skip the first line
    # Join the stories with their QA pairs on the story index
    for item in story:
        for item1 in qac_pairs:
            if item1[0] == item[0]:
                sqac_pairs.append([item[2], item1[1], item1[2], item1[3]])
    # Clean and tokenize every (story, question, answer) triple
    sqac_pairs_tokenizes = []
    tt = TweetTokenizer()
    for item in sqac_pairs:
        item[0] = clean_str(item[0])
        story = item[0]
        item[1] = clean_str(item[1])
        question = item[1]
        item[2] = clean_str(item[2])
        answer = item[2]
        # Tokenize
        swords = tt.tokenize(story)
        qwords = tt.tokenize(question)
        awords = tt.tokenize(answer)
        is_correct = item[3]
        # Remove stopwords (only from the story)
        swords = [sw for sw in swords if sw not in english_stopwords]
        # qwords = [qw for qw in qwords if qw not in english_stopwords]
        # awords = [aw for aw in awords if aw not in english_stopwords]
        # Remove punctuation (disabled)
        # swords = [sw for sw in swords if sw not in english_punctuations]
        # qwords = [qw for qw in qwords if qw not in english_punctuations]
        # awords = [aw for aw in awords if aw not in english_punctuations]
        # Lemmatization (disabled)
        # swords = [lemmatizer.lemmatize(sw) for sw in swords]
        # swords = [lemmatizer.lemmatize(sw, pos='v') for sw in swords]
        #
        # qwords = [lemmatizer.lemmatize(qw) for qw in qwords]
        # qwords = [lemmatizer.lemmatize(qw, pos='v') for qw in qwords]
        #
        # awords = [lemmatizer.lemmatize(aw) for aw in awords]
        # awords = [lemmatizer.lemmatize(aw, pos='v') for aw in awords]
        sqac_pairs_tokenizes.append([swords, qwords, awords, is_correct])
    return sqac_pairs_tokenizes
"""
测试代码,返回的是[[story的分词形式], [question的分词形式], [answer的分词形式], [True/False]]的四元组形式
qa_dir = "C:/Users/Deep/Desktop/gold_data_task_11/parsed/test_data_qa.txt"
ins_dir = "C:/Users/Deep/Desktop/gold_data_task_11/parsed/test_data_ins.txt"
sqac_pairs_tokenizes = get_sqac_pairs_tokenizes(qa_dir, ins_dir)
"""
# Build the vocabulary over the (story, question, answer) triples;
# indices start at 1, with 0 reserved as the mask/padding value
def build_vocab_from_sqac_pairs_tokenizes(sqac_pairs_tokenizes):
    wordcounts = collections.Counter()
    for sqatriple in sqac_pairs_tokenizes:
        for sword in sqatriple[0]:
            wordcounts[sword] += 1
        for qword in sqatriple[1]:
            wordcounts[qword] += 1
        for aword in sqatriple[2]:
            wordcounts[aword] += 1
    words = [wordcount[0] for wordcount in wordcounts.most_common()]
    word2idx = {w: i+1 for i, w in enumerate(words)}  # 0 = mask
    return word2idx
"""
测试代码,返回{word:idx}的字典,idx越小代表该词出现的频率越高,索引从1开始,0作为mask
sqac_pairs_tokenizes = get_sqac_pairs_tokenizes(qa_dir, ins_dir)
word2idx = build_vocab_from_sqac_pairs_tokenizes(sqac_pairs_tokenizes)
"""
# Vectorize story, question and answer (each word is replaced by its vocabulary
# index; a smaller value means the word occurs more frequently).
# Each of the three sequences is padded with trailing zeros to its own maximum length.
# Returns Xs (story vectors), Xq (question vectors), Xa (answer vectors) and
# Y (a samples*2 matrix where [1, 0] means True and [0, 1] means False).
def vectorize_sqac_pairs_tokenizes(sqac_pairs_tokenizes, word2idx, story_maxlen,
                                   question_maxlen, answer_maxlen):
    Xs, Xq, Xa, Y = [], [], [], []
    for sqatriple in sqac_pairs_tokenizes:
        Xs.append([word2idx[sword] for sword in sqatriple[0]])
        Xq.append([word2idx[qword] for qword in sqatriple[1]])
        Xa.append([word2idx[aword] for aword in sqatriple[2]])
        Y.append(np.array([1, 0]) if sqatriple[3] else np.array([0, 1]))
    return (pad_sequences(Xs, maxlen=story_maxlen, padding="post"),
            pad_sequences(Xq, maxlen=question_maxlen, padding="post"),
            pad_sequences(Xa, maxlen=answer_maxlen, padding="post"),
            np.array(Y))
"""
测试代码
sqac_pairs_tokenizes = get_sqac_pairs_tokenizes(qa_dir, ins_dir)
word2idx = build_vocab_from_sqac_pairs_tokenizes(sqac_pairs_tokenizes)
story_maxlen = max([len(sqatriple[0]) for sqatriple in sqac_pairs_tokenizes])
question_maxlen = max([len(sqatriple[1]) for sqatriple in sqac_pairs_tokenizes])
answer_maxlen = max([len(sqatriple[2]) for sqatriple in sqac_pairs_tokenizes])
Xs, Xq, Xa = vectorize_sqac_pairs_tokenizes(sqac_pairs_tokenizes, word2idx, story_maxlen,
question_maxlen, answer_maxlen)
"""