forked from bcaitech1/p3-mrc-gaama
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils_qa.py
411 lines (368 loc) · 17.6 KB
/
utils_qa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Pre-processing
Post-processing utilities for question answering.
"""
import pickle
import collections
import json
import logging
import os
from typing import Optional, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from scipy.sparse import csr_matrix
import numpy as np
from tqdm.auto import tqdm
from konlpy.tag import Mecab
import torch
import random
from transformers import is_torch_available, PreTrainedTokenizerFast
from transformers.trainer_utils import get_last_checkpoint
logger = logging.getLogger(__name__)
mecab = Mecab()
def tokenize(text):
# return text.split(" ")
return mecab.morphs(text)
def set_seed(seed: int):
"""
Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
installed).
Args:
seed (:obj:`int`): The seed to set.
"""
random.seed(seed)
np.random.seed(seed)
if is_torch_available():
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed) # if use multi-GPU
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
def postprocess_qa_predictions(
examples,
features,
predictions: Tuple[np.ndarray, np.ndarray],
version_2_with_negative: bool = False,
n_best_size: int = 20,
max_answer_length: int = 30,
null_score_diff_threshold: float = 0.0,
output_dir: Optional[str] = None,
prefix: Optional[str] = None,
is_world_process_zero: bool = True,
):
"""
Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
original contexts. This is the base postprocessing functions for models that only return start and end logits.
Args:
examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the underlying dataset contains examples with no answers.
n_best_size (:obj:`int`, `optional`, defaults to 20):
The total number of n-best predictions to generate when looking for an answer.
max_answer_length (:obj:`int`, `optional`, defaults to 30):
The maximum length of an answer that can be generated. This is needed because the start and end predictions
are not conditioned on one another.
null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
The threshold used to select the null answer: if the best answer has a score that is less than the score of
the null answer minus this threshold, the null answer is selected for this example (note that the score of
the null answer for an example giving several features is the minimum of the scores for the null answer on
each feature: all features must be aligned on the fact they `want` to predict a null answer).
Only useful when :obj:`version_2_with_negative` is :obj:`True`.
output_dir (:obj:`str`, `optional`):
If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
:obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
answers, are saved in `output_dir`.
prefix (:obj:`str`, `optional`):
If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether this process is the main process or not (used to determine if logging/saves should be done).
"""
assert (
len(predictions) == 2
), "`predictions` should be a tuple with two elements (start_logits, end_logits)."
all_start_logits, all_end_logits = predictions
assert len(predictions[0]) == len(
features
), f"Got {len(predictions[0])} predictions and {len(features)} features."
# Build a map example to its corresponding features.
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
features_per_example[example_id_to_index[feature["example_id"]]].append(i)
# The dictionaries we have to fill.
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
if version_2_with_negative:
scores_diff_json = collections.OrderedDict()
# Logging.
logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
logger.info(
f"Post-processing {len(examples)} example predictions split into {len(features)} features."
)
# Let's loop over all the examples!
for example_index, example in enumerate(tqdm(examples)):
# Those are the indices of the features associated to the current example.
feature_indices = features_per_example[example_index]
min_null_prediction = None
prelim_predictions = []
# Looping through all the features associated to the current example.
for feature_index in feature_indices:
# We grab the predictions of the model for this feature.
start_logits = all_start_logits[feature_index]
end_logits = all_end_logits[feature_index]
# This is what will allow us to map some the positions in our logits to span of texts in the original
# context.
offset_mapping = features[feature_index]["offset_mapping"]
# Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
# available in the current feature.
token_is_max_context = features[feature_index].get(
"token_is_max_context", None
)
# Update minimum null prediction.
feature_null_score = start_logits[0] + end_logits[0]
if (
min_null_prediction is None
or min_null_prediction["score"] > feature_null_score
):
min_null_prediction = {
"offsets": (0, 0),
"score": feature_null_score,
"start_logit": start_logits[0],
"end_logit": end_logits[0],
}
# Go through all possibilities for the `n_best_size` greater start and end logits.
start_indexes = np.argsort(start_logits)[
-1 : -n_best_size - 1 : -1
].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
for start_index in start_indexes:
for end_index in end_indexes:
# Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
# to part of the input_ids that are not in the context.
if (
start_index >= len(offset_mapping)
or end_index >= len(offset_mapping)
or offset_mapping[start_index] is None
or offset_mapping[end_index] is None
):
continue
# Don't consider answers with a length that is either < 0 or > max_answer_length.
if (
end_index < start_index
or end_index - start_index + 1 > max_answer_length
):
continue
# Don't consider answer that don't have the maximum context available (if such information is
# provided).
if (
token_is_max_context is not None
and not token_is_max_context.get(str(start_index), False)
):
continue
prelim_predictions.append(
{
"offsets": (
offset_mapping[start_index][0],
offset_mapping[end_index][1],
),
"score": start_logits[start_index] + end_logits[end_index],
"start_logit": start_logits[start_index],
"end_logit": end_logits[end_index],
}
)
if version_2_with_negative:
# Add the minimum null prediction
prelim_predictions.append(min_null_prediction)
null_score = min_null_prediction["score"]
# Only keep the best `n_best_size` predictions.
predictions = sorted(
prelim_predictions, key=lambda x: x["score"], reverse=True
)[:n_best_size]
# Add back the minimum null prediction if it was removed because of its low score.
if version_2_with_negative and not any(
p["offsets"] == (0, 0) for p in predictions
):
predictions.append(min_null_prediction)
# Use the offsets to gather the answer text in the original context.
context = example["context"]
for pred in predictions:
offsets = pred.pop("offsets")
pred["text"] = context[offsets[0] : offsets[1]]
# In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
# failure.
if len(predictions) == 0 or (
len(predictions) == 1 and predictions[0]["text"] == ""
):
predictions.insert(
0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}
)
# Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
# the LogSumExp trick).
scores = np.array([pred.pop("score") for pred in predictions])
exp_scores = np.exp(scores - np.max(scores))
probs = exp_scores / exp_scores.sum()
# Include the probabilities in our predictions.
for prob, pred in zip(probs, predictions):
pred["probability"] = prob
# Pick the best prediction. If the null answer is not possible, this is easy.
if not version_2_with_negative:
# strip 추가
# all_predictions[example["id"]] = predictions[0]["text"]
all_predictions[example["id"]] = predictions[0]["text"].strip()
else:
# Otherwise we first need to find the best non-empty prediction.
i = 0
while predictions[i]["text"] == "":
i += 1
best_non_null_pred = predictions[i]
# Then we compare to the null prediction using the threshold.
score_diff = (
null_score
- best_non_null_pred["start_logit"]
- best_non_null_pred["end_logit"]
)
scores_diff_json[example["id"]] = float(
score_diff
) # To be JSON-serializable.
if score_diff > null_score_diff_threshold:
all_predictions[example["id"]] = ""
else:
# all_predictions[example["id"]] = best_non_null_pred["text"]
all_predictions[example["id"]] = best_non_null_pred["text"].strip()
# Make `predictions` JSON-serializable by casting np.float back to float.
all_nbest_json[example["id"]] = [
{
k: (
float(v)
if isinstance(v, (np.float16, np.float32, np.float64))
else v
)
for k, v in pred.items()
}
for pred in predictions
]
# If we have an output_dir, let's save all those dicts.
if output_dir is not None:
assert os.path.isdir(output_dir), f"{output_dir} is not a directory."
prediction_file = os.path.join(
output_dir,
"predictions.json" if prefix is None else f"predictions_{prefix}".json,
)
nbest_file = os.path.join(
output_dir,
"nbest_predictions.json"
if prefix is None
else f"nbest_predictions_{prefix}".json,
)
if version_2_with_negative:
null_odds_file = os.path.join(
output_dir,
"null_odds.json" if prefix is None else f"null_odds_{prefix}".json,
)
logger.info(f"Saving predictions to {prediction_file}.")
with open(prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4, ensure_ascii=False) + "\n")
logger.info(f"Saving nbest_preds to {nbest_file}.")
with open(nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4, ensure_ascii=False) + "\n")
if version_2_with_negative:
logger.info(f"Saving null_odds to {null_odds_file}.")
with open(null_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4, ensure_ascii=False) + "\n")
return all_predictions
def check_no_error(training_args, data_args, tokenizer, datasets):
# Detecting last checkpoint.
last_checkpoint = None
if (
os.path.isdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Tokenizer check: this script requires a fast tokenizer.
if not isinstance(tokenizer, PreTrainedTokenizerFast):
raise ValueError(
"This example script only works for models that have a fast tokenizer. Checkout the big table of models "
"at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
"requirement"
)
if data_args.max_seq_length > tokenizer.model_max_length:
logger.warn(
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
if "validation" not in datasets:
raise ValueError("--do_eval requires a validation dataset")
return last_checkpoint, max_seq_length
class BM25(object):
def __init__(self, tokenizer=tokenize, ngram_range=(1, 2), max_features=50000, b=0.75, k1=1.6):
# 만약 tfidfvectorizer가 있으면 불러와서 저장, fit함수 저장
tfidfv_path = '/opt/ml/input/data/tfidv.bin'
if os.path.isfile(tfidfv_path):
with open(tfidfv_path, "rb") as file:
self.vectorizer = pickle.load(file)
self.is_fit = True
print('load the tfidfv')
else :
self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False,
tokenizer=tokenize, ngram_range=(1, 2), max_features=max_features)
self.is_fit = False
self.b = b
self.k1 = k1
def fit_transform(self, context):
if not self.is_fit :
self.vectorizer.fit(context)
self.is_fit = True
y = super(TfidfVectorizer, self.vectorizer).transform(context)
self.avdl = y.sum(1).mean()
b, k1, avdl = self.b, self.k1, self.avdl
len_y = y.sum(1).A1
y = y.tocsc()
denom = y + (k1 * (1 - b + b * len_y / avdl))[:, None]
numer = y * (k1 + 1)
p_embedding = (numer / denom)
return csr_matrix(p_embedding, dtype = np.float16)
def fit(self, X):
""" Fit IDF to documents X """
if not self.is_fit :
self.vectorizer.fit(X)
self.is_fit = True
y = super(TfidfVectorizer, self.vectorizer).transform(X)
self.avdl = y.sum(1).mean()
def transform(self, X):
X = super(TfidfVectorizer, self.vectorizer).transform(X)
assert sparse.isspmatrix_csr(X)
idf = self.vectorizer._tfidf.transform(X, copy=False)
#idf.todense()
#idf.data -= 1
#idf.tocsc()
return idf