-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathc4200m_get_target_sentences.py
65 lines (55 loc) · 1.93 KB
/
c4200m_get_target_sentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""Looks up C4 sentences by their hashes and stores them in a TSV file."""
import hashlib
import heapq
from absl import app
import tensorflow_datasets as tfds
# A progress message is printed whenever the index of the C4 example just
# processed is a multiple of this value.
LOGGING_STEPS = 100000
def main(argv):
    """Look up C4 sentences by their MD5 hashes and write them to a TSV file.

    Every line of every C4 example is hashed with MD5 (over its UTF-8 bytes)
    and kept if its hex digest appears in the first column of the edits TSV.
    The output contains one ``<md5>\\t<sentence>`` line per sentence found,
    ordered by hash.

    Args:
        argv: Command-line arguments. ``argv[1]`` is the path of the edits
            TSV whose first tab-separated column holds the target hashes,
            ``argv[2]`` is the path of the TSV to write, and the optional
            ``argv[3]`` is a language code. If it is omitted or equal to
            ``"en"``, the ``"train"`` split of ``c4/en:2.2.1`` is searched;
            otherwise the split of ``c4/multilingual:3.1.0`` named by the
            language code is searched.

    Raises:
        app.UsageError: If ``argv`` does not have exactly 3 or 4 entries.
    """
    if len(argv) not in (3, 4):
        raise app.UsageError(
            "python3 c4200m_get_target_sentences.py <edits-tsv> <output-tsv>"
            " [<lang>]"
        )
    edits_tsv_path = argv[1]
    output_tsv_path = argv[2]
    if len(argv) == 4 and argv[3] != "en":
        tfds_name = "c4/multilingual:3.1.0"
        split = argv[3]
    else:
        tfds_name = "c4/en:2.2.1"
        split = "train"

    print("Loading C4_200M target sentence hashes from %r..." % edits_tsv_path)
    remaining_hashes = set()
    # The edits column may hold non-ASCII text, so decode explicitly as UTF-8
    # instead of relying on the locale's default encoding.
    with open(edits_tsv_path, encoding="utf-8") as edits_tsv_reader:
        for tsv_line in edits_tsv_reader:
            # Drop the line terminator first so that a line without a tab does
            # not yield a hash with a trailing newline, and skip blank lines:
            # such keys could never match and would prevent the early exit.
            target_hash = tsv_line.rstrip("\n").split("\t", 1)[0]
            if target_hash:
                remaining_hashes.add(target_hash)

    print(
        "Searching for %d target sentences in the dataset %r split %r..."
        % (len(remaining_hashes), tfds_name, split)
    )
    target_sentences = []
    # Do not load the dataset at all when there is nothing to look for.
    examples = tfds.load(tfds_name, split=split) if remaining_hashes else ()
    for num_done_examples, example in enumerate(examples):
        for line in example["text"].numpy().decode("utf-8").split("\n"):
            line_md5 = hashlib.md5(line.encode("utf-8")).hexdigest()
            if line_md5 in remaining_hashes:
                target_sentences.append((line_md5, line))
                # Removing the hash keeps only the first occurrence of each
                # sentence and lets the search stop once all are found.
                remaining_hashes.remove(line_md5)
        if not remaining_hashes:
            break
        if num_done_examples % LOGGING_STEPS == 0:
            print(
                "-- %d C4 examples done, %d sentences still to be found"
                % (num_done_examples, len(remaining_hashes))
            )

    print(
        "Found %d target sentences (%d not found)."
        % (len(target_sentences), len(remaining_hashes))
    )
    print("Writing C4_200M sentence pairs to %r..." % output_tsv_path)
    # Hashes are unique in the list, so sorting the (hash, sentence) tuples
    # orders the output by hash.
    target_sentences.sort()
    # Sentences were decoded from UTF-8 and may contain any Unicode character,
    # so the output encoding must not depend on the locale.
    with open(output_tsv_path, "w", encoding="utf-8") as output_tsv_writer:
        output_tsv_writer.writelines(
            "%s\t%s\n" % hash_and_sentence
            for hash_and_sentence in target_sentences
        )
# When executed as a script, hand main() to app.run (app is imported from
# absl).
if __name__ == "__main__":
    app.run(main)