-
Notifications
You must be signed in to change notification settings - Fork 197
/
Copy pathremove_header_mapper.py
52 lines (41 loc) · 1.79 KB
/
remove_header_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------
import regex as re
from ..base_op import OPERATORS, Mapper
@OPERATORS.register_module('remove_header_mapper')
class RemoveHeaderMapper(Mapper):
    """Mapper to remove headers at the beginning of documents in Latex
    samples.

    Everything that precedes the first LaTeX sectioning command
    (``\\chapter``, ``\\part``, ``\\section``, ``\\subsection``,
    ``\\subsubsection``, ``\\paragraph`` or ``\\subparagraph``) is removed,
    so the resulting text starts at that command.
    """

    _batched_op = True

    def __init__(self, drop_no_head: bool = True, *args, **kwargs):
        """
        Initialization method.

        :param drop_no_head: whether to drop sample texts without
            headers. A dropped text is replaced by an empty string.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

        section_cmds = (
            'chapter',
            'part',
            'section',
            'subsection',
            'subsubsection',
            'paragraph',
            'subparagraph',
        )
        # One alternative per sectioning command: a backslash, the command
        # name as a whole word, an optional '*', an optional '[short title]'
        # and the mandatory '{title}'.
        alternatives = [
            r'\\\b' + cmd + r'\b\*?(?:\[(.*?)\])?\{(.*?)\}'
            for cmd in section_cmds
        ]
        # Group 1 lazily captures the preamble before the first header,
        # group 2 captures the header command itself. Joining with '|'
        # guarantees every command is its own alternative (no two commands
        # can be accidentally fused into a single one).
        self.pattern = r'^(.*?)(' + '|'.join(alternatives) + r')'
        self.drop_no_head = drop_no_head

    def process_batched(self, samples):
        """
        Strip the leading non-header part from every text in the batch.

        :param samples: batch of samples; the list stored under
            ``self.text_key`` is updated in place (``text_key`` is not
            set in this class -- presumably provided by ``Mapper``).
        :return: the same ``samples`` object with processed texts.
        """
        texts = samples[self.text_key]
        for idx, text in enumerate(texts):
            # The pattern is anchored with '^' and MULTILINE is not set, so
            # it can only match at position 0; a single search is enough to
            # both detect the header and locate where it starts.
            match = re.search(self.pattern, text, flags=re.DOTALL)
            if match is None:
                if self.drop_no_head:
                    # Write the empty text back into the batch; rebinding
                    # the loop variable alone would leave the sample as is.
                    texts[idx] = ''
                continue
            # Keep everything from the first header command onwards.
            texts[idx] = text[match.start(2):]
        return samples