-
Notifications
You must be signed in to change notification settings - Fork 197
/
Copy pathremove_comments_mapper.py
61 lines (47 loc) · 1.82 KB
/
remove_comments_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------
from typing import List, Union
import regex as re
from ..base_op import OPERATORS, Mapper
@OPERATORS.register_module('remove_comments_mapper')
class RemoveCommentsMapper(Mapper):
    """
    Mapper to remove comments in different kinds of documents.

    Only support 'tex' for now: a comment starts at an unescaped ``%`` and
    runs to the end of the line.
    """

    _batched_op = True

    # Inline comment: some content on the line, then an unescaped '%'
    # followed by at least one character. The content is captured and put
    # back, so nothing in front of the '%' is lost. A backslash together
    # with the character after it counts as content, which keeps '\%'
    # (escaped percent) intact while still treating the '%' in '\\%'
    # (line break followed by a comment) as a comment start.
    # A '%' that is the last character of its line is left alone.
    _INLINE_COMMENT = re.compile(r'^((?:[^%\\\n]|\\.)+)%.+$',
                                 flags=re.MULTILINE)

    # Full-line comment: the line starts with '%'. The trailing newline is
    # removed as well so that no empty line is left behind.
    _FULL_LINE_COMMENT = re.compile(r'^%.*\n?', flags=re.MULTILINE)

    def __init__(self,
                 doc_type: Union[str, List[str]] = 'tex',
                 inline: bool = True,
                 multiline: bool = True,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param doc_type: Type of document to remove comments. Stored on
            the instance; the processing below does not branch on it yet.
        :param inline: Whether to remove inline comments, i.e. comments
            that follow other content on the same line.
        :param multiline: Whether to remove multiline comments, i.e.
            lines that start with the comment character.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.doc_type = doc_type
        self.inline = inline
        self.multiline = multiline

    def _remove_comments(self, text):
        """
        Strip the enabled kinds of comments from a single text.

        :param text: document text to clean.
        :return: the text with the selected comments removed.
        """
        if self.inline:
            # Re-emit the captured content in front of the comment.
            text = self._INLINE_COMMENT.sub(r'\g<1>', text)
        if self.multiline:
            text = self._FULL_LINE_COMMENT.sub('', text)
        return text

    def process_batched(self, samples):
        """
        Remove comments from every text of a batch, in place.

        :param samples: batch indexed by field name; the entry under
            ``self.text_key`` is iterated and updated item by item.
            NOTE(review): ``text_key`` is not set in this class, it is
            presumably provided by the ``Mapper`` base class.
        :return: the same ``samples`` object with the texts replaced.
        """
        # TODO: remove different comments by sample type
        for idx, text in enumerate(samples[self.text_key]):
            samples[self.text_key][idx] = self._remove_comments(text)
        return samples