-
Notifications
You must be signed in to change notification settings - Fork 197
/
Copy pathclean_html_mapper.py
42 lines (30 loc) · 1.19 KB
/
clean_html_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------
from data_juicer.utils.lazy_loader import LazyLoader
from ..base_op import OPERATORS, Mapper
# selectolax is obtained through LazyLoader instead of a plain import
# (presumably so the package is only imported on first use -- confirm
# against LazyLoader's implementation).
selectolax = LazyLoader('selectolax', 'selectolax')
# Name passed to OPERATORS.register_module for the operator defined below.
OP_NAME = 'clean_html_mapper'
@OPERATORS.register_module(OP_NAME)
class CleanHtmlMapper(Mapper):
    """Mapper that removes html markup from text samples, keeping the
    text content."""

    # NOTE(review): presumably tells the base class to drive this operator
    # through ``process_batched`` -- confirm against ``Mapper``.
    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra positional args, forwarded to the base class
        :param kwargs: extra keyword args, forwarded to the base class
        """
        super().__init__(*args, **kwargs)

    def process_batched(self, samples):
        """
        Replace every text under ``self.text_key`` with its html-free form.

        :param samples: batch of samples; ``samples[self.text_key]`` is
            iterated as a sequence of html strings
        :return: the same ``samples`` object, with the text column
            replaced by a new list of cleaned strings
        """
        # Literal tag rewrites performed, in this order, before parsing:
        # '<li>' and '<ol>' openers become a newline followed by '*',
        # their closing tags are dropped.
        tag_rewrites = (
            ('<li>', '\n*'),
            ('</li>', ''),
            ('<ol>', '\n*'),
            ('</ol>', ''),
        )

        def _strip_markup(html):
            """Apply the tag rewrites, then return the parser's text."""
            for tag, substitute in tag_rewrites:
                html = html.replace(tag, substitute)
            return selectolax.parser.HTMLParser(html).text()

        samples[self.text_key] = list(
            map(_strip_markup, samples[self.text_key]))
        return samples