diff --git a/parsel/parser/__init__.py b/parsel/parser/__init__.py
new file mode 100644
index 00000000..f0bb4574
--- /dev/null
+++ b/parsel/parser/__init__.py
@@ -0,0 +1,47 @@
+from lxml import etree
+from lxml.etree import XMLParser as _UnsafeXMLParser
+from lxml.html import HTMLParser as _HTMLParser
+
+
+class _LXMLBaseParser(object):
+    """Wrap one lxml parser instance behind a ``parse()`` method."""
+
+    def __init__(self, parser_cls):
+        # Built once here and reused by every parse() call.
+        self._parser = parser_cls(recover=True, encoding='utf8')
+
+    def parse(self, text, base_url):
+        """Return the root element parsed from `text`.
+
+        Leading/trailing whitespace and NUL characters are removed before
+        the text is UTF-8 encoded. A minimal ``<html/>`` document is parsed
+        instead when the cleaned text is empty or lxml yields no root node.
+        """
+        body = text.strip().replace('\x00', '').encode('utf8') or b'<html/>'
+        root = etree.fromstring(body, parser=self._parser, base_url=base_url)
+        if root is None:
+            root = etree.fromstring(b'<html/>', parser=self._parser,
+                                    base_url=base_url)
+        return root
+
+
+class HTMLParser(_LXMLBaseParser):
+    """Parser backed by ``lxml.html.HTMLParser``."""
+
+    def __init__(self):
+        super(HTMLParser, self).__init__(_HTMLParser)
+
+
+class _XMLParser(_UnsafeXMLParser):
+    """``lxml.etree.XMLParser`` with ``resolve_entities`` off by default."""
+
+    def __init__(self, *args, **kwargs):
+        kwargs.setdefault('resolve_entities', False)
+        super(_XMLParser, self).__init__(*args, **kwargs)
+
+
+class XMLParser(_LXMLBaseParser):
+    """Parser backed by ``_XMLParser``."""
+
+    def __init__(self):
+        super(XMLParser, self).__init__(_XMLParser)
diff --git a/parsel/parser/html.py b/parsel/parser/html.py
new file mode 100644
index 00000000..305fd6d1
--- /dev/null
+++ b/parsel/parser/html.py
@@ -0,0 +1,3 @@
+from parsel.parser import HTMLParser
+
+HTML_PARSER = HTMLParser()
diff --git a/parsel/parser/xml.py b/parsel/parser/xml.py
new file mode 100644
index 00000000..f54bb55e
--- /dev/null
+++ b/parsel/parser/xml.py
@@ -0,0 +1,3 @@
+from parsel.parser import XMLParser
+
+XML_PARSER = XMLParser()
diff --git a/parsel/selector.py b/parsel/selector.py
index ac528534..6e824cd6 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -3,14 +3,44 @@
 """
 
 import sys
+from importlib import import_module
+from warnings import warn
 
 import six
-from lxml import etree, html
+from lxml import etree
 
 from .utils import flatten, iflatten, extract_regex, shorten
 from .csstranslator import HTMLTranslator, GenericTranslator
 
 
+def _load_object(path):
+    """Load an object given its absolute object path, and return it.
+
+    `path` can point to a class, function, variable or a class instance. For
+    example: ``'parsel.parser.html.HTML_PARSER'``.
+
+    Raise ValueError if `path` contains no dot, and NameError if the
+    imported module has no attribute with the requested name. Errors raised
+    while importing the module propagate unchanged.
+    """
+
+    try:
+        dot = path.rindex('.')
+    except ValueError:
+        raise ValueError("Error loading object '%s': not a full path" % path)
+
+    module, name = path[:dot], path[dot+1:]
+    mod = import_module(module)
+
+    try:
+        obj = getattr(mod, name)
+    except AttributeError:
+        raise NameError("Module '%s' doesn't define any object named '%s'"
+                        % (module, name))
+
+    return obj
+
+
 class CannotRemoveElementWithoutRoot(Exception):
     pass
 
@@ -21,14 +51,16 @@ class CannotRemoveElementWithoutParent(Exception):
 
 class SafeXMLParser(etree.XMLParser):
     def __init__(self, *args, **kwargs):
+        warn('parsel.selector.SafeXMLParser is deprecated',
+             DeprecationWarning, stacklevel=2)
         kwargs.setdefault('resolve_entities', False)
         super(SafeXMLParser, self).__init__(*args, **kwargs)
 
 _ctgroup = {
-    'html': {'_parser': html.HTMLParser,
+    'html': {'_parser': 'parsel.parser.html.HTML_PARSER',
              '_csstranslator': HTMLTranslator(),
              '_tostring_method': 'html'},
-    'xml': {'_parser': SafeXMLParser,
+    'xml': {'_parser': 'parsel.parser.xml.XML_PARSER',
             '_csstranslator': GenericTranslator(),
             '_tostring_method': 'xml'},
 }
@@ -46,6 +78,8 @@ def _st(st):
 def create_root_node(text, parser_cls, base_url=None):
     """Create root node for text using given parser class.
     """
+    warn('parsel.selector.create_root_node is deprecated',
+         DeprecationWarning, stacklevel=2)
     body = text.strip().replace('\x00', '').encode('utf8') or b'<html/>'
     parser = parser_cls(recover=True, encoding='utf8')
     root = etree.fromstring(body, parser=parser, base_url=base_url)
@@ -198,7 +232,7 @@ class Selector(object):
     def __init__(self, text=None, type=None, namespaces=None, root=None,
                  base_url=None, _expr=None):
         self.type = st = _st(type or self._default_type)
-        self._parser = _ctgroup[st]['_parser']
+        self._parser = _load_object(_ctgroup[st]['_parser'])
         self._csstranslator = _ctgroup[st]['_csstranslator']
         self._tostring_method = _ctgroup[st]['_tostring_method']
 
@@ -219,7 +253,7 @@ def __getstate__(self):
         raise TypeError("can't pickle Selector objects")
 
     def _get_root(self, text, base_url=None):
-        return create_root_node(text, self._parser, base_url=base_url)
+        return self._parser.parse(text=text, base_url=base_url)
 
     def xpath(self, query, namespaces=None, **kwargs):
         """
diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py
new file mode 100644
index 00000000..74736c2f
--- /dev/null
+++ b/tests/test_deprecations.py
@@ -0,0 +1,26 @@
+# -*- coding:utf-8 -*-
+
+
+from unittest import TestCase
+from warnings import catch_warnings, simplefilter
+
+from parsel.selector import create_root_node, SafeXMLParser
+from lxml.html import HTMLParser
+
+
+class TestDeprecations(TestCase):
+    """Check that the deprecated selector helpers emit one warning each."""
+
+    def test_create_root_node(self):
+        with catch_warnings(record=True) as caught:
+            # DeprecationWarning may be filtered out or de-duplicated by the
+            # active filters; 'always' guarantees it is recorded.
+            simplefilter('always')
+            create_root_node(u'…', HTMLParser)
+            self.assertEqual(len(caught), 1)
+
+    def test_SafeXMLParser(self):
+        with catch_warnings(record=True) as caught:
+            simplefilter('always')
+            SafeXMLParser()
+            self.assertEqual(len(caught), 1)
diff --git a/tests/test_selector.py b/tests/test_selector.py
index 376b0f71..077c0991 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -7,6 +7,7 @@
 
 from parsel import Selector
 from parsel.selector import (
+    _load_object,
     CannotRemoveElementWithoutRoot,
     CannotRemoveElementWithoutParent,
 )
@@ -913,3 +914,26 @@ def test_set(self):
                                //div[@itemtype="http://schema.org/Event"]
                                     //*[@itemscope]/*/@itemprop)''').extract(),
                          [u'url', u'name', u'startDate', u'location', u'offers'])
+
+
+try:
+    ModuleNotFoundError
+except NameError:
+    # Older interpreters lack ModuleNotFoundError; ImportError is its base.
+    ModuleNotFoundError = ImportError
+
+
+class LoadObjectTestCase(unittest.TestCase):
+    """Error paths of ``_load_object``."""
+
+    def test_incomplete_path(self):
+        with self.assertRaises(ValueError):
+            _load_object('parsel')
+
+    def test_inexistent_module(self):
+        with self.assertRaises(ModuleNotFoundError):
+            _load_object('parsel.inexistent.inexistent')
+
+    def test_inexistent_object(self):
+        with self.assertRaises(NameError):
+            _load_object('parsel.parser.inexistent')