Merge pull request #46 from weblyzard/fix/html-comment-ofuscation

Fix/html comment obfuscation
weblyzard · Aug 2, 2022 · d7e2afa · d7e2afa
2 parents 6fa9516 + 2254390
commit d7e2afa
Show file tree

Hide file tree

Showing 12 changed files with 50 additions and 60 deletions.
diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py
@@ -60,7 +60,8 @@
 """
 
 import re
-import lxml.html
+from lxml.html import fromstring, HtmlElement
+from lxml.etree import ParserError
 
 from typing import Dict, Optional, Any
 
@@ -70,7 +71,7 @@
 RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>')
 
 
-def _get_html_tree(html_content: str) -> Optional[lxml.html.HtmlElement]:
+def _get_html_tree(html_content: str) -> Optional[HtmlElement]:
     """Obtain the HTML parse tree for the given HTML content.
 
     Args:
@@ -87,7 +88,10 @@ def _get_html_tree(html_content: str) -> Optional[lxml.html.HtmlElement]:
     if html_content.startswith('<?xml '):
         html_content = RE_STRIP_XML_DECLARATION.sub('', html_content, count=1)
 
-    return lxml.html.fromstring(html_content)
+    try:
+        return fromstring(html_content)
+    except ParserError:
+        return fromstring('<pre>' + html_content + '</pre>')
 
 
 def get_text(html_content: str, config: ParserConfig = None) -> str:

diff --git a/src/inscriptis/css_profiles.py b/src/inscriptis/css_profiles.py
@@ -70,7 +70,6 @@
                            whitespace=WhiteSpace.pre),
     'plaintext': HtmlElement(display=Display.block,
                              whitespace=WhiteSpace.pre),
-
 }
 
 RELAXED_CSS_PROFILE = STRICT_CSS_PROFILE.copy()

diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
@@ -4,6 +4,7 @@
 from typing import List
 
 import lxml.html
+from lxml.etree import Comment
 
 from inscriptis.annotation import Annotation
 from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT
@@ -86,25 +87,25 @@ def _parse_html_tree(self, tree):
         Args:
             tree: the HTML tree to parse.
         """
-        # ignore comments
-        if not isinstance(tree.tag, str):
-            return
+        if isinstance(tree.tag, str):
+            self.handle_starttag(tree.tag, tree.attrib)
+            cur = self.tags[-1]
+            cur.canvas.open_tag(cur)
 
-        self.handle_starttag(tree.tag, tree.attrib)
-        cur = self.tags[-1]
-        cur.canvas.open_tag(cur)
+            self.tags[-1].write(tree.text)
 
-        self.tags[-1].write(tree.text)
+            for node in tree:
+                self._parse_html_tree(node)
 
-        for node in tree:
-            self._parse_html_tree(node)
+            self.handle_endtag(tree.tag)
+            prev = self.tags.pop()
+            prev.canvas.close_tag(prev)
 
-        self.handle_endtag(tree.tag)
-        prev = self.tags.pop()
-        prev.canvas.close_tag(prev)
+            # write the tail text to the element's container
+            self.tags[-1].write(tree.tail)
 
-        # write the tail text to the element's container
-        self.tags[-1].write_tail(tree.tail)
+        elif tree.tag is Comment and tree.tail:
+            self.tags[-1].canvas.write(self.tags[-1], tree.tail)
 
     def get_text(self) -> str:
         """Return the text extracted from the HTML page."""

diff --git a/src/inscriptis/metadata.py b/src/inscriptis/metadata.py
@@ -2,6 +2,6 @@
 
 __author__ = 'Albert Weichselbraun, Fabian Odoni'
 __author_email__ = 'albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch'
-__copyright__ = '2016-2021 Albert Weichselbraun, Fabian Odoni'
+__copyright__ = '2016-2022 Albert Weichselbraun, Fabian Odoni'
 __license__ = 'Apache 2.0'
-__version__ = '2.2.0'
+__version__ = '2.3.0'
diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py
@@ -71,20 +71,9 @@ def write(self, text: str):
         """Write the given HTML text to the element's canvas."""
         if not text or self.display == Display.none:
             return
-
         self.canvas.write(self, ''.join(
             (self.prefix, text, self.suffix)))
 
-    def write_tail(self, text: str):
-        """Write the given tail text the the element's canvas.
-
-        Args:
-            text: the text to write
-        """
-        if not text or self.display == Display.none:
-            return
-        self.write(text)
-
     def set_canvas(self, canvas) -> 'HtmlElement':
         self.canvas = canvas
         return self

diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py
@@ -115,11 +115,10 @@ def get_annotations(self, idx: int, row_width: int) -> List[Annotation]:
 
         # the easy case - the cell has only one line :)
         if len(self.blocks) == 1:
-            annotations = horizontal_shift(self.annotations,
-                                           self.line_width[0],
-                                           self.width, self.align, idx)
             self.line_width[0] = self.width
-            return annotations
+            return horizontal_shift(self.annotations,
+                                    self.line_width[0],
+                                    self.width, self.align, idx)
 
         # the more challenging one - multiple cell lines
         line_break_pos = list(accumulate(self.line_width))

diff --git a/tests/html/html-comment-ofuscation.html b/tests/html/html-comment-ofuscation.html
@@ -0,0 +1 @@
+<html><body><span class="price-detailed__unit-price"><span>$<!--o-->90.<!--o-->74</span></span></body></html>
diff --git a/tests/html/html-comment-ofuscation.txt b/tests/html/html-comment-ofuscation.txt
@@ -0,0 +1 @@
+$90.74
diff --git a/tests/html/table-pre.html b/tests/html/table-pre.html
@@ -18,7 +18,7 @@ <h1>Pre elements that have been nested in a table.</h1>
 	<td>
 <pre>
 int b = 1;
-for (int a=0; a<10; a++) {
+for (int a=0; a&lt;10; a++) {
    System.out.println(a);
    b = b * a;
    System.out.println(b);

diff --git a/tests/test_empty_string.py b/tests/test_empty_string.py
@@ -9,8 +9,9 @@
 
 
 def test_empty_and_corrupt():
-    assert get_text("test").strip() == "test"
-    assert get_text("  ") == ""
-    assert get_text("") == ""
-    assert get_text("<<<").strip() == "<<"
+    assert get_text('test').strip() == 'test'
+    assert get_text('  ') == ''
+    assert get_text('') == ''
+    # test for the behaviour of older and recent lxml versions.
+    assert get_text('<<<').strip() in ('<<<', '<<', '')
 
diff --git a/tests/test_model_html_element_canvas.py b/tests/test_model_html_element_canvas.py
@@ -53,4 +53,4 @@ def test_formatting():
     # and prefixes + suffixes
     h.prefix = '>>'
     h.suffix = '<<'
-    assert  _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast'
+    assert _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast'
diff --git a/tox.ini b/tox.ini
@@ -1,8 +1,8 @@
 # standard unit tests
 [testenv:pytest]
-deps = pytest
-       pytest-coverage
-commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests
+deps = pytest ~= 7.1.2
+       pytest-cov ~= 3.0.0
+commands = py.test --cov-config=.coveragerc --cov=inscriptis ./tests
 
 # python packaging best practices
 [testenv:pyroma]
@@ -12,31 +12,27 @@ commands = pyroma .
 # checks compatible with flake 4
 [testenv:flake8-4]
 deps = flake8 ~= 4.0.1
-       flake8-blind-except ~= 0.2.0
-       flake8-bandit ~= 2.1.2
-       flake8-bugbear ~= 21.9.2
+       flake8-blind-except ~= 0.2.1
+       flake8-bandit ~= 3.0.0
+       flake8-bugbear ~= 22.7.1
        flake8-builtins ~= 1.5.3
        flake8-cognitive-complexity ~= 0.1.0
        flake8-colors ~= 0.1.9
-       flake8-comprehensions ~= 3.7.0
+       flake8-comprehensions ~= 3.10.0
        flake8-docstrings ~= 1.6.0
-       flake8-eradicate ~= 1.2.0
-       flake8-expression-complexity ~= 0.0.9
+       flake8-encodings ~= 0.5.0.post1
+       flake8-eradicate ~= 1.2.1
+       flake8-expression-complexity ~= 0.0.11
        flake8-string-format ~= 0.3.0
        flake8-tuple ~= 0.4.1
        flake8-logging-format ~= 0.6.0
        flake8-pytest ~= 1.3
        flake8-quotes ~= 3.3.1
        flake8-raise ~= 0.0.5
-       flake8-simplify ~= 0.14.2
-       pep8-naming ~= 0.12.1
+       flake8-simplify ~= 0.19.2
+       pep8-naming ~= 0.13.1
        flake8-mutable ~= 1.2.0
-commands = flake8
-
-# checks compatible with flake < 4.0.0
-[testenv:flake8-3]
-deps = flake8 < 4.0.0
-       flake8-use-pathlib ~= 0.2.0
+       flake8-use-pathlib ~= 0.2.1
 commands = flake8
 
 [flake8]
@@ -51,11 +47,10 @@ exclude = .tox
 # S410 - bind to all IPs is okay in the case of the Web service, since it is
 #        aimed for use with docker.
 # W503 - replaced with W504
-# E402 - required for importing inscriptis metadata in setup.py
 # D102 - missing docstring in public method
 # D105 - missing docstring in magic method (e.g., __str__)
 # D107 - missing docstring in __init__
-ignore = S104, S410, W503, E402, D107, D105, D102
+ignore = S104, S410, W503, D107, D105, D102
 show-source = true
 enable-extensions=G
 application-import-names = inscriptis
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		<html><body><span class="price-detailed__unit-price"><span>$<!--o-->90.<!--o-->74</span></span></body></html>