Skip to content

Commit

Permalink
Merge pull request #46 from weblyzard/fix/html-comment-ofuscation
Browse files Browse the repository at this point in the history
Fix/html comment ofuscation
  • Loading branch information
AlbertWeichselbraun authored Aug 2, 2022
2 parents 6fa9516 + 2254390 commit d7e2afa
Show file tree
Hide file tree
Showing 12 changed files with 50 additions and 60 deletions.
10 changes: 7 additions & 3 deletions src/inscriptis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@
"""

import re
import lxml.html
from lxml.html import fromstring, HtmlElement
from lxml.etree import ParserError

from typing import Dict, Optional, Any

Expand All @@ -70,7 +71,7 @@
RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>')


def _get_html_tree(html_content: str) -> Optional[lxml.html.HtmlElement]:
def _get_html_tree(html_content: str) -> Optional[HtmlElement]:
"""Obtain the HTML parse tree for the given HTML content.
Args:
Expand All @@ -87,7 +88,10 @@ def _get_html_tree(html_content: str) -> Optional[lxml.html.HtmlElement]:
if html_content.startswith('<?xml '):
html_content = RE_STRIP_XML_DECLARATION.sub('', html_content, count=1)

return lxml.html.fromstring(html_content)
try:
return fromstring(html_content)
except ParserError:
return fromstring('<pre>' + html_content + '</pre>')


def get_text(html_content: str, config: ParserConfig = None) -> str:
Expand Down
1 change: 0 additions & 1 deletion src/inscriptis/css_profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@
whitespace=WhiteSpace.pre),
'plaintext': HtmlElement(display=Display.block,
whitespace=WhiteSpace.pre),

}

RELAXED_CSS_PROFILE = STRICT_CSS_PROFILE.copy()
Expand Down
29 changes: 15 additions & 14 deletions src/inscriptis/html_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import List

import lxml.html
from lxml.etree import Comment

from inscriptis.annotation import Annotation
from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT
Expand Down Expand Up @@ -86,25 +87,25 @@ def _parse_html_tree(self, tree):
Args:
tree: the HTML tree to parse.
"""
# ignore comments
if not isinstance(tree.tag, str):
return
if isinstance(tree.tag, str):
self.handle_starttag(tree.tag, tree.attrib)
cur = self.tags[-1]
cur.canvas.open_tag(cur)

self.handle_starttag(tree.tag, tree.attrib)
cur = self.tags[-1]
cur.canvas.open_tag(cur)
self.tags[-1].write(tree.text)

self.tags[-1].write(tree.text)
for node in tree:
self._parse_html_tree(node)

for node in tree:
self._parse_html_tree(node)
self.handle_endtag(tree.tag)
prev = self.tags.pop()
prev.canvas.close_tag(prev)

self.handle_endtag(tree.tag)
prev = self.tags.pop()
prev.canvas.close_tag(prev)
# write the tail text to the element's container
self.tags[-1].write(tree.tail)

# write the tail text to the element's container
self.tags[-1].write_tail(tree.tail)
elif tree.tag is Comment and tree.tail:
self.tags[-1].canvas.write(self.tags[-1], tree.tail)

def get_text(self) -> str:
"""Return the text extracted from the HTML page."""
Expand Down
4 changes: 2 additions & 2 deletions src/inscriptis/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

__author__ = 'Albert Weichselbraun, Fabian Odoni'
__author_email__ = 'albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch'
__copyright__ = '2016-2021 Albert Weichselbraun, Fabian Odoni'
__copyright__ = '2016-2022 Albert Weichselbraun, Fabian Odoni'
__license__ = 'Apache 2.0'
__version__ = '2.2.0'
__version__ = '2.3.0'
11 changes: 0 additions & 11 deletions src/inscriptis/model/html_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,20 +71,9 @@ def write(self, text: str):
"""Write the given HTML text to the element's canvas."""
if not text or self.display == Display.none:
return

self.canvas.write(self, ''.join(
(self.prefix, text, self.suffix)))

def write_tail(self, text: str):
"""Write the given tail text to the element's canvas.
Args:
text: the text to write
"""
if not text or self.display == Display.none:
return
self.write(text)

def set_canvas(self, canvas) -> 'HtmlElement':
self.canvas = canvas
return self
Expand Down
7 changes: 3 additions & 4 deletions src/inscriptis/model/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,10 @@ def get_annotations(self, idx: int, row_width: int) -> List[Annotation]:

# the easy case - the cell has only one line :)
if len(self.blocks) == 1:
annotations = horizontal_shift(self.annotations,
self.line_width[0],
self.width, self.align, idx)
self.line_width[0] = self.width
return annotations
return horizontal_shift(self.annotations,
self.line_width[0],
self.width, self.align, idx)

# the more challenging one - multiple cell lines
line_break_pos = list(accumulate(self.line_width))
Expand Down
1 change: 1 addition & 0 deletions tests/html/html-comment-ofuscation.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><body><span class="price-detailed__unit-price"><span>$<!--o-->90.<!--o-->74</span></span></body></html>
1 change: 1 addition & 0 deletions tests/html/html-comment-ofuscation.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
$90.74
2 changes: 1 addition & 1 deletion tests/html/table-pre.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ <h1>Pre elements that have been nested in a table.</h1>
<td>
<pre>
int b = 1;
for (int a=0; a<10; a++) {
for (int a=0; a&lt;10; a++) {
System.out.println(a);
b = b * a;
System.out.println(b);
Expand Down
9 changes: 5 additions & 4 deletions tests/test_empty_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@


def test_empty_and_corrupt():
assert get_text("test").strip() == "test"
assert get_text(" ") == ""
assert get_text("") == ""
assert get_text("<<<").strip() == "<<"
assert get_text('test').strip() == 'test'
assert get_text(' ') == ''
assert get_text('') == ''
# test for the behaviour of older and recent lxml versions.
assert get_text('<<<').strip() in ('<<<', '<<', '')

2 changes: 1 addition & 1 deletion tests/test_model_html_element_canvas.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ def test_formatting():
# and prefixes + suffixes
h.prefix = '>>'
h.suffix = '<<'
assert _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast'
assert _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast'
33 changes: 14 additions & 19 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# standard unit tests
[testenv:pytest]
deps = pytest
pytest-coverage
commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests
deps = pytest ~= 7.1.2
pytest-cov ~= 3.0.0
commands = py.test --cov-config=.coveragerc --cov=inscriptis ./tests

# python packaging best practices
[testenv:pyroma]
Expand All @@ -12,31 +12,27 @@ commands = pyroma .
# checks compatible with flake 4
[testenv:flake8-4]
deps = flake8 ~= 4.0.1
flake8-blind-except ~= 0.2.0
flake8-bandit ~= 2.1.2
flake8-bugbear ~= 21.9.2
flake8-blind-except ~= 0.2.1
flake8-bandit ~= 3.0.0
flake8-bugbear ~= 22.7.1
flake8-builtins ~= 1.5.3
flake8-cognitive-complexity ~= 0.1.0
flake8-colors ~= 0.1.9
flake8-comprehensions ~= 3.7.0
flake8-comprehensions ~= 3.10.0
flake8-docstrings ~= 1.6.0
flake8-eradicate ~= 1.2.0
flake8-expression-complexity ~= 0.0.9
flake8-encodings ~= 0.5.0.post1
flake8-eradicate ~= 1.2.1
flake8-expression-complexity ~= 0.0.11
flake8-string-format ~= 0.3.0
flake8-tuple ~= 0.4.1
flake8-logging-format ~= 0.6.0
flake8-pytest ~= 1.3
flake8-quotes ~= 3.3.1
flake8-raise ~= 0.0.5
flake8-simplify ~= 0.14.2
pep8-naming ~= 0.12.1
flake8-simplify ~= 0.19.2
pep8-naming ~= 0.13.1
flake8-mutable ~= 1.2.0
commands = flake8

# checks compatible with flake8 < 4.0.0
[testenv:flake8-3]
deps = flake8 < 4.0.0
flake8-use-pathlib ~= 0.2.0
flake8-use-pathlib ~= 0.2.1
commands = flake8

[flake8]
Expand All @@ -51,11 +47,10 @@ exclude = .tox
# S410 - bind to all IPs is okay in the case of the Web service, since it is
# aimed for use with docker.
# W503 - replaced with W504
# E402 - required for importing inscriptis metadata in setup.py
# D102 - missing docstring in public method
# D105 - missing docstring in magic method (e.g., __str__)
# D107 - missing docstring in __init__
ignore = S104, S410, W503, E402, D107, D105, D102
ignore = S104, S410, W503, D107, D105, D102
show-source = true
enable-extensions=G
application-import-names = inscriptis
Expand Down

0 comments on commit d7e2afa

Please sign in to comment.