Skip to content

Commit

Permalink
Merge pull request #84 from weblyzard/fix/bug-81-custom-html-handling2
Browse files Browse the repository at this point in the history
Fix/bug 81 custom html handling2
  • Loading branch information
AlbertWeichselbraun authored Mar 5, 2024
2 parents fc9ee5c + 504863d commit 667b356
Show file tree
Hide file tree
Showing 40 changed files with 652 additions and 261 deletions.
24 changes: 17 additions & 7 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ HTML to annotated text conversion
---------------------------------
Convert and annotate HTML from a Web page using the provided annotation rules.

Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation/annotation-profile.json>`_ and save it to your working directory::

$ inscript https://www.fhgr.ch -r annotation-profile.json

Expand Down Expand Up @@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
specified with the ``-p`` or ``--postprocessor`` command line argument::

$ inscript https://www.fhgr.ch \
-r ./examples/annotation-profile.json \
-r ./examples/annotation/annotation-profile.json \
-p surface


Expand Down Expand Up @@ -474,7 +474,8 @@ be used within a program:
.. code-block:: python
import urllib.request
from inscriptis import get_annotated_text, ParserConfig
from inscriptis import get_annotated_text
from inscriptis.model.config import ParserConfig
url = "https://www.fhgr.ch"
html = urllib.request.urlopen(url).read().decode('utf-8')
Expand Down Expand Up @@ -533,15 +534,24 @@ If the fine-tuning options discussed above are not sufficient, you may even over
.. code-block:: python
inscriptis = Inscriptis(html, config)
from inscriptis import ParserConfig
from inscriptis.html_engine import Inscriptis
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
inscriptis.start_tag_handler_dict['a'] = my_handle_start_a
inscriptis.end_tag_handler_dict['a'] = my_handle_end_a
my_mapping = CustomHtmlTagHandlerMapping(
start_tag_mapping={'a': my_handle_start_a},
end_tag_mapping={'a': my_handle_end_a}
)
inscriptis = Inscriptis(html_tree,
ParserConfig(custom_html_tag_handler_mapping=my_mapping))
text = inscriptis.get_text()
In the example the standard HTML handlers for the ``a`` tag are overwritten with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``).
You may define custom handlers for any tag, regardless of whether it already exists in ``start_tag_handler_dict`` or ``end_tag_handler_dict``.
You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping.
Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.
The standard HTML tag handlers can be found in the `inscriptis.model.tag <https://github.com/weblyzard/inscriptis/blob/master/src/inscriptis/model/tag>`_ package.
Optimizing memory consumption
-----------------------------
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
43 changes: 43 additions & 0 deletions examples/custom-html-handling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python3

"""
Custom HTML tag handling example.
Add a custom HTML handler for the bold <b> tag which encloses
bold text with "**".
Example:
"Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".
"""
from typing import Dict

from inscriptis import ParserConfig
from inscriptis.html_engine import Inscriptis
from inscriptis.model.html_document_state import HtmlDocumentState
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
from lxml.html import fromstring


def my_handle_start_b(state: HtmlDocumentState, _: Dict) -> None:
    """Handle the opening <b> tag by emitting the bold marker.

    Writes ``**`` to the last entry of ``state.tags`` so that the text
    following the opening tag is preceded by the marker.

    Args:
        state: the document state; only ``state.tags[-1]`` is used.
        _: second handler argument, ignored by this handler.
    """
    current_tag = state.tags[-1]
    current_tag.write("**")


def my_handle_end_b(state: HtmlDocumentState) -> None:
    """Handle the closing </b> tag by emitting the bold marker.

    Writes ``**`` to the last entry of ``state.tags`` so that the bold text
    is followed by the marker.

    Args:
        state: the document state; only ``state.tags[-1]`` is used.
    """
    current_tag = state.tags[-1]
    current_tag.write("**")


# Bundle the custom <b> handlers into a mapping that is passed to the parser
# configuration below.
MY_MAPPING = CustomHtmlTagHandlerMapping(
start_tag_mapping={"b": my_handle_start_b},
end_tag_mapping={"b": my_handle_end_b},
)


# Sample input; per the module docstring it renders as "Welcome to **Chur**".
HTML = "Welcome to <b>Chur</b>"

# Parse the HTML string into an lxml tree, which is the input given to Inscriptis.
html_tree = fromstring(HTML)
# Build the converter with a ParserConfig that carries the custom mapping.
inscriptis = Inscriptis(
html_tree, ParserConfig(custom_html_tag_handler_mapping=MY_MAPPING)
)
# Print the converted plain text.
print(inscriptis.get_text())
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "inscriptis"
version = "2.4.0.1"
version = "2.5.0"
authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
description = "inscriptis - HTML to text converter."
keywords = ["HTML", "converter", "text"]
Expand Down Expand Up @@ -59,5 +59,5 @@ line-length = 88
target-version = ["py38", "py39", "py310", "py311", "py312"]
extend-exclude = '\.html$|\.json$|\.txt$|/a$|/b$'
include = '''
^/src/|^/tests/|^/benchmarking/
^/src/|^/tests/|^/benchmarking/|^/examples/
'''
11 changes: 6 additions & 5 deletions src/inscriptis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,12 @@
"""

import re
from lxml.html import fromstring, HtmlElement
from lxml.etree import ParserError

from typing import Dict, Optional, Any

from inscriptis.model.config import ParserConfig

from lxml.etree import ParserError
from lxml.html import fromstring, HtmlElement

from inscriptis.html_engine import Inscriptis

RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>")
Expand Down Expand Up @@ -132,5 +132,6 @@ def get_annotated_text(
return {}

inscriptis = Inscriptis(html_tree, config)
text = inscriptis.get_text()
labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()]
return {"text": inscriptis.get_text(), "label": labels}
return {"text": text, "label": labels}
6 changes: 3 additions & 3 deletions src/inscriptis/annotation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""The model used for saving annotations."""

from typing import NamedTuple, Tuple
from typing import List
from typing import NamedTuple

from inscriptis.html_properties import HorizontalAlignment

Expand All @@ -25,8 +25,8 @@ class Annotation(NamedTuple):
"""the annotation's start index within the text output."""
end: int
"""the annotation's end index within the text output."""
metadata: Tuple[str]
"""a tuple of tags to be attached to the annotation."""
metadata: str
"""the tag to be attached to the annotation."""


def horizontal_shift(
Expand Down
3 changes: 2 additions & 1 deletion src/inscriptis/annotation/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"""
from collections import defaultdict
from copy import copy
from typing import Dict, Tuple, List

from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT

Expand Down Expand Up @@ -85,7 +86,7 @@ def __init__(self, css_profile, model: dict):
self.css = css_profile

@staticmethod
def _parse(model: dict) -> "AnnotationModel":
def _parse(model: dict) -> Tuple[Dict, List]:
"""Compute the AnnotationModel from a model dictionary.
Returns:
Expand Down
13 changes: 6 additions & 7 deletions src/inscriptis/cli/inscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
import argparse
import sys
from json import load, dumps
from typing import Optional
from pathlib import Path
from typing import Optional

import requests

from inscriptis import get_text, get_annotated_text
from inscriptis.metadata import __version__, __copyright__, __license__
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.metadata import __version__, __copyright__, __license__
from inscriptis.model.config import ParserConfig

DEFAULT_ENCODING = "utf8"
Expand Down Expand Up @@ -148,24 +148,23 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s
Args:
url: URL to the HTML content, or None if the content is obtained from stdin.
encoding: used encoding.
timeout: timeout in seconds for retrieving the URL.
Returns:
The html_content or None, if no content could be extracted.
"""
if not url:
return sys.stdin.read()
elif Path(url).is_file():
with Path(url).open(
encoding=encoding or DEFAULT_ENCODING, errors="ignore"
) as f:
elif (p := Path(url)).is_file():
with p.open(encoding=encoding or DEFAULT_ENCODING, errors="ignore") as f:
return f.read()
elif url.startswith("http://") or url.startswith("https://"):
req = requests.get(url, timeout=timeout)
return req.content.decode(encoding or req.encoding)


def cli():
def cli() -> None:
"""Run the inscript command line client."""
args = parse_command_line()
if not (html_content := get_html_content(args.input, args.timeout, args.encoding)):
Expand Down
2 changes: 1 addition & 1 deletion src/inscriptis/css_profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
preventing cases where two words stick together.
"""

from inscriptis.model.html_element import HtmlElement
from inscriptis.html_properties import Display, WhiteSpace
from inscriptis.model.html_element import HtmlElement

STRICT_CSS_PROFILE = {
"body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal),
Expand Down
Loading

0 comments on commit 667b356

Please sign in to comment.