Skip to content

Commit

Permalink
Merge pull request #375 from openzim/fix_perf_html_cache
Browse files Browse the repository at this point in the history
Cache HTML rewriting function signature
  • Loading branch information
benoit74 authored Aug 9, 2024
2 parents 6cad55b + 4c586a1 commit e80b30f
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Remove subsequent slashes in URLs, both in Python and JS (#365)
- Ignore non HTTP(S) WARC records (#351)
- Fix `vimeo_cdn_fix` fuzzy rule for proper operation in Javascript (#348)
- Performance issue linked to new "extensible" HTML rewriting rules (#370)

## [2.0.3] - 2024-07-24

Expand Down
24 changes: 17 additions & 7 deletions src/warc2zim/content_rewriting/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
from collections import namedtuple
from collections.abc import Callable
from dataclasses import dataclass
from functools import cache
from html import escape
from html.parser import HTMLParser
from inspect import signature
from inspect import Signature, signature

from bs4 import BeautifulSoup

Expand Down Expand Up @@ -85,6 +86,15 @@ def extract_base_href(content: str) -> str | None:
return None


@cache
def _cached_signature(func: Callable) -> Signature:
    """Compute the signature of a callable, memoizing the result

    The signature is computed only once per callable; later calls with the same
    callable get the stored result back, which saves the introspection cost when
    the same function is inspected many times.
    """
    func_signature = signature(func)
    return func_signature


class HtmlRewriter(HTMLParser):
def __init__(
self,
Expand Down Expand Up @@ -259,8 +269,8 @@ def _check_decorated_func_signature(expected_func: Callable, decorated_func: Cal
It checks that decorated function parameters have known names and proper types
"""
expected_params = signature(expected_func).parameters
func_params = signature(decorated_func).parameters
expected_params = _cached_signature(expected_func).parameters
func_params = _cached_signature(decorated_func).parameters
for name, param in func_params.items():
if name not in expected_params:
raise TypeError(
Expand Down Expand Up @@ -356,7 +366,7 @@ def _do_drop_attribute(
"attr_value": attr_value,
"attrs": attrs,
}.items()
if arg_name in signature(rule.func).parameters
if arg_name in _cached_signature(rule.func).parameters
}
)
is True
Expand Down Expand Up @@ -399,7 +409,7 @@ def _do_attribute_rewrite(
"base_href": base_href,
"notify_js_module": notify_js_module,
}.items()
if arg_name in signature(rule.func).parameters
if arg_name in _cached_signature(rule.func).parameters
}
)
) is not None:
Expand Down Expand Up @@ -429,7 +439,7 @@ def _do_tag_rewrite(
"attrs": attrs,
"auto_close": auto_close,
}.items()
if arg_name in signature(rule.func).parameters
if arg_name in _cached_signature(rule.func).parameters
}
)
) is not None:
Expand Down Expand Up @@ -460,7 +470,7 @@ def _do_data_rewrite(
"js_rewriter": js_rewriter,
"url_rewriter": url_rewriter,
}.items()
if arg_name in signature(rule.func).parameters
if arg_name in _cached_signature(rule.func).parameters
}
)
) is not None:
Expand Down

0 comments on commit e80b30f

Please sign in to comment.