From 4c586a113ca9fa8332a63e4c23bd58faa4592a34 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Wed, 7 Aug 2024 12:28:07 +0000
Subject: [PATCH] Cache HTML rewriting function signature

---
 CHANGELOG.md                           |  1 +
 src/warc2zim/content_rewriting/html.py | 24 +++++++++++++++++-------
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0bf619e..49de2ce 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Remove subsequent slashes in URLs, both in Python and JS (#365)
 - Ignore non HTTP(S) WARC records (#351)
 - Fix `vimeo_cdn_fix` fuzzy rule for proper operation in Javascript (#348)
+- Performance issue linked to new "extensible" HTML rewriting rules (#370)
 
 ## [2.0.3] - 2024-07-24
 
diff --git a/src/warc2zim/content_rewriting/html.py b/src/warc2zim/content_rewriting/html.py
index abb9dd4..c7232b8 100644
--- a/src/warc2zim/content_rewriting/html.py
+++ b/src/warc2zim/content_rewriting/html.py
@@ -3,9 +3,10 @@
 from collections import namedtuple
 from collections.abc import Callable
 from dataclasses import dataclass
+from functools import cache
 from html import escape
 from html.parser import HTMLParser
-from inspect import signature
+from inspect import Signature, signature
 
 from bs4 import BeautifulSoup
 
@@ -85,6 +86,15 @@ def extract_base_href(content: str) -> str | None:
     return None
 
 
+@cache
+def _cached_signature(func: Callable) -> Signature:
+    """Returns the signature of a given callable
+
+    Result is cached to save performance when reused multiple times
+    """
+    return signature(func)
+
+
 class HtmlRewriter(HTMLParser):
     def __init__(
         self,
@@ -259,8 +269,8 @@ def _check_decorated_func_signature(expected_func: Callable, decorated_func: Cal
 
     It checks that decorated function parameters have known names and proper types
     """
-    expected_params = signature(expected_func).parameters
-    func_params = signature(decorated_func).parameters
+    expected_params = _cached_signature(expected_func).parameters
+    func_params = _cached_signature(decorated_func).parameters
     for name, param in func_params.items():
         if name not in expected_params:
             raise TypeError(
@@ -356,7 +366,7 @@ def _do_drop_attribute(
                         "attr_value": attr_value,
                         "attrs": attrs,
                     }.items()
-                    if arg_name in signature(rule.func).parameters
+                    if arg_name in _cached_signature(rule.func).parameters
                 }
             )
             is True
@@ -399,7 +409,7 @@ def _do_attribute_rewrite(
                             "base_href": base_href,
                             "notify_js_module": notify_js_module,
                         }.items()
-                        if arg_name in signature(rule.func).parameters
+                        if arg_name in _cached_signature(rule.func).parameters
                     }
                 )
             ) is not None:
@@ -429,7 +439,7 @@ def _do_tag_rewrite(
                             "attrs": attrs,
                             "auto_close": auto_close,
                         }.items()
-                        if arg_name in signature(rule.func).parameters
+                        if arg_name in _cached_signature(rule.func).parameters
                     }
                 )
             ) is not None:
@@ -460,7 +470,7 @@ def _do_data_rewrite(
                             "js_rewriter": js_rewriter,
                             "url_rewriter": url_rewriter,
                         }.items()
-                        if arg_name in signature(rule.func).parameters
+                        if arg_name in _cached_signature(rule.func).parameters
                     }
                 )
             ) is not None: