Merge pull request #227 from openzim/metadata_and_beartype

Protecting our API with beartype
openzim · Dec 17, 2024 · f68d568
2 parents a0a225b + 6f93ffe
commit f68d568
Show file tree

Hide file tree

Showing 35 changed files with 194 additions and 185 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -254,7 +254,7 @@ ban-relative-imports = "all"
 # _libkiwix mimics libkiwix C++ code, names obey C++ conventions
 "src/zimscraperlib/zim/_libkiwix.py" = ["N802", "N803", "N806"]
 # beartype must be first
-"src/zimscraperlib/zim/__init__.py" = ["E402"]
+"src/zimscraperlib/__init__.py" = ["E402"]
 
 [tool.pytest.ini_options]
 minversion = "7.3"
@@ -278,6 +278,7 @@ exclude_lines = [
   "no cov",
   "if __name__ == .__main__.:",
   "if TYPE_CHECKING:",
+  "class .*Protocol.*",
 ]
 
 [tool.pyright]

diff --git a/src/zimscraperlib/__init__.py b/src/zimscraperlib/__init__.py
@@ -4,6 +4,10 @@
 import logging as stdlogging
 import os
 
+from beartype.claw import beartype_this_package
+
+beartype_this_package()
+
 from zimscraperlib.constants import NAME
 from zimscraperlib.logging import getLogger
 

diff --git a/src/zimscraperlib/download.py b/src/zimscraperlib/download.py
@@ -6,7 +6,7 @@
 import pathlib
 import subprocess
 from concurrent.futures import Future, ThreadPoolExecutor
-from typing import IO, ClassVar
+from typing import ClassVar
 
 import requests
 import requests.adapters
@@ -16,6 +16,7 @@
 
 from zimscraperlib import logger
 from zimscraperlib.constants import DEFAULT_WEB_REQUESTS_TIMEOUT
+from zimscraperlib.typing import SupportsSeekableWrite, SupportsWrite
 
 
 class YoutubeDownloader:
@@ -59,11 +60,10 @@ def download(
         future = self.executor.submit(self._run_youtube_dl, url, options or {})
         if not wait:
             return future
-        if not future.exception():
-            # return the result
-            return future.result()  # pyright: ignore
-        # raise the exception
-        raise future.exception()  # pyright: ignore
+        exc = future.exception()
+        if isinstance(exc, BaseException):
+            raise exc
+        return True
 
 
 class YoutubeConfig(dict):
@@ -176,7 +176,7 @@ def get_session(max_retries: int | None = 5) -> requests.Session:
 def stream_file(
     url: str,
     fpath: pathlib.Path | None = None,
-    byte_stream: IO[bytes] | None = None,
+    byte_stream: SupportsWrite[bytes] | SupportsSeekableWrite[bytes] | None = None,
     block_size: int | None = 1024,
     proxies: dict[str, str] | None = None,
     max_retries: int | None = 5,
@@ -216,24 +216,25 @@ def stream_file(
 
     total_downloaded = 0
     if fpath is not None:
-        fp = open(fpath, "wb")
-    elif (
-        byte_stream is not None
-    ):  # pragma: no branch (we use a precise condition to help type checker)
-        fp = byte_stream
+        fpath_handler = open(fpath, "wb")
+    else:
+        fpath_handler = None
 
     for data in resp.iter_content(block_size):
         total_downloaded += len(data)
-        fp.write(data)
+        if fpath_handler:
+            fpath_handler.write(data)
+        if byte_stream:
+            byte_stream.write(data)
 
         # stop downloading/reading if we're just testing first block
         if only_first_block:
             break
 
     logger.debug(f"Downloaded {total_downloaded} bytes from {url}")
 
-    if fpath:
-        fp.close()
-    else:
-        fp.seek(0)
+    if fpath_handler:
+        fpath_handler.close()
+    elif isinstance(byte_stream, SupportsSeekableWrite) and byte_stream.seekable():
+        byte_stream.seek(0)
     return total_downloaded, resp.headers
diff --git a/src/zimscraperlib/i18n.py b/src/zimscraperlib/i18n.py
@@ -59,12 +59,12 @@ def iso_types(self) -> list[str]:
         return self["iso_types"]
 
     @property
-    def query(self) -> list[str]:
+    def query(self) -> str:
         """Query issued for these language details"""
         return self["query"]
 
     @property
-    def querytype(self) -> list[str]:
+    def querytype(self) -> str:
         """Type of query issued to retrieve language details"""
         return self["querytype"]
 

diff --git a/src/zimscraperlib/image/conversion.py b/src/zimscraperlib/image/conversion.py
@@ -5,7 +5,6 @@
 
 import io
 import pathlib
-from typing import IO
 
 import cairosvg.svg
 from PIL.Image import open as pilopen
@@ -17,9 +16,9 @@
 
 
 def convert_image(
-    src: pathlib.Path | IO[bytes],
-    dst: pathlib.Path | IO[bytes],
-    **params: str,
+    src: pathlib.Path | io.BytesIO,
+    dst: pathlib.Path | io.BytesIO,
+    **params: str | None,
 ) -> None:
     """convert an image file from one format to another
     params: Image.save() parameters. Depends on dest format.
@@ -31,7 +30,9 @@ def convert_image(
      to RGB. ex: RGB, ARGB, CMYK (and other PIL colorspaces)"""
 
     colorspace = params.get("colorspace")  # requested colorspace
-    fmt = params.pop("fmt").upper() if "fmt" in params else None  # requested format
+    fmt = (
+        str(params.pop("fmt")).upper() if params.get("fmt") else None
+    )  # requested format
     if not fmt:
         fmt = format_for(dst)
     if not fmt:
@@ -44,7 +45,7 @@ def convert_image(
 
 def convert_svg2png(
     src: str | pathlib.Path | io.BytesIO,
-    dst: pathlib.Path | IO[bytes],
+    dst: pathlib.Path | io.BytesIO,
     width: int | None = None,
     height: int | None = None,
 ):

diff --git a/src/zimscraperlib/image/optimization.py b/src/zimscraperlib/image/optimization.py
@@ -210,14 +210,14 @@ def optimize_webp(
     else:
         try:
             save_image(webp_image, dst, fmt="WEBP", **params)
-        except Exception as exc:
+        except Exception as exc:  # pragma: no cover
             if (
                 isinstance(src, pathlib.Path)
                 and isinstance(dst, pathlib.Path)
                 and src.resolve() != dst.resolve()
                 and dst.exists()
             ):
-                dst.unlink()  # pragma: no cover
+                dst.unlink()
             raise exc
     return dst
 

diff --git a/src/zimscraperlib/image/probing.py b/src/zimscraperlib/image/probing.py
@@ -7,7 +7,6 @@
 import io
 import pathlib
 import re
-from typing import IO
 
 import colorthief
 import PIL.Image
@@ -55,7 +54,7 @@ def is_hex_color(text: str) -> bool:
 
 
 def format_for(
-    src: pathlib.Path | IO[bytes],
+    src: pathlib.Path | io.BytesIO,
     *,
     from_suffix: bool = True,
 ) -> str | None:
@@ -95,7 +94,7 @@ def format_for(
 
 
 def is_valid_image(
-    image: pathlib.Path | IO[bytes] | bytes,
+    image: pathlib.Path | bytes | io.BytesIO,
     imformat: str,
     size: tuple[int, int] | None = None,
 ) -> bool:

diff --git a/src/zimscraperlib/image/utils.py b/src/zimscraperlib/image/utils.py
@@ -2,18 +2,19 @@
 # vim: ai ts=4 sts=4 et sw=4 nu
 from __future__ import annotations
 
+import io
 import pathlib
-from typing import IO
+from typing import IO, Any
 
 from PIL.Image import Image
 from PIL.ImageFile import ImageFile
 
 
 def save_image(
     src: Image | ImageFile,
-    dst: pathlib.Path | IO[bytes],
+    dst: pathlib.Path | IO[bytes] | io.BytesIO,
     fmt: str,
-    **params: str,
+    **params: Any,
 ) -> None:
     """PIL.Image.save() wrapper setting default parameters"""
     args = {"JPEG": {"quality": 100}, "PNG": {}}.get(fmt, {})

diff --git a/src/zimscraperlib/logging.py b/src/zimscraperlib/logging.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import io
 import logging
 import pathlib
 import sys
@@ -22,7 +23,7 @@
 def getLogger(  # noqa: N802 (intentionally matches the stdlib getLogger name)
     name: str,
     level: int = logging.INFO,
-    console: TextIO | None = sys.stdout,
+    console: TextIO | io.StringIO | None = sys.stdout,
     log_format: str | None = DEFAULT_FORMAT,
     file: pathlib.Path | None = None,
     file_level: int | None = None,

diff --git a/src/zimscraperlib/rewriting/css.py b/src/zimscraperlib/rewriting/css.py
@@ -186,29 +186,27 @@ def _process_node(self, node: ast.Node):
             )
         elif isinstance(node, ast.FunctionBlock):
             if node.lower_name == "url":  # pyright: ignore[reportUnknownMemberType]
-                url_node: ast.Node = node.arguments[0]  # pyright: ignore
+                url_node: ast.Node = node.arguments[0]
                 new_url = self.url_rewriter(
-                    url_node.value,  # pyright: ignore
+                    getattr(url_node, "value", ""),
                     self.base_href,
                 ).rewriten_url
-                url_node.value = str(new_url)  # pyright: ignore
-                url_node.representation = (  # pyright: ignore
-                    f'"{serialize_url(str(new_url))}"'
+                setattr(url_node, "value", str(new_url))  # noqa: B010
+                setattr(  # noqa: B010
+                    url_node, "representation", f'"{serialize_url(str(new_url))}"'
                 )
 
             else:
                 self._process_list(
-                    node.arguments,  # pyright: ignore
+                    getattr(node, "arguments", []),
                 )
         elif isinstance(node, ast.AtRule):
-            self._process_list(node.prelude)  # pyright: ignore
-            self._process_list(node.content)  # pyright: ignore
+            self._process_list(node.prelude)
+            self._process_list(node.content)
         elif isinstance(node, ast.Declaration):
-            self._process_list(node.value)  # pyright: ignore
+            self._process_list(node.value)
         elif isinstance(node, ast.URLToken):
-            new_url = self.url_rewriter(
-                node.value, self.base_href
-            ).rewriten_url  # pyright: ignore
+            new_url = self.url_rewriter(node.value, self.base_href).rewriten_url
             node.value = new_url
             node.representation = f"url({serialize_url(new_url)})"
 

diff --git a/src/zimscraperlib/rewriting/html.py b/src/zimscraperlib/rewriting/html.py
@@ -560,7 +560,7 @@ def rewrite_meta_charset_content(
         return
     if attr_name == "charset":
         return (attr_name, "UTF-8")
-    if attr_name == "content" and any(
+    if attr_name == "content" and any(  # pragma: no coverage (coverage bug)
         attr_name.lower() == "http-equiv"
         and attr_value
         and attr_value.lower() == "content-type"
@@ -574,7 +574,9 @@ def rewrite_onxxx_tags(
     attr_name: str, attr_value: str | None, js_rewriter: JsRewriter
 ) -> AttrNameAndValue | None:
     """Rewrite onxxx script attributes"""
-    if attr_value and attr_name.startswith("on") and not attr_name.startswith("on-"):
+    if (
+        attr_value and attr_name.startswith("on") and not attr_name.startswith("on-")
+    ):  # pragma: no coverage (coverage bug)
         return (attr_name, js_rewriter.rewrite(attr_value))
 
 
@@ -583,7 +585,7 @@ def rewrite_style_tags(
     attr_name: str, attr_value: str | None, css_rewriter: CssRewriter
 ) -> AttrNameAndValue | None:
     """Rewrite style attributes"""
-    if attr_value and attr_name == "style":
+    if attr_value and attr_name == "style":  # pragma: no coverage (coverage bug)
         return (attr_name, css_rewriter.rewrite_inline(attr_value))
 
 

diff --git a/src/zimscraperlib/rewriting/url_rewriting.py b/src/zimscraperlib/rewriting/url_rewriting.py
@@ -41,8 +41,9 @@
 from __future__ import annotations
 
 import re
+from dataclasses import dataclass
 from pathlib import PurePosixPath
-from typing import ClassVar, NamedTuple
+from typing import ClassVar
 from urllib.parse import quote, unquote, urljoin, urlsplit
 
 import idna
@@ -51,7 +52,8 @@
 from zimscraperlib.rewriting.rules import FUZZY_RULES
 
 
-class AdditionalRule(NamedTuple):
+@dataclass
+class AdditionalRule:
     match: re.Pattern[str]
     replace: str
 
@@ -147,7 +149,8 @@ def check_validity(cls, value: str) -> None:
             raise ValueError(f"Unexpected password in value: {value} {parts.password}")
 
 
-class RewriteResult(NamedTuple):
+@dataclass
+class RewriteResult:
     absolute_url: str
     rewriten_url: str
     zim_path: ZimPath | None
@@ -382,9 +385,6 @@ def normalize(cls, url: HttpUrl) -> ZimPath:
         passed to python-libzim for UTF-8 encoding.
         """
 
-        if not isinstance(url, HttpUrl):
-            raise ValueError("Bad argument type passed, HttpUrl expected")
-
         url_parts = urlsplit(url.value)
 
         if not url_parts.hostname:

diff --git a/src/zimscraperlib/types.py b/src/zimscraperlib/types.py
@@ -47,13 +47,10 @@ def get_mime_for_name(
     MIME only guessed from file extension and not actual content.
 
     Filename with no extension are mapped to `no_ext_to`"""
-    try:
-        filename = pathlib.Path(filename)
-        if not filename.suffix:
-            return no_ext_to
-        return mimetypes.guess_type(f"{filename.stem}{filename.suffix}")[0] or fallback
-    except Exception:
-        return fallback
+    filename = pathlib.Path(filename)
+    if not filename.suffix:
+        return no_ext_to
+    return mimetypes.guess_type(f"{filename.stem}{filename.suffix}")[0] or fallback
 
 
 def init_types():