Skip to content

Commit

Permalink
Merge pull request #221 from openzim/safe_metadata_revamp
Browse files Browse the repository at this point in the history
Significantly enhance the safety of metadata manipulation
  • Loading branch information
benoit74 authored Dec 17, 2024
2 parents 76a6408 + 7e2efa1 commit a0a225b
Show file tree
Hide file tree
Showing 20 changed files with 1,671 additions and 664 deletions.
18 changes: 18 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Renamed `filesystem.validate_zimfile_creatable` to `filesystem.file_creatable` to reflect general applicability to check file creation beyond ZIM files #200
- Remove any "ZIM" reference in exceptions while working with files #200
- Significantly enhance the safety of metadata manipulation (#205)
- add types for all metadata, one type per metadata name plus some generic ones for non-standard metadata
- all types are responsible to validate metadata value at initialization time
- validation checks for adherence to the ZIM specification and conventions are automated
- cleanup of unwanted control characters and stripping white characters are **automated in all text metadata**
- whenever possible, try to **automatically clean a "reasonably" bad metadata** (e.g. automaticall accept and remove duplicate tags - harmless - but not duplicate language codes - codes are supposed to be ordered, so it is a weird situation) ; this is an alignment of paradigm, because for some metadata the lib was permissive, while for other it was quite restrictive ; this PR tries to align this and **make the lib as permissive as possible**, avoiding to fail a scraper for something which could be automatically fixed
- it is now possible to disable ZIM conventions checks with `zim.metadata.check_metadata_conventions`
- simplify `zim.creator.Creator.config_metadata` by using these types and been more strict:
- add new `StandardMetadata` class for standard metadata, including list of mandatory one
- by default, all non-standard metadata must start with `X-` prefix
- this not yet an openZIM convention / specification, so it is possible to disable this check with `fail_on_missing_prefix` argument
- simplify `add_metadata`, use same metadata types
- simplify `zim.creator.Creator.start` with new types, and drop all metadata from memory after being passed to the libzim
- drop `zim.creator.convert_and_check_metadata` (not usefull anymore, simply use proper metadata type)
- move `MANDATORY_ZIM_METADATA_KEYS` and `DEFAULT_DEV_ZIM_METADATA` from `constants` to `zim.metadata` to avoid circular dependencies
- new `inputs.unique_values` utility function to compute the list of uniques values from a given list, but preserving initial list order
- in `__init__` of `zim.creator.Creator`, rename `disable_metadata_checks` to `check_metadata_conventions` for clarity and brevity
- beware that this manipulate the global `zim.metadata.check_metadata_conventions`, so if you have many creator running in parallel, they can't have different settings, last one initialized will "win"

### Added

Expand Down
15 changes: 9 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"regex>=2020.7.14",
"pymupdf>=1.24.0,<2.0",
"CairoSVG>=2.2.0,<3.0",
"beartype==0.19.0",
# youtube-dl should be updated as frequently as possible
"yt-dlp"
]
Expand All @@ -52,19 +53,19 @@ scripts = [
]
lint = [
"black==24.10.0",
"ruff==0.7.0",
"ruff==0.8.2",
]
check = [
"pyright==1.1.385",
"pytest==8.3.3",
"pyright==1.1.390",
"pytest==8.3.4",
]
test = [
"pytest==8.3.3",
"pytest==8.3.4",
"pytest-mock==3.14.0",
"coverage==7.5.3",
"coverage==7.6.9",
]
dev = [
"ipython==8.25.0",
"ipython==8.30.0",
"pre-commit==4.0.1",
"zimscraperlib[scripts]",
"zimscraperlib[lint]",
Expand Down Expand Up @@ -252,6 +253,8 @@ ban-relative-imports = "all"
"tests/**/*" = ["PLR2004", "S101", "TID252"]
# _libkiwix mimics libkiwix C++ code, names obey C++ conventions
"src/zimscraperlib/zim/_libkiwix.py" = ["N802", "N803", "N806"]
# beartype must be first
"src/zimscraperlib/zim/__init__.py" = ["E402"]

[tool.pytest.ini_options]
minversion = "7.3"
Expand Down
29 changes: 0 additions & 29 deletions src/zimscraperlib/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python3
# vim: ai ts=4 sts=4 et sw=4 nu

import base64
import pathlib
import re

Expand All @@ -22,34 +21,6 @@
# list of mimetypes we consider articles using it should default to FRONT_ARTICLE
FRONT_ARTICLE_MIMETYPES = ["text/html"]

# list of mandatory meta tags of the zim file.
MANDATORY_ZIM_METADATA_KEYS = [
"Name",
"Title",
"Creator",
"Publisher",
"Date",
"Description",
"Language",
"Illustration_48x48@1",
]

DEFAULT_DEV_ZIM_METADATA = {
"Name": "Test Name",
"Title": "Test Title",
"Creator": "Test Creator",
"Publisher": "Test Publisher",
"Date": "2023-01-01",
"Description": "Test Description",
"Language": "fra",
# blank 48x48 transparent PNG
"Illustration_48x48_at_1": base64.b64decode(
"iVBORw0KGgoAAAANSUhEUgAAADAAAAAwAQMAAABtzGvEAAAAGXRFWHRTb2Z0d2FyZQBB"
"ZG9iZSBJbWFnZVJlYWR5ccllPAAAAANQTFRFR3BMgvrS0gAAAAF0Uk5TAEDm2GYAAAAN"
"SURBVBjTY2AYBdQEAAFQAAGn4toWAAAAAElFTkSuQmCC"
),
}

RECOMMENDED_MAX_TITLE_LENGTH = 30
MAXIMUM_DESCRIPTION_METADATA_LENGTH = 80
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH = 4000
Expand Down
14 changes: 2 additions & 12 deletions src/zimscraperlib/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

import os
import pathlib
from collections.abc import Callable
from typing import Any

import magic

Expand Down Expand Up @@ -44,15 +42,7 @@ def get_content_mimetype(content: bytes | str) -> str:
return MIME_OVERRIDES.get(detected_mime, detected_mime)


def delete_callback(
fpath: str | pathlib.Path,
callback: Callable | None = None,
*callback_args: Any,
):
"""helper deleting passed filepath, optionnaly calling an additional callback"""
def delete_callback(fpath: str | pathlib.Path):
"""helper deleting passed filepath"""

os.unlink(fpath)

# call the callback if requested
if callback and callable(callback):
callback.__call__(*callback_args)
2 changes: 1 addition & 1 deletion src/zimscraperlib/image/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def convert_image(
if not fmt:
raise ValueError("Impossible to guess destination image format")
with pilopen(src) as image:
if image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED or colorspace:
if (image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED) or colorspace:
image = image.convert(colorspace or "RGB") # noqa: PLW2901
save_image(image, dst, fmt, **params)

Expand Down
5 changes: 5 additions & 0 deletions src/zimscraperlib/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,8 @@ def compute_tags(
return {
tag.strip() for tag in list(default_tags) + (user_tags or "").split(";") if tag
}


def unique_values(items: list) -> list:
"""Return unique values in input list while preserving list order"""
return list(dict.fromkeys(items))
26 changes: 26 additions & 0 deletions src/zimscraperlib/typing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from __future__ import annotations

from collections.abc import Callable
from typing import Any, NamedTuple


class Callback(NamedTuple):
func: Callable
args: tuple[Any, ...] | None = None
kwargs: dict[str, Any] | None = None

@property
def callable(self) -> bool:
return callable(self.func)

def get_args(self) -> tuple[Any, ...]:
return self.args or ()

def get_kwargs(self) -> dict[str, Any]:
return self.kwargs or {}

def call_with(self, *args, **kwargs):
self.func.__call__(*args, **kwargs)

def call(self):
self.call_with(*self.get_args(), **self.get_kwargs())
13 changes: 8 additions & 5 deletions src/zimscraperlib/zim/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@
zim.items: item to add to creator
zim.archive: read ZIM files, accessing or searching its content"""

from beartype.claw import beartype_this_package
from libzim.writer import Blob # pyright: ignore

beartype_this_package()

from zimscraperlib.zim.archive import Archive
from zimscraperlib.zim.creator import Creator
from zimscraperlib.zim.filesystem import make_zim_file
Expand All @@ -24,14 +27,14 @@

__all__ = [
"Archive",
"Blob",
"Creator",
"make_zim_file",
"FileLikeProvider",
"FileProvider",
"Item",
"StaticItem",
"URLItem",
"FileProvider",
"StringProvider",
"FileLikeProvider",
"URLItem",
"URLProvider",
"Blob",
"make_zim_file",
]
13 changes: 8 additions & 5 deletions src/zimscraperlib/zim/_libkiwix.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@
from __future__ import annotations

import io
from collections import namedtuple
from typing import NamedTuple

MimetypeAndCounter = namedtuple("MimetypeAndCounter", ["mimetype", "value"])
CounterMap = dict[
type(MimetypeAndCounter.mimetype), type(MimetypeAndCounter.value) # pyright: ignore
]

class MimetypeAndCounter(NamedTuple):
mimetype: str
value: int


type CounterMap = dict[str, int]


def getline(src: io.StringIO, delim: str | None = None) -> tuple[bool, str]:
Expand Down
4 changes: 2 additions & 2 deletions src/zimscraperlib/zim/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import libzim.search # Query, Searcher # pyright: ignore
import libzim.suggestion # SuggestionSearcher # pyright: ignore

from zimscraperlib.zim._libkiwix import convertTags, parseMimetypeCounter
from zimscraperlib.zim._libkiwix import CounterMap, convertTags, parseMimetypeCounter


class Archive(libzim.reader.Archive):
Expand Down Expand Up @@ -101,7 +101,7 @@ def get_search_results_count(self, query: str) -> int:
return search.getEstimatedMatches()

@property
def counters(self) -> dict[str, int]:
def counters(self) -> CounterMap:
try:
return parseMimetypeCounter(self.get_text_metadata("Counter"))
except RuntimeError: # pragma: no cover (no ZIM avail to test itl)
Expand Down
Loading

0 comments on commit a0a225b

Please sign in to comment.