Merge pull request #230 from elfkuzco/compute-zim-languages

use language threshold to compute zim language metadata
openzim · Oct 25, 2024 · e4d9b0e · e4d9b0e
2 parents 4acb963 + 66c6fe4
commit e4d9b0e
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Change log level of "Video at {url} has not yet been translated into {requested_lang_code}" messages from warning to debug (way too verbose)
 - Disable preloading of subtitles in video.js
+- Add `--language-threshold` CLI argument so that `compute_zim_languages` only considers languages that appear in at least the specified fraction (between 0 and 1) of videos (#212)
 
 ### Fixed
 

diff --git a/src/ted2zim/entrypoint.py b/src/ted2zim/entrypoint.py
@@ -175,6 +175,13 @@ def main():
         default=False,
     )
 
+    parser.add_argument(
+        "--language-threshold",
+        help="Consider languages present in at least percentage of videos",
+        default=0.5,
+        type=float,
+    )
+
     args = parser.parse_args()
     set_debug(args.debug)
     logger = get_logger()
@@ -201,6 +208,9 @@ def main():
         if not args.threads >= 1:
             parser.error("--threads must be provided a positive integer")
 
+        if not 0 < args.language_threshold <= 1:
+            parser.error("--language-threshold must be between 0 and 1.")
+
         scraper = Ted2Zim(**dict(args._get_kwargs()))
         scraper.run()
     except Exception as exc:

diff --git a/src/ted2zim/scraper.py b/src/ted2zim/scraper.py
@@ -78,6 +78,7 @@ def __init__(
         tmp_dir,
         threads,
         disable_metadata_checks,
+        language_threshold,
     ):
         # video-encoding info
         self.video_format = video_format
@@ -98,6 +99,7 @@ def __init__(
         self.publisher = publisher
         self.name = name
         self.disable_metadata_checks = disable_metadata_checks
+        self.language_threshold = language_threshold
 
         if not self.disable_metadata_checks:
             # Validate ZIM metadata early so that we do not waste time doing operations
@@ -347,27 +349,32 @@ def compute_zim_languages(self):
         audio_lang_counts = {
             lang: len(list(group))
             for lang, group in groupby(
-                [video["native_talk_language"] for video in self.videos]
+                sorted(video["native_talk_language"] for video in self.videos)
             )
         }
 
         # count the number of videos per subtitle language
         subtitle_lang_counts = {
             lang: len(list(group))
             for lang, group in groupby(
-                [
+                sorted(
                     subtitle["languageCode"]
                     for video in self.videos
                     for subtitle in video["subtitles"]
-                ]
+                )
             )
         }
 
         # Attribute 10 "points" score to language in video audio and 1 "point" score
-        # to language in video subtitle
+        # to language in video subtitle if language is present in at least
+        # "threshold" percentage of videos.
         scored_languages = {
             k: 10 * audio_lang_counts.get(k, 0) + subtitle_lang_counts.get(k, 0)
             for k in list(audio_lang_counts.keys()) + list(subtitle_lang_counts.keys())
+            if self.is_language_above_threshold(
+                max(audio_lang_counts.get(k, 0), subtitle_lang_counts.get(k, 0)),
+                len(self.videos),
+            )
         }
 
         sorted_ted_languages = [
@@ -396,6 +403,15 @@ def compute_zim_languages(self):
             # Validate ZIM languages
             validate_language("Language", self.zim_languages)
 
+    def is_language_above_threshold(self, language_count: int, nb_videos: int) -> bool:
+        """Check if a language appears in at least the threshold fraction of videos.
+
+        Args:
+            language_count: number of videos in which the language appears.
+            nb_videos: total number of videos considered.
+
+        Returns:
+            True when language_count / nb_videos is greater than or equal to
+            self.language_threshold (within a small tolerance); False otherwise,
+            including when nb_videos is not positive.
+        """
+        if nb_videos <= 0:
+            # without videos no language can reach the threshold; returning early
+            # also avoids a ZeroDivisionError in the division below
+            return False
+        # tolerance so that a fraction equal to the threshold up to float rounding
+        # is still accepted
+        epsilon = 1e-5
+        appearance_fraction = language_count / nb_videos
+        return (
+            appearance_fraction >= self.language_threshold
+            or abs(appearance_fraction - self.language_threshold) <= epsilon
+        )
+
     def get_subtitle_dict(self, lang):
         """dict of language name and code from a larger dict lang