Skip to content

Commit

Permalink
Merge pull request #230 from elfkuzco/compute-zim-languages
Browse files Browse the repository at this point in the history
use language threshold to compute zim language metadata
  • Loading branch information
benoit74 authored Oct 25, 2024
2 parents 4acb963 + 66c6fe4 commit e4d9b0e
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Change log level of "Video at {url} has not yet been translated into {requested_lang_code}" messages from warning to debug (way too verbose)
- Disable preloading of subtitles in video.js
- Add `--language-threshold` CLI argument so that `compute_zim_languages` only considers languages that appear in at least the specified fraction (between 0 and 1) of videos (#212)

### Fixed

Expand Down
10 changes: 10 additions & 0 deletions src/ted2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,13 @@ def main():
default=False,
)

parser.add_argument(
"--language-threshold",
help="Consider languages present in at least percentage of videos",
default=0.5,
type=float,
)

args = parser.parse_args()
set_debug(args.debug)
logger = get_logger()
Expand All @@ -201,6 +208,9 @@ def main():
if not args.threads >= 1:
parser.error("--threads must be provided a positive integer")

if not 0 < args.language_threshold <= 1:
parser.error("--language-threshold must be between 0 and 1.")

scraper = Ted2Zim(**dict(args._get_kwargs()))
scraper.run()
except Exception as exc:
Expand Down
24 changes: 20 additions & 4 deletions src/ted2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def __init__(
tmp_dir,
threads,
disable_metadata_checks,
language_threshold,
):
# video-encoding info
self.video_format = video_format
Expand All @@ -98,6 +99,7 @@ def __init__(
self.publisher = publisher
self.name = name
self.disable_metadata_checks = disable_metadata_checks
self.language_threshold = language_threshold

if not self.disable_metadata_checks:
# Validate ZIM metadata early so that we do not waste time doing operations
Expand Down Expand Up @@ -347,27 +349,32 @@ def compute_zim_languages(self):
audio_lang_counts = {
lang: len(list(group))
for lang, group in groupby(
[video["native_talk_language"] for video in self.videos]
sorted(video["native_talk_language"] for video in self.videos)
)
}

# count the number of videos per subtitle language
subtitle_lang_counts = {
lang: len(list(group))
for lang, group in groupby(
[
sorted(
subtitle["languageCode"]
for video in self.videos
for subtitle in video["subtitles"]
]
)
)
}

# Attribute 10 "points" score to language in video audio and 1 "point" score
# to language in video subtitle
# to language in video subtitle if language is present in at least
# "threshold" percentage of videos.
scored_languages = {
k: 10 * audio_lang_counts.get(k, 0) + subtitle_lang_counts.get(k, 0)
for k in list(audio_lang_counts.keys()) + list(subtitle_lang_counts.keys())
if self.is_language_above_threshold(
max(audio_lang_counts.get(k, 0), subtitle_lang_counts.get(k, 0)),
len(self.videos),
)
}

sorted_ted_languages = [
Expand Down Expand Up @@ -396,6 +403,15 @@ def compute_zim_languages(self):
# Validate ZIM languages
validate_language("Language", self.zim_languages)

def is_language_above_threshold(self, language_count: int, nb_videos: int) -> bool:
    """Check whether a language appears in enough videos to be considered.

    Args:
        language_count: number of videos in which the language appears.
        nb_videos: total number of videos.

    Returns:
        True when ``language_count / nb_videos`` is greater than or equal to
        ``self.language_threshold`` (a fraction, not a percentage), allowing a
        small tolerance for floating-point rounding. False otherwise,
        including when ``nb_videos`` is zero or negative.
    """
    if nb_videos <= 0:
        # With no videos the fraction is undefined; report "not above
        # threshold" instead of raising ZeroDivisionError.
        return False

    # Tolerance so that a fraction equal to the threshold up to float
    # rounding (e.g. 3 / 10 versus 0.1 + 0.2) still passes.
    epsilon = 1e-5
    appearance_fraction = language_count / nb_videos
    return (
        appearance_fraction >= self.language_threshold
        or abs(appearance_fraction - self.language_threshold) <= epsilon
    )

def get_subtitle_dict(self, lang):
"""dict of language name and code from a larger dict lang
Expand Down

0 comments on commit e4d9b0e

Please sign in to comment.