From d5ca336d95204472cde613739de3886ee4a66627 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Tue, 15 Oct 2024 12:49:10 +0100 Subject: [PATCH] refactor to ignore speaker url if it is - --- CHANGELOG.md | 2 +- src/ted2zim/scraper.py | 16 ++++++---------- src/ted2zim/utils.py | 26 -------------------------- tests/test_utils.py | 25 ------------------------- 4 files changed, 7 insertions(+), 62 deletions(-) delete mode 100644 tests/test_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index ca985af..ef2cfbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Restore functionality to resist temporary bad TED responses when parsing video pages (#209) - Retry video data extraction if `videoData` is missing from page data (#226) -- Skip download of speaker image if URL is invalid (#224) +- Skip download of speaker image if URL is "-" (#224) ## [3.0.2] - 2024-06-24 diff --git a/src/ted2zim/scraper.py b/src/ted2zim/scraper.py index 92ca0eb..9e501d4 100644 --- a/src/ted2zim/scraper.py +++ b/src/ted2zim/scraper.py @@ -44,13 +44,7 @@ get_logger, ) from ted2zim.processing import post_process_video -from ted2zim.utils import ( - WebVTT, - get_main_title, - is_valid_uri, - request_url, - update_subtitles_list, -) +from ted2zim.utils import WebVTT, get_main_title, request_url, update_subtitles_list logger = get_logger() @@ -1039,12 +1033,14 @@ def download_speaker_image( ) if not downloaded_from_cache: try: - # download an image of the speaker + # Before downloading a speaker image, check if the URL exists. + # Sometimes, the URL from TED is "-" which is invalid. + # TODO: Implement a robust URL matcher if there are more invalid URLs if not video_speaker: logger.debug("Speaker doesn't have an image") - elif not is_valid_uri(video_speaker): + elif video_speaker == "-": logger.error( - f"Invalid speaker image URI {video_speaker!r} for " + f"Invalid speaker image URL {video_speaker} for " f"{video_title}" ) else: diff --git a/src/ted2zim/utils.py b/src/ted2zim/utils.py index bc24ba6..09b5f30 100644 --- a/src/ted2zim/utils.py +++ b/src/ted2zim/utils.py @@ -1,11 +1,9 @@ import contextlib import json import pathlib -import re import tempfile import time from http import HTTPStatus -from urllib.parse import urlsplit import requests @@ -171,27 +169,3 @@ def get_for(lang: str): return title return missing - - -def is_valid_uri(url: str) -> bool: - """check if uri is a valid uri. - - Adapted from: https://github.com/boto/botocore/blob/develop/botocore/utils.py#L1276 - """ - url_parts = urlsplit(url) - hostname = url_parts.hostname - max_hostname_length = 255 - if hostname is None or len(hostname) > max_hostname_length: - return False - - if hostname[-1] == ".": - hostname = hostname[:-1] - - allowed = re.compile( - r"""^((?!-) # ensure hostname does not begin with hyphen - [A-Z\d-]{1,63}(?