Skip to content

Commit

Permalink
refactor to ignore speaker url if it is -
Browse files Browse the repository at this point in the history
  • Loading branch information
elfkuzco committed Oct 15, 2024
1 parent 17ad543 commit d5ca336
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 62 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Restore functionality to resist temporary bad TED responses when parsing video pages (#209)
- Retry video data extraction if `videoData` is missing from page data (#226)
- Skip download of speaker image if URL is invalid (#224)
- Skip download of speaker image if URL is "-" (#224)

## [3.0.2] - 2024-06-24

Expand Down
16 changes: 6 additions & 10 deletions src/ted2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,7 @@
get_logger,
)
from ted2zim.processing import post_process_video
from ted2zim.utils import (
WebVTT,
get_main_title,
is_valid_uri,
request_url,
update_subtitles_list,
)
from ted2zim.utils import WebVTT, get_main_title, request_url, update_subtitles_list

logger = get_logger()

Expand Down Expand Up @@ -1039,12 +1033,14 @@ def download_speaker_image(
)
if not downloaded_from_cache:
try:
# download an image of the speaker
# Before downloading a speaker image, check if the URL exists.
# Sometimes, the URL from TED is "-" which is invalid.
# TODO: Implement a robust URL matcher if there are more invalid URLs

Check notice on line 1038 in src/ted2zim/scraper.py

View check run for this annotation

codefactor.io / CodeFactor

src/ted2zim/scraper.py#L1038

Unresolved comment '# TODO: Implement a robust URL matcher if there are more invalid URLs'. (C100)
if not video_speaker:
logger.debug("Speaker doesn't have an image")
elif not is_valid_uri(video_speaker):
elif video_speaker == "-":
logger.error(

Check warning on line 1042 in src/ted2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

src/ted2zim/scraper.py#L1042

Added line #L1042 was not covered by tests
f"Invalid speaker image URI {video_speaker!r} for "
f"Invalid speaker image URL {video_speaker} for "
f"{video_title}"
)
else:
Expand Down
26 changes: 0 additions & 26 deletions src/ted2zim/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import contextlib
import json
import pathlib
import re
import tempfile
import time
from http import HTTPStatus
from urllib.parse import urlsplit

import requests

Expand Down Expand Up @@ -171,27 +169,3 @@ def get_for(lang: str):
return title

return missing


# Upper bound on the total hostname length accepted by is_valid_uri.
_MAX_HOSTNAME_LENGTH = 255

# Compiled once at import time: a hostname is a dot-separated sequence of
# labels, each 1-63 characters drawn from letters, digits and hyphens, where
# no label starts or ends with a hyphen.
_HOSTNAME_RE = re.compile(
    r"""
    ^(
        (?!-)               # a subdomain label must not begin with a hyphen
        [A-Z\d-]{1,63}
        (?<!-)              # ... and must not end with a hyphen
        \.
    )*                      # zero or more subdomain labels
    (
        (?!-)               # same rules for the final (main domain) label
        [A-Z\d-]{1,63}
        (?<!-)
    )$
    """,
    re.IGNORECASE | re.VERBOSE,
)


def is_valid_uri(url: str) -> bool:
    """Return whether *url* contains a syntactically valid hostname.

    Adapted from:
    https://github.com/boto/botocore/blob/develop/botocore/utils.py#L1276

    The URL is split with ``urllib.parse.urlsplit`` and only its hostname is
    inspected; scheme, port, path and query are not validated. A value with
    no host component (for example ``"-"`` or the empty string) is invalid.

    Args:
        url: the URL to check.

    Returns:
        ``True`` if the hostname is present, at most 255 characters long and
        made of well-formed labels; ``False`` otherwise, including when the
        URL cannot be parsed at all.
    """
    try:
        hostname = urlsplit(url).hostname
    except ValueError:
        # urlsplit raises on malformed authorities (e.g. an unbalanced "["
        # in the host). A validity check must report that, not crash on it.
        return False

    if hostname is None or len(hostname) > _MAX_HOSTNAME_LENGTH:
        return False

    # A single trailing dot (fully-qualified form) is allowed; drop it before
    # matching so the pattern only has to describe the labels themselves.
    if hostname[-1] == ".":
        hostname = hostname[:-1]

    return bool(_HOSTNAME_RE.match(hostname))
25 changes: 0 additions & 25 deletions tests/test_utils.py

This file was deleted.

0 comments on commit d5ca336

Please sign in to comment.