diff --git a/rss007d0675444aa13fc/ExtractContent.py b/rss007d0675444aa13fc/ExtractContent.py index c89ac56..52f370f 100644 --- a/rss007d0675444aa13fc/ExtractContent.py +++ b/rss007d0675444aa13fc/ExtractContent.py @@ -152,6 +152,3 @@ def is_within_max_age(_now_time, _date, _max_age): # for cont in content: # for element in cont["visible_text"]: # print(element) - - - diff --git a/rss007d0675444aa13fc/__init__.py b/rss007d0675444aa13fc/__init__.py index 96893f1..73ad065 100644 --- a/rss007d0675444aa13fc/__init__.py +++ b/rss007d0675444aa13fc/__init__.py @@ -1,3 +1,4 @@ +import asyncio from datetime import datetime, timezone, timedelta from typing import AsyncGenerator import logging @@ -256,10 +257,12 @@ async def request_random_content(_n_articles, _max_age, _json_data, _max_number_ """ rss_ids = parse_reference_json_data(_json_data) - articles = await find_random_articles_with_max_age(_n_articles, rss_ids, _max_age, _max_number_of_tries) + articles = await find_random_articles_with_max_age( + _n_articles, rss_ids, _max_age, _max_number_of_tries + ) dict = [] - + print("articles -> {}".format(articles)) for article in articles: dict.append((article.url, article.language[:2])) @@ -289,21 +292,27 @@ async def find_random_articles_with_max_age(_n_articles, _rss_id_list, _max_age, now_time = now_time.strftime("%Y-%m-%d %H:%M:%S") # format correctly cumulative_tries = 0 current_try_count = 0 - + print("n_articles is {}".format(_n_articles)) while len(articles) < _n_articles: - + print("looping") if current_try_count > _max_number_of_tries: # stop here return articles rss_id = random.choice(_rss_id_list) rss = RSS(rss_id) - await extract_latest_items(rss) + try: + await extract_latest_items(rss) + except: + print("Could not extract latest items") + continue for link in rss.link_array: + print(link.link) current_try_count += 1 cumulative_tries += 1 if cumulative_tries > 5: + print("break bc of cumulative_tries") cumulative_tries = 0 break # break out of this for loop and move on to the next one if is_within_max_age(now_time, link.publish_date, _max_age) and link.link not in appended_urls: @@ -312,6 +321,8 @@ async def find_random_articles_with_max_age(_n_articles, _rss_id_list, _max_age, articles.append(Article(rss_id.source, rss_id.description, rss_id.language, link.title, link.link, link.publish_date, link.description)) if len(articles) == _n_articles: break + else: + print("Not within max age, max age is {}".format(_max_age)) return articles @@ -438,6 +449,7 @@ async def query(parameters: dict) -> AsyncGenerator[Item, None]: data = await get_json_dict() except Exception as e: logging.info(f"[RSS newsfeed] Error when fetching the FeedSource.json: {e}") + print("has `data`") """ Article data is accessible following this structure: self.rss_source // the RSS feed name that we are collecting from @@ -454,7 +466,8 @@ async def query(parameters: dict) -> AsyncGenerator[Item, None]: except Exception as e: logging.exception(f"[RSS newsfeed] Error when requesting content: {e}") articles = [] - + print("Got `articles`") + print("len of articles is {}".format(len(articles))) for article in articles: try: logging.info(f"[RSS newsfeed] FOUND ARTICLE: ") @@ -485,3 +498,13 @@ async def query(parameters: dict) -> AsyncGenerator[Item, None]: yield new_item except Exception as e: logging.info(f"[RSS newsfeed] Error during article yield: {e}") + + +async def main(): + print("starting") + while True: + async for result in query({}): + print(result) + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/setup.py b/setup.py index bc1e43b..9b079c0 100644 --- a/setup.py +++ b/setup.py @@ -9,9 +9,10 @@ "aiohttp", "tldextract>=3.1.0", "feedparser>=6.0.8", - "newspaper3k>=0.2.8", + "newspaper4k>=0.9.3.1", "pytz>=2023.3", - "python_dateutil>=2.8.2" + "python_dateutil>=2.8.2", + "lxml_html_clean>=0.1.1" ], extras_require={"dev": ["pytest", "pytest-cov", "pytest-asyncio"]}, ) diff --git a/tests/test_implementation.py b/tests/test_implementation.py index 2d493e7..ff0e3a3 100644 --- a/tests/test_implementation.py +++ b/tests/test_implementation.py @@ -1,5 +1,5 @@ from rss007d0675444aa13fc import query -from exorde_data.models import Item +from exorde_data import Item import pytest @@ -17,4 +17,4 @@ async def test_query(): print(f"Error: {str(e)}") import asyncio -asyncio.run(test_query()) \ No newline at end of file +asyncio.run(test_query())