diff --git a/interest/article_final_selection/process_article.py b/interest/article_final_selection/process_article.py index aafbcfe..763e166 100644 --- a/interest/article_final_selection/process_article.py +++ b/interest/article_final_selection/process_article.py @@ -48,26 +48,28 @@ def __init__(self, gzip_file_path: str, article_id: int): self.selected: bool = False def read_article_from_gzip(self) -> ( - Tuple)[Union[str, None], Union[List[str], None]]: + Tuple)[Union[str, None], Union[List[str], None], Union[str, None]]: """ Read article content from a gzip file. Returns: - Tuple[Union[str, None], Union[list, None]]: A tuple containing - the title and body of the article. + Tuple[Union[str, None], Union[list, None], Union[str, None]]: + A tuple containing the title, body, and date of the article. """ try: with gzip.open(self._file_path, 'rt') as f: data = json.load(f) + metadata = data.get('newsletter_metadata', {}) + date = metadata.get('date', {}) articles = data.get('articles', {}) article = articles.get(str(self._article_id), {}) title = article.get('title', {}) body = article.get('body', {}) - return title, body + return title, body, date except Exception as e: # pylint: disable=broad-except logging.error("Error reading article %s from %s: %s", str(self._article_id), self._file_path, e) - return None, None + return None, None, None def process_article(self, clean_keywords: List[str]) -> str: """ @@ -79,7 +81,7 @@ def process_article(self, clean_keywords: List[str]) -> str: Returns: str: The processed article body. """ - self._title, self._body = self.read_article_from_gzip() + self._title, self._body, _ = self.read_article_from_gzip() if (self._title is None) or (self._body is None): return "" clean_title = clean(self._title) diff --git a/scripts/step4_generate_output.py b/scripts/step4_generate_output.py index b9904c0..161140c 100644 --- a/scripts/step4_generate_output.py +++ b/scripts/step4_generate_output.py @@ -20,6 +20,7 @@ BODY_FIELD = "body" LABEL_FIELD = "label" SELECTED_FIELD = "selected" +DATE_FIELD = "date" OUTPUT_UNIT_KEY = "output_unit" SENTENCE_PER_SEGMENT_KEY = "sentences_per_segment" @@ -40,10 +41,13 @@ def read_article(row: pd.Series, formatter: TextFormatter) -> DataFrame: file_path = row[FILE_PATH_FIELD] article_id = row[ARTICLE_ID_FIELD] article_processor = ArticleProcessor(file_path, article_id) - title, body = article_processor.read_article_from_gzip() + title, body, date = article_processor.read_article_from_gzip() body_formatted = formatter.format_output(body) + dates = [date] * len(body_formatted) \ + if ((not formatter.is_fulltext) and body_formatted is not None) \ + else [date] titles = [title] * len(body_formatted) \ if ((not formatter.is_fulltext) and body_formatted is not None) \ else [title] @@ -57,6 +61,7 @@ def read_article(row: pd.Series, formatter: TextFormatter) -> DataFrame: if (not formatter.is_fulltext) and body_formatted is not None \ else [''] return pd.DataFrame({FILE_PATH_FIELD: files_path, + DATE_FIELD: dates, ARTICLE_ID_FIELD: articles_id, TITLE_FIELD: titles, BODY_FIELD: body_formatted, @@ -138,5 +143,5 @@ def find_articles_in_file(filepath: str, formatter: TextFormatter) -> ( if df is None: continue file_name = get_file_name_without_extension(articles_filepath) - df.to_csv(os.path.join(args.output_dir, 'articles_to_label_' + df.to_csv(os.path.join(args.output_dir, 'to_label_' + file_name+'.csv'), index=False)