Fix output (#15)
* fix the error where the output was split by commas

* fix doc

* fix flake8 error

* add date to output

---------

Co-authored-by: parisa-zahedi <p.zahedi@uu.nl>
parisa-zahedi authored Apr 22, 2024
1 parent a48aac7 commit 7ade515
Showing 2 changed files with 15 additions and 8 deletions.
interest/article_final_selection/process_article.py (14 changes: 8 additions & 6 deletions)
@@ -48,26 +48,28 @@ def __init__(self, gzip_file_path: str, article_id: int):
         self.selected: bool = False
 
     def read_article_from_gzip(self) -> (
-            Tuple)[Union[str, None], Union[List[str], None]]:
+            Tuple)[Union[str, None], Union[List[str], None], Union[str, None]]:
         """
         Read article content from a gzip file.
 
         Returns:
-            Tuple[Union[str, None], Union[list, None]]: A tuple containing
-            the title and body of the article.
+            Tuple[Union[str, None], Union[list, None], Union[str, None]]:
+            A tuple containing the title, body, and date of the article.
         """
         try:
             with gzip.open(self._file_path, 'rt') as f:
                 data = json.load(f)
+                metadata = data.get('newsletter_metadata', {})
+                date = metadata.get('date', {})
                 articles = data.get('articles', {})
                 article = articles.get(str(self._article_id), {})
                 title = article.get('title', {})
                 body = article.get('body', {})
-                return title, body
+                return title, body, date
         except Exception as e:  # pylint: disable=broad-except
             logging.error("Error reading article %s from %s: %s",
                           str(self._article_id), self._file_path, e)
-            return None, None
+            return None, None, None
 
     def process_article(self, clean_keywords: List[str]) -> str:
         """
@@ -79,7 +81,7 @@ def process_article(self, clean_keywords: List[str]) -> str:
         Returns:
            str: The processed article body.
        """
-        self._title, self._body = self.read_article_from_gzip()
+        self._title, self._body, _ = self.read_article_from_gzip()
        if (self._title is None) or (self._body is None):
            return ""
        clean_title = clean(self._title)
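
For context, a minimal usage sketch of the changed method, assuming ArticleProcessor is the class defined in process_article.py; the gzip path and article id below are made up, and only the three-element return of read_article_from_gzip comes from this commit:

from interest.article_final_selection.process_article import ArticleProcessor

# Hypothetical inputs: a harvested gzip file and an article id within it.
processor = ArticleProcessor("data/newsletter_1900.json.gz", 3)

# The method now returns (title, body, date); all three are None when the
# file cannot be read or parsed.
title, body, date = processor.read_article_from_gzip()
if title is not None:
    print(date, title)
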
scripts/step4_generate_output.py (9 changes: 7 additions & 2 deletions)
@@ -20,6 +20,7 @@
 BODY_FIELD = "body"
 LABEL_FIELD = "label"
 SELECTED_FIELD = "selected"
+DATE_FIELD = "date"
 
 OUTPUT_UNIT_KEY = "output_unit"
 SENTENCE_PER_SEGMENT_KEY = "sentences_per_segment"
@@ -40,10 +41,13 @@ def read_article(row: pd.Series, formatter: TextFormatter) -> DataFrame:
     file_path = row[FILE_PATH_FIELD]
     article_id = row[ARTICLE_ID_FIELD]
     article_processor = ArticleProcessor(file_path, article_id)
-    title, body = article_processor.read_article_from_gzip()
+    title, body, date = article_processor.read_article_from_gzip()
 
     body_formatted = formatter.format_output(body)
 
+    dates = [date] * len(body_formatted) \
+        if ((not formatter.is_fulltext) and body_formatted is not None) \
+        else [date]
     titles = [title] * len(body_formatted) \
         if ((not formatter.is_fulltext) and body_formatted is not None) \
         else [title]
@@ -57,6 +61,7 @@ def read_article(row: pd.Series, formatter: TextFormatter) -> DataFrame:
         if (not formatter.is_fulltext) and body_formatted is not None \
         else ['']
     return pd.DataFrame({FILE_PATH_FIELD: files_path,
+                         DATE_FIELD: dates,
                          ARTICLE_ID_FIELD: articles_id,
                          TITLE_FIELD: titles,
                          BODY_FIELD: body_formatted,
@@ -138,5 +143,5 @@ def find_articles_in_file(filepath: str, formatter: TextFormatter) -> (
        if df is None:
            continue
        file_name = get_file_name_without_extension(articles_filepath)
-        df.to_csv(os.path.join(args.output_dir, 'articles_to_label_'
+        df.to_csv(os.path.join(args.output_dir, 'to_label_'
                               + file_name+'.csv'), index=False)
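
To illustrate the repetition logic added above, a self-contained sketch; the segment list, date, and title are made up, and only the [date] * len(body_formatted) broadcasting pattern behind the new date column comes from this commit:

import pandas as pd

# Made-up example: a formatter that split one article into three segments.
body_formatted = ["segment one", "segment two", "segment three"]
date = "1900-05-12"
title = "Example title"

# Mirror the diff: repeat the article-level date and title once per segment
# so every output row carries them alongside its body segment.
dates = [date] * len(body_formatted)
titles = [title] * len(body_formatted)

df = pd.DataFrame({"date": dates, "title": titles, "body": body_formatted})
print(df)  # three rows, each with the same date and title and one segment
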
