Fix output (#15)
* fix the error where the output was split by commas

* fix doc

* fix flake8 error

* add date to output

---------

Co-authored-by: parisa-zahedi <p.zahedi@uu.nl>
parisa-zahedi authored Apr 22, 2024
1 parent a48aac7 commit 7ade515
Showing 2 changed files with 15 additions and 8 deletions.
interest/article_final_selection/process_article.py (14 changes: 8 additions & 6 deletions)
@@ -48,26 +48,28 @@ def __init__(self, gzip_file_path: str, article_id: int):
         self.selected: bool = False
 
     def read_article_from_gzip(self) -> (
-            Tuple)[Union[str, None], Union[List[str], None]]:
+            Tuple)[Union[str, None], Union[List[str], None], Union[str, None]]:
         """
         Read article content from a gzip file.
 
         Returns:
-            Tuple[Union[str, None], Union[list, None]]: A tuple containing
-            the title and body of the article.
+            Tuple[Union[str, None], Union[list, None], Union[str, None]]:
+            A tuple containing the title, body, and date of the article.
         """
         try:
             with gzip.open(self._file_path, 'rt') as f:
                 data = json.load(f)
+                metadata = data.get('newsletter_metadata', {})
+                date = metadata.get('date', {})
                 articles = data.get('articles', {})
                 article = articles.get(str(self._article_id), {})
                 title = article.get('title', {})
                 body = article.get('body', {})
-                return title, body
+                return title, body, date
         except Exception as e:  # pylint: disable=broad-except
             logging.error("Error reading article %s from %s: %s",
                           str(self._article_id), self._file_path, e)
-            return None, None
+            return None, None, None
 
     def process_article(self, clean_keywords: List[str]) -> str:
         """
@@ -79,7 +81,7 @@ def process_article(self, clean_keywords: List[str]) -> str:
         Returns:
            str: The processed article body.
        """
-        self._title, self._body = self.read_article_from_gzip()
+        self._title, self._body, _ = self.read_article_from_gzip()
        if (self._title is None) or (self._body is None):
            return ""
        clean_title = clean(self._title)
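
For context, a minimal usage sketch of the changed method, assuming ArticleProcessor is the class defined in process_article.py; the gzip path and article id below are made up, and only the three-element return of read_article_from_gzip comes from this commit:

from interest.article_final_selection.process_article import ArticleProcessor

# Hypothetical inputs: a harvested gzip file and an article id within it.
processor = ArticleProcessor("data/newsletter_1900.json.gz", 3)

# The method now returns (title, body, date); all three are None when the
# file cannot be read or parsed.
title, body, date = processor.read_article_from_gzip()
if title is not None:
    print(date, title)
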
scripts/step4_generate_output.py (9 changes: 7 additions & 2 deletions)
@@ -20,6 +20,7 @@
 BODY_FIELD = "body"
 LABEL_FIELD = "label"
 SELECTED_FIELD = "selected"
+DATE_FIELD = "date"
 
 OUTPUT_UNIT_KEY = "output_unit"
 SENTENCE_PER_SEGMENT_KEY = "sentences_per_segment"
@@ -40,10 +41,13 @@ def read_article(row: pd.Series, formatter: TextFormatter) -> DataFrame:
     file_path = row[FILE_PATH_FIELD]
     article_id = row[ARTICLE_ID_FIELD]
     article_processor = ArticleProcessor(file_path, article_id)
-    title, body = article_processor.read_article_from_gzip()
+    title, body, date = article_processor.read_article_from_gzip()
 
     body_formatted = formatter.format_output(body)
 
+    dates = [date] * len(body_formatted) \
+        if ((not formatter.is_fulltext) and body_formatted is not None) \
+        else [date]
     titles = [title] * len(body_formatted) \
         if ((not formatter.is_fulltext) and body_formatted is not None) \
         else [title]
@@ -57,6 +61,7 @@ def read_article(row: pd.Series, formatter: TextFormatter) -> DataFrame:
         if (not formatter.is_fulltext) and body_formatted is not None \
         else ['']
     return pd.DataFrame({FILE_PATH_FIELD: files_path,
+                         DATE_FIELD: dates,
                          ARTICLE_ID_FIELD: articles_id,
                          TITLE_FIELD: titles,
                          BODY_FIELD: body_formatted,
@@ -138,5 +143,5 @@ def find_articles_in_file(filepath: str, formatter: TextFormatter) -> (
        if df is None:
            continue
        file_name = get_file_name_without_extension(articles_filepath)
-        df.to_csv(os.path.join(args.output_dir, 'articles_to_label_'
+        df.to_csv(os.path.join(args.output_dir, 'to_label_'
                               + file_name+'.csv'), index=False)
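
To illustrate the repetition logic added above, a self-contained sketch; the segment list, date, and title are made up, and only the [date] * len(body_formatted) broadcasting pattern behind the new date column comes from this commit:

import pandas as pd

# Made-up example: a formatter that split one article into three segments.
body_formatted = ["segment one", "segment two", "segment three"]
date = "1900-05-12"
title = "Example title"

# Mirror the diff: repeat the article-level date and title once per segment
# so every output row carries them alongside its body segment.
dates = [date] * len(body_formatted)
titles = [title] * len(body_formatted)

df = pd.DataFrame({"date": dates, "title": titles, "body": body_formatted})
print(df)  # three rows, each with the same date and title and one segment
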
