Skip to content

Commit

Permalink
Define input file (#3)
Browse files Browse the repository at this point in the history
* define input file classes

* define document class

* apply pylint

* apply flake8

* replace relative with absolute import

* remove extra getter functions that are not used

* fix variable types in Article class

* fix mypy errors

* fix flake8 errors

* fix flake8 errors

* Fix mypy and flake8 issues

* Comment parser

* Fix flake8 issues

* Comment import parser

---------

Co-authored-by: parisa-zahedi <p.zahedi@uu.nl>
Co-authored-by: Shiva Nadi <s.nadi@uu.nl>
  • Loading branch information
3 people authored Mar 7, 2024
1 parent 8abfb9b commit 57235ab
Show file tree
Hide file tree
Showing 6 changed files with 393 additions and 13 deletions.
8 changes: 7 additions & 1 deletion interest/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,7 @@
from interest.preprocessor.parser import XMLExtractor
# from interest.preprocessor.parser import XMLExtractor
from interest.delpher_kranten import KrantenFile

# Registry of supported input formats: maps a format identifier to the
# class that is used to read files of that format.
INPUT_FILE_TYPES = {"delpher_kranten": KrantenFile}
118 changes: 118 additions & 0 deletions interest/delpher_kranten.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""
Delpher Kranten Module
This module provides classes and functions for handling Delpher Kranten files.
"""

import json
import logging
import os
from typing import Optional
from interest.document import Document, Article
from interest.input_file import InputFile


class KrantenFile(InputFile):
    """
    An InputFile implementation for Delpher Kranten.

    The input file holds one JSON document that contains the metadata
    and the articles of one issue of a newspaper.
    NOTE(review): the file is presumably gzip-compressed
    (``<name>.json.gz``) -- confirm against the data set.

    Attributes:
        METADATA_FIELD (str): The key for metadata field in JSON data.
        TITLE_FIELD (str): The key for title field in metadata.
        DATE_FIELD (str): The key for date field in metadata.
        LANGUAGE_FIELD (str): The key for language field in metadata.
        ARTICLES_FIELD (str): The key for articles field in JSON data.
        ARTICLE_TITLE_FIELD (str): The key for title field in an article.
        ARTICLE_BODY_FIELD (str): The key for body field in an article.
        ENCODING (str): The encoding format for reading the file.

    Methods:
        read_json(json_file): Read JSON data from a file and parse it into
            a Document object.
        base_file_name(): Extract the base file name without extension from
            the filepath.
        doc(): Open the input file and parse its JSON content into a
            Document object.
    """

    METADATA_FIELD = "newsletter_metadata"
    TITLE_FIELD = "title"
    DATE_FIELD = "date"
    LANGUAGE_FIELD = "language"
    ARTICLES_FIELD = "articles"
    ARTICLE_TITLE_FIELD = "title"
    ARTICLE_BODY_FIELD = "body"
    ENCODING = "utf-8"

    def read_json(self, json_file) -> Optional[Document]:
        """
        Read JSON data from a file and parse it into a Document object.

        Args:
            json_file: A file object containing JSON data.

        Returns:
            Optional[Document]: A Document object parsed from
            the JSON data, or None if parsing fails (invalid JSON,
            a missing key, or an unexpected JSON structure).
        """
        try:
            json_data = json.load(json_file)
            metadata = json_data[self.METADATA_FIELD]
            document_title = metadata[self.TITLE_FIELD]
            publish_date = metadata[self.DATE_FIELD]
            language = metadata[self.LANGUAGE_FIELD]

            articles_data = json_data[self.ARTICLES_FIELD]
            articles = [
                Article(article_id=article_id,
                        title=article_data[self.ARTICLE_TITLE_FIELD],
                        body=article_data[self.ARTICLE_BODY_FIELD])
                for article_id, article_data in articles_data.items()
            ]

            return Document(title=document_title,
                            publish_date=publish_date,
                            language=language,
                            articles=articles)

        # TypeError/AttributeError cover structurally wrong JSON, e.g. a
        # list where a mapping is expected; these are parse failures too.
        except (json.JSONDecodeError, KeyError,
                TypeError, AttributeError) as e:
            logging.error("Error parsing JSON data: %s", e)
            return None

    def base_file_name(self) -> str:
        """
        Extract the base file name without extension from the filepath.

        Up to two trailing extensions are removed, so ``issue.json.gz``
        becomes ``issue``.

        Returns:
            str: The base file name without extension.
        """
        file_name_json = os.path.splitext(os.path.basename(self.filepath))[0]
        return os.path.splitext(file_name_json)[0]

    def doc(self) -> Optional[Document]:
        """
        Open the input file and parse its JSON content into a Document
        object.

        Returns:
            Optional[Document]: A Document object parsed from the
            JSON data, or None if the file cannot be read or parsing
            fails.
        """
        try:
            logging.info("Reading directory '%s'...", self.filepath)
            # The context manager closes the handle even when parsing
            # raises an exception that is not handled here.
            with self.open(encoding=self.ENCODING) as fh:
                return self.read_json(fh)

        except OSError as e:
            logging.error("Error processing gzip file '%s': %s",
                          self.filepath, e)
            return None
131 changes: 131 additions & 0 deletions interest/document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# pylint: disable=too-few-public-methods
"""
This module defines the Document class, which represents a document
containing articles.
"""
from typing import Optional, List, Union
from datetime import datetime


class Article:
    """A single article: an identifier, a title and its text.

    The body may be passed either as one string or as a list of
    paragraphs; a list is joined with newlines into a single string.

    Attributes:
        id (str): The unique identifier of the article.
        title (str): The title of the article.
        text (str): The body text of the article, represented as
            a single string.
    """
    def __init__(self, article_id: str, title: str,
                 body: Union[str, List[str]]) -> None:
        """Create an article from its ID, title and body.

        Args:
            article_id (str): The unique identifier of the article.
            title (str): The title of the article.
            body (Union[str, List[str]]): The body text of the article,
                either a single string or a list of paragraphs.
        """
        self.id = article_id
        self.title = title
        # Paragraph lists are flattened; any other value is stored as is.
        self.text = '\n'.join(body) if isinstance(body, list) else body


class Document:
    """
    Represents a document containing articles.

    Args:
        title (str): The title of the document.
        publish_date (str): The publication date of the document in
            the format 'YYYY-MM-DD'.
        language (str): The language of the document.
        articles (List[Article]): A list of articles included in
            the document.

    Attributes:
        _title (str): The title of the document.
        _publish_date (str): The publication date of the document in
            the format 'YYYY-MM-DD'.
        _year (Optional[int]): The year of publication, extracted from
            publish_date on first access of ``year`` and cached.
        _language (str): The language of the document.
        _articles (List[Article]): A list of articles included in the
            document.

    Properties:
        title (str): Getter for the title of the document.
        year (Optional[int]): Getter for the year of publication.
        decade (Optional[int]): Getter for the decade of publication.
        articles (List[Article]): Getter for the list of articles
            included in the document.
    """
    def __init__(self, title: str, publish_date: str, language: str,
                 articles: List["Article"]) -> None:
        self._year: Optional[int] = None
        self._articles = articles
        self._title = title
        self._publish_date = publish_date
        self._language = language

    @property
    def title(self) -> str:
        """
        Getter for the title of the document.

        Returns:
            str: The title of the document.
        """
        return self._title

    @property
    def year(self) -> Optional[int]:
        """
        Getter for the year of publication.

        Returns:
            Optional[int]: The year of publication extracted
            from publish_date, or None if it cannot be determined
            (publish_date is not a 'YYYY-MM-DD' string).
        """
        if self._year is None:
            try:
                parsed = datetime.strptime(self._publish_date, '%Y-%m-%d')
            except (TypeError, ValueError):
                # ValueError: wrong format; TypeError: not a string at all
                # (e.g. a null date in the source data).
                return None
            self._year = parsed.year
        return self._year

    @property
    def decade(self) -> Optional[int]:
        """
        Getter for the decade of publication.

        Returns:
            Optional[int]: The decade of publication extracted from
            publish_date (e.g. 1920 for 1923), or None if it cannot
            be determined.
        """
        year = self.year
        return None if year is None else (year // 10) * 10

    @property
    def articles(self) -> List["Article"]:
        """
        Getter for the list of articles included in the document.

        Returns:
            List[Article]: The list of articles included in the document.
        """
        return self._articles
119 changes: 119 additions & 0 deletions interest/input_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""
Input File Module
This module provides an abstract class for representing various input files.
"""

import abc
import gzip
from pathlib import Path
from typing import Iterable, TextIO, cast, Optional
from interest.document import Document, Article
import logging

# from .document_filter import DocumentFilter


class InputFile(abc.ABC):
    """
    Abstract class for representing various input files.

    Attributes:
        _filepath (Path): The file path of the input file.

    Methods:
        __init__(filepath): Initialize the InputFile with a file path.
        filepath(): Get the file path of the input file.
        base_file_name(): Return the base file name without extension.
        open(mode, encoding): Open the input file for reading.
        articles(): Return all articles for the document found in the
            input file.
        doc(): Return the document found in the input file.
    """

    def __init__(self, filepath: Path) -> None:
        """
        Initialize the InputFile with a file path.

        Args:
            filepath (Path): The file path of the input file. A plain
                string is tolerated and converted to a Path.
        """
        # open() relies on Path.suffix, so normalise the argument here;
        # Path() leaves an existing Path unchanged.
        self._filepath = Path(filepath)

    @property
    def filepath(self) -> Path:
        """
        Get the file path of the input file.

        Returns:
            Path: The file path of the input file.
        """
        return self._filepath

    @abc.abstractmethod
    def base_file_name(self) -> str:
        """
        Return the base name of the input file without extension.

        Returns:
            str: The base file name without extension.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError

    def open(self, mode: str = "rt", encoding=None) -> TextIO:
        """
        Open the input file for reading.

        Files whose suffix starts with ``.gz`` are opened through gzip,
        every other file is opened as a regular file.

        Args:
            mode (str): The file open mode.
            encoding: The encoding format.

        Returns:
            TextIO: A file object for reading the input file.
        """
        if self._filepath.suffix.startswith(".gz"):
            return cast(TextIO, gzip.open(self._filepath, mode=mode,
                                          encoding=encoding))

        # Default to text file
        return cast(TextIO, open(self._filepath,
                                 mode=mode, encoding=encoding))

    # pylint: disable=no-member
    def articles(self) -> Iterable["Article"]:
        """
        Return all articles for the document found in the input file.

        Nothing is yielded (and an error is logged) when ``doc()``
        returns None.

        Yields:
            Article: An article object.
        """
        doc = self.doc()
        if doc is None:
            logging.error("Document not found or is None for filepath: %s",
                          self.filepath)
            return
        yield from doc.articles

    @abc.abstractmethod
    def doc(self) -> Optional["Document"]:
        """
        Return the document found in the input file.

        Returns:
            Optional[Document]: A document object, or None if the input
            file could not be turned into a document.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError

# def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]:
# document = self.doc()
# if filter.filter_document(document):
# if document.articles() is not None:
# for article in document.articles():
# if filter.filter_article(article):
# yield article
2 changes: 1 addition & 1 deletion interest/preprocessor/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from interest.preprocessor.parser import XMLExtractor
# from interest.preprocessor.parser import XMLExtractor
Loading

0 comments on commit 57235ab

Please sign in to comment.