Define input file (#3)

* define input file classes * define document class * apply pylint * apply flake8 * replace relative with absolute import * remove extra getter functions that are not used * fix variable types in Article class * fix mypy errors * fix flake8 errors * fix flake8 errors * Fix mypy and flake8 issues * Comment parser * Fix flake8 issues * Comment import parser --------- Co-authored-by: parisa-zahedi <p.zahedi@uu.nl> Co-authored-by: Shiva Nadi <s.nadi@uu.nl>
UtrechtUniversity · Mar 7, 2024 · 57235ab · 57235ab
1 parent 8abfb9b
commit 57235ab
Show file tree

Hide file tree

Showing 6 changed files with 393 additions and 13 deletions.
diff --git a/interest/__init__.py b/interest/__init__.py
@@ -1 +1,7 @@
-from interest.preprocessor.parser import XMLExtractor
+# from interest.preprocessor.parser import XMLExtractor
+from interest.delpher_kranten import KrantenFile
+
+# Maps an input-file type name to the InputFile subclass that reads it.
+INPUT_FILE_TYPES = {
+    "delpher_kranten": KrantenFile,
+}
diff --git a/interest/delpher_kranten.py b/interest/delpher_kranten.py
@@ -0,0 +1,118 @@
+"""
+Delpher Kranten Module
+
+This module provides classes and functions for handling Delpher Kranten files.
+"""
+
+import json
+import logging
+import os
+from typing import Optional
+from interest.document import Document, Article
+from interest.input_file import InputFile
+
+
+class KrantenFile(InputFile):
+    """
+    An InputFile implementation for Delpher Kranten.
+
+    Input is a file holding one JSON document; InputFile.open reads it
+    through gzip when the path suffix starts with ".gz". The JSON contains
+    metadata and articles from one issue of a newspaper.
+
+    Attributes:
+        METADATA_FIELD (str): The key for metadata field in JSON data.
+        TITLE_FIELD (str): The key for title field in metadata.
+        DATE_FIELD (str): The key for date field in metadata.
+        LANGUAGE_FIELD (str): The key for language field in metadata.
+        ARTICLES_FIELD (str): The key for articles field in JSON data.
+        ARTICLE_TITLE_FIELD (str): The key for title field in an article.
+        ARTICLE_BODY_FIELD (str): The key for body field in an article.
+        ENCODING (str): The encoding format for reading the file.
+
+    Methods:
+        read_json(json_file): Parse JSON from an open file into a Document.
+        base_file_name(): Return the file name with up to two extensions
+        removed.
+        doc(): Open the input file and parse it into a Document, or None
+        on failure.
+    """
+
+    METADATA_FIELD = "newsletter_metadata"
+    TITLE_FIELD = "title"
+    DATE_FIELD = "date"
+    LANGUAGE_FIELD = "language"
+    ARTICLES_FIELD = "articles"
+    ARTICLE_TITLE_FIELD = "title"
+    ARTICLE_BODY_FIELD = "body"
+    ENCODING = "utf-8"
+
+    def read_json(self, json_file) -> Optional[Document]:
+        """
+        Read JSON data from a file and parse it into a Document object.
+
+        Args:
+            json_file: A file object containing JSON data.
+
+        Returns:
+            Optional[Document]: The parsed Document, or None when the
+            data is not valid JSON or lacks the expected structure.
+        """
+        try:
+            json_data = json.load(json_file)
+            metadata = json_data[self.METADATA_FIELD]
+            document_title = metadata[self.TITLE_FIELD]
+            publish_date = metadata[self.DATE_FIELD]
+            language = metadata[self.LANGUAGE_FIELD]
+
+            articles_data = json_data[self.ARTICLES_FIELD]
+            # Articles are stored as a mapping of article id -> article.
+            articles = [
+                Article(article_id=article_id,
+                        title=article[self.ARTICLE_TITLE_FIELD],
+                        body=article[self.ARTICLE_BODY_FIELD])
+                for article_id, article in articles_data.items()
+            ]
+
+            return Document(title=document_title,
+                            publish_date=publish_date,
+                            language=language,
+                            articles=articles)
+
+        # TypeError/AttributeError: JSON parsed but has an unexpected shape.
+        except (json.JSONDecodeError, KeyError,
+                TypeError, AttributeError) as e:
+            logging.error("Error parsing JSON data: %s", e)
+            return None
+
+    def base_file_name(self) -> str:
+        """
+        Return the file name stripped of its directory and extensions.
+
+        Two extensions are removed, so "issue.json.gz" gives "issue".
+        """
+        file_name_json = os.path.splitext(os.path.basename(self.filepath))[0]
+        # Second pass drops the inner extension left after the first one.
+        base_file_name = os.path.splitext(file_name_json)[0]
+        return base_file_name
+
+    def doc(self) -> Optional[Document]:
+        """
+        Open the input file and parse its JSON into a Document object.
+
+        Returns:
+            Optional[Document]: A Document object parsed from the
+            JSON data, or None if the file cannot be read or parsed.
+        """
+        try:
+            logging.info("Reading directory '%s'...", self._filepath)
+            # "with" closes the handle even when parsing raises.
+            with self.open(encoding=self.ENCODING) as fh:
+                return self.read_json(fh)
+
+        except OSError as e:
+            logging.error("Error processing gzip file '%s': %s",
+                          self._filepath, e)
+            return None
diff --git a/interest/document.py b/interest/document.py
@@ -0,0 +1,131 @@
+# pylint: disable=too-few-public-methods
+"""
+This module defines the Document class, which represents a document
+containing articles.
+"""
+from typing import Optional, List, Union
+from datetime import datetime
+
+
+class Article:
+    """A single article: an identifier, a title, and its body text.
+
+        The body may be supplied either as one string or as a list of
+        paragraphs; a list is joined with newlines into one string.
+
+        Attributes:
+            id (str): The unique identifier of the article
+            (the ``article_id`` constructor argument).
+            title (str): The title of the article.
+            text (str): The body text of the article, represented as
+            a single string.
+    """
+    def __init__(self, article_id: str, title: str,
+                 body: Union[str, List[str]]) -> None:
+        """Store the ID and title and normalise the body to one string.
+
+                Args:
+                    article_id (str): The unique identifier of the article.
+                    title (str): The title of the article.
+                    body (Union[str, List[str]]): The body text, either
+                    a single string or a list of paragraphs.
+        """
+        self.id = article_id
+        self.title = title
+        # A list of paragraphs is flattened to newline-separated text;
+        # a plain string is stored unchanged.
+        self.text = (
+            '\n'.join(body) if isinstance(body, list) else body
+        )
+
+
+class Document:
+    """
+        Represents a document containing articles.
+
+        Args:
+            title (str): The title of the document.
+            publish_date (str): The publication date of the document in
+            the format 'YYYY-MM-DD'.
+            language (str): The language of the document.
+            articles (List[Article]): A list of articles included in
+             the document.
+
+        Attributes:
+            _title (str): The title of the document.
+            _publish_date (str): The publication date of the document in
+            the format 'YYYY-MM-DD'; stored only, no public getter.
+            _year (Optional[int]): The year of publication, parsed from
+            publish_date on first access of ``year`` and then cached.
+            _language (str): The language of the document; stored only,
+            no public getter.
+            _articles (List[Article]): A list of articles included in the
+             document.
+
+        Properties:
+            title (str): Getter for the title of the document.
+            year (Optional[int]): Getter for the year of publication, or
+            None when publish_date is not a 'YYYY-MM-DD' string.
+            decade (Optional[int]): Getter for the decade of publication,
+            i.e. the year rounded down to a multiple of ten.
+            articles (List[Article]): Getter for the list of articles
+            included in the document.
+    """
+    def __init__(self, title: str, publish_date: str, language: str,
+                 articles: List[Article]) -> None:
+        self._year: Optional[int] = None
+        self._articles = articles
+        self._title = title
+        self._publish_date = publish_date
+        self._language = language
+
+    @property
+    def title(self) -> str:
+        """
+            Getter for the title of the document.
+
+            Returns:
+                str: The title of the document.
+        """
+        return self._title
+
+    @property
+    def year(self) -> Optional[int]:
+        """
+            Getter for the year of publication.
+
+            Returns:
+                Optional[int]: The year of publication extracted
+                from publish_date, or None if it cannot be determined.
+        """
+        if self._year is None:
+            try:
+                parsed = datetime.strptime(self._publish_date, '%Y-%m-%d')
+            except (ValueError, TypeError):
+                # TypeError: publish_date is not a string (e.g. None).
+                return None
+            self._year = parsed.year
+        return self._year
+
+    @property
+    def decade(self) -> Optional[int]:
+        """
+            Getter for the decade of publication.
+
+            Returns:
+                Optional[int]: The publication year rounded down to a
+                multiple of ten, or None if the year is unknown.
+        """
+        year = self.year
+        # Floor division keeps the arithmetic in integers.
+        return year // 10 * 10 if year is not None else None
+
+    @property
+    def articles(self) -> List[Article]:
+        """
+            Getter for the list of articles included in the document.
+
+            Returns:
+                List[Article]: The list of articles included in the document.
+        """
+        return self._articles
diff --git a/interest/input_file.py b/interest/input_file.py
@@ -0,0 +1,119 @@
+"""
+Input File Module
+This module provides an abstract class for representing various input files.
+"""
+
+import abc
+import gzip
+from pathlib import Path
+from typing import Iterable, TextIO, cast, Optional
+from interest.document import Document, Article
+import logging
+
+# from .document_filter import DocumentFilter
+
+
+class InputFile(abc.ABC):
+    """
+    Abstract base class for the supported kinds of input file.
+
+    Attributes:
+        _filepath (Path): The file path of the input file.
+
+    Methods:
+        __init__(filepath): Initialize the InputFile with a file path.
+        filepath(): Get the file path of the input file.
+        base_file_name(): Abstract; the file name without extension.
+        open(mode, encoding): Open the input file, through gzip when
+        the suffix starts with ".gz".
+        articles(): Yield the articles of the document returned by doc().
+        doc(): Abstract; the Document parsed from the file, or None.
+    """
+
+    def __init__(self, filepath: Path) -> None:
+        """
+        Remember the location of the input file.
+
+        Args:
+            filepath (Path): The file path of the input file.
+        """
+        self._filepath = filepath
+
+    @property
+    def filepath(self) -> Path:
+        """
+        Get the file path of the input file.
+
+        Returns:
+            Path: The path given to the constructor.
+        """
+        return self._filepath
+
+    @abc.abstractmethod
+    def base_file_name(self) -> str:
+        """
+        Give the name of the input file without its extension.
+
+        Subclasses must override this; the body here only returns the
+        NotImplemented placeholder.
+
+        Returns:
+            str: The base file name without extension.
+        """
+        return NotImplemented
+
+    def open(self, mode: str = "rt", encoding=None) -> TextIO:
+        """
+        Open the input file for reading.
+
+        A path whose suffix starts with ".gz" is opened with gzip.open;
+        any other path is opened with the built-in open.
+
+        Args:
+            mode (str): The file open mode.
+            encoding: The encoding format.
+
+        Returns:
+            TextIO: A file object for reading the input file.
+        """
+        target = self._filepath
+        if not target.suffix.startswith(".gz"):
+            return cast(TextIO, open(target, mode=mode, encoding=encoding))
+        return cast(TextIO, gzip.open(target, mode=mode, encoding=encoding))
+
+    # pylint: disable=no-member
+    def articles(self) -> Iterable[Article]:
+        """
+        Yield every article of the document found in the input file.
+
+        Yields:
+            Article: Each article of the document; nothing is yielded
+            (and an error is logged) when doc() returns None.
+        """
+        document = self.doc()
+        if document is None:
+            logging.error("Document not found or is None for filepath: %s",
+                          self.filepath)
+            return
+        yield from document.articles
+
+    @abc.abstractmethod
+    def doc(self) -> Optional[Document]:
+        """
+            Parse the input file into a document.
+
+            Subclasses must override this; the body here only returns
+            the NotImplemented placeholder.
+
+            Returns:
+                Optional[Document]: A document object, or None.
+        """
+        return NotImplemented
+
+    # def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]:
+    #     document = self.doc()
+    #     if filter.filter_document(document):
+    #         if document.articles() is not None:
+    #             for article in document.articles():
+    #                 if filter.filter_article(article):
+    #                     yield article
diff --git a/interest/preprocessor/__init__.py b/interest/preprocessor/__init__.py
@@ -1 +1 @@
-from interest.preprocessor.parser import XMLExtractor
+# from interest.preprocessor.parser import XMLExtractor
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-from interest.preprocessor.parser import XMLExtractor
		+# from interest.preprocessor.parser import XMLExtractor