generated from UtrechtUniversity/re-python-package-setuptools
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* define input file classes * define document class * apply pylint * apply flake8 * replace relative with absolute import * remove extra getter functions that are not used * fix variable types in Article class * fix mypy errors * fix flake8 errors * fix flake8 errors * Fix mypy and flake8 issues * Comment parser * Fix flake8 issues * Comment import parser --------- Co-authored-by: parisa-zahedi <p.zahedi@uu.nl> Co-authored-by: Shiva Nadi <s.nadi@uu.nl>
- Loading branch information
1 parent
8abfb9b
commit 57235ab
Showing
6 changed files
with
393 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,7 @@ | ||
from interest.preprocessor.parser import XMLExtractor | ||
# from interest.preprocessor.parser import XMLExtractor | ||
from interest.delpher_kranten import KrantenFile | ||
|
||
# Maps an input-format name to the InputFile subclass that reads that format.
INPUT_FILE_TYPES = {"delpher_kranten": KrantenFile}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
""" | ||
Delpher Kranten Module | ||
This module provides classes and functions for handling Delpher Kranten files. | ||
""" | ||
|
||
import json | ||
import logging | ||
import os | ||
from typing import Optional | ||
from interest.document import Document, Article | ||
from interest.input_file import InputFile | ||
|
||
|
||
class KrantenFile(InputFile):
    """
    An InputFile implementation for Delpher Kranten.

    The input is a single JSON file holding the metadata and the articles
    of one newspaper issue. The file is opened through ``InputFile.open``,
    which uses gzip when the path suffix starts with ``.gz``.

    Attributes:
        METADATA_FIELD (str): The key for metadata field in JSON data.
        TITLE_FIELD (str): The key for title field in metadata.
        DATE_FIELD (str): The key for date field in metadata.
        LANGUAGE_FIELD (str): The key for language field in metadata.
        ARTICLES_FIELD (str): The key for articles field in JSON data.
        ARTICLE_TITLE_FIELD (str): The key for title field in an article.
        ARTICLE_BODY_FIELD (str): The key for body field in an article.
        ENCODING (str): The encoding format for reading the file.

    Methods:
        read_json(json_file): Read JSON data from a file and parse it into
            a Document object.
        base_file_name(): Extract the base file name without extension from
            the filepath.
        doc(): Open the input file and parse its JSON content into a
            Document object.
    """

    METADATA_FIELD = "newsletter_metadata"
    TITLE_FIELD = "title"
    DATE_FIELD = "date"
    LANGUAGE_FIELD = "language"
    ARTICLES_FIELD = "articles"
    ARTICLE_TITLE_FIELD = "title"
    ARTICLE_BODY_FIELD = "body"
    ENCODING = "utf-8"

    def read_json(self, json_file) -> Optional[Document]:
        """
        Read JSON data from a file and parse it into a Document object.

        Args:
            json_file: A file object containing JSON data.

        Returns:
            Optional[Document]: A Document object parsed from
            the JSON data, or None if parsing fails.
        """
        try:
            json_data = json.load(json_file)
            metadata = json_data[self.METADATA_FIELD]
            document_title = metadata[self.TITLE_FIELD]
            publish_date = metadata[self.DATE_FIELD]
            language = metadata[self.LANGUAGE_FIELD]
            articles_data = json_data[self.ARTICLES_FIELD]

            # The articles field is a mapping of article id -> article data.
            articles = [
                Article(article_id=article_id,
                        title=entry[self.ARTICLE_TITLE_FIELD],
                        body=entry[self.ARTICLE_BODY_FIELD])
                for article_id, entry in articles_data.items()
            ]
        except (json.JSONDecodeError, KeyError,
                TypeError, AttributeError) as e:
            # TypeError / AttributeError cover JSON that is valid but has
            # the wrong shape (e.g. a top-level list, or "articles" that is
            # not a mapping), which is a parse failure as well.
            logging.error("Error parsing JSON data: %s", e)
            return None

        return Document(title=document_title,
                        publish_date=publish_date,
                        language=language,
                        articles=articles)

    def base_file_name(self) -> str:
        """
        Extract the base file name without extension from the filepath.

        Up to two extensions are stripped, so ``issue.json.gz`` becomes
        ``issue``.

        Returns:
            str: The base file name without extension.
        """
        file_name_json = os.path.splitext(os.path.basename(self.filepath))[0]
        base_file_name = os.path.splitext(file_name_json)[0]
        return base_file_name

    def doc(self) -> Optional[Document]:
        """
        Open the input file and parse its JSON content into a Document
        object.

        Returns:
            Optional[Document]: A Document object parsed from the
            JSON data, or None if reading or parsing fails.
        """
        try:
            logging.info("Reading directory '%s'...", self._filepath)
            # The context manager guarantees the handle is closed even when
            # reading or parsing raises.
            with self.open(encoding=self.ENCODING) as fh:
                return self.read_json(fh)
        except (OSError, EOFError) as e:
            # gzip raises EOFError (not OSError) for a truncated archive.
            logging.error("Error processing gzip file '%s': %s",
                          self._filepath, e)
            return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
# pylint: disable=too-few-public-methods | ||
""" | ||
This module defines the Document class, which represents a document | ||
containing articles. | ||
""" | ||
from typing import Optional, List, Union | ||
from datetime import datetime | ||
|
||
|
||
class Article:
    """A single article with an identifier, a title and its text.

    The body may be handed over either as one string or as a list of
    paragraphs; a list is joined with newlines into a single string.

    Attributes:
        id (str): The unique identifier of the article.
        title (str): The title of the article.
        text (str): The body text of the article as a single string.
    """
    def __init__(self, article_id: str, title: str,
                 body: Union[str, List[str]]) -> None:
        """Create an Article from its ID, title, and body.

        Args:
            article_id (str): The unique identifier of the article.
            title (str): The title of the article.
            body (Union[str, List[str]]): The body text, either as one
                string or as a list of paragraphs.
        """
        self.id = article_id
        self.title = title
        # A paragraph list is flattened; anything else is stored unchanged.
        self.text = '\n'.join(body) if isinstance(body, list) else body
|
||
|
||
class Document:
    """
    Represents a document containing articles.

    Args:
        title (str): The title of the document.
        publish_date (str): The publication date of the document in
            the format 'YYYY-MM-DD'.
        language (str): The language of the document.
        articles (List[Article]): A list of articles included in
            the document.

    Attributes:
        _title (str): The title of the document.
        _publish_date (str): The publication date of the document in
            the format 'YYYY-MM-DD'.
        _year (Optional[int]): The year of publication, extracted lazily
            from publish_date and cached once it has been parsed.
        _language (str): The language of the document.
        _articles (List[Article]): A list of articles included in the
            document.

    Properties:
        title (str): Getter for the title of the document.
        year (Optional[int]): Getter for the year of publication.
        decade (Optional[int]): Getter for the decade of publication.
        articles (List[Article]): Getter for the list of articles
            included in the document.
    """
    def __init__(self, title: str, publish_date: str, language: str,
                 articles: List["Article"]) -> None:
        self._year: Optional[int] = None
        self._articles = articles
        self._title = title
        self._publish_date = publish_date
        self._language = language

    @property
    def title(self) -> str:
        """
        Getter for the title of the document.

        Returns:
            str: The title of the document.
        """
        return self._title

    @property
    def year(self) -> Optional[int]:
        """
        Getter for the year of publication.

        Returns:
            Optional[int]: The year of publication extracted
            from publish_date, or None if it cannot be determined.
        """
        if self._year is None:
            try:
                parsed = datetime.strptime(self._publish_date, '%Y-%m-%d')
            except (TypeError, ValueError):
                # ValueError: the string is not in 'YYYY-MM-DD' format.
                # TypeError: the date is not a string at all (e.g. None).
                return None
            self._year = parsed.year
        return self._year

    @property
    def decade(self) -> Optional[int]:
        """
        Getter for the decade of publication.

        Returns:
            Optional[int]: The decade of publication extracted from
            publish_date (e.g. 1950 for 1957), or None if it cannot be
            determined.
        """
        year = self.year
        if year is None:
            return None
        # Integer floor division avoids the float round-trip.
        return (year // 10) * 10

    @property
    def articles(self) -> List["Article"]:
        """
        Getter for the list of articles included in the document.

        Returns:
            List[Article]: The list of articles included in the document.
        """
        return self._articles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
""" | ||
Input File Module | ||
This module provides an abstract class for representing various input files. | ||
""" | ||
|
||
import abc | ||
import gzip | ||
from pathlib import Path | ||
from typing import Iterable, TextIO, cast, Optional | ||
from interest.document import Document, Article | ||
import logging | ||
|
||
# from .document_filter import DocumentFilter | ||
|
||
|
||
class InputFile(abc.ABC):
    """
    Abstract class for representing various input files.

    Attributes:
        _filepath (Path): The file path of the input file.

    Methods:
        __init__(filepath): Initialize the InputFile with a file path.
        filepath(): Get the file path of the input file.
        base_file_name(): Return the base file name without extension.
        open(mode, encoding): Open the input file for reading.
        articles(): Return all articles for the document found in the
            input file.
        doc(): Return the document found in the input file.
    """

    def __init__(self, filepath: Path) -> None:
        """
        Initialize the InputFile with a file path.

        Args:
            filepath (Path): The file path of the input file.
        """
        # Coerce to Path so a plain string path also works with the
        # ``.suffix`` lookup in open().
        self._filepath = Path(filepath)

    @property
    def filepath(self) -> Path:
        """
        Get the file path of the input file.

        Returns:
            Path: The file path of the input file.
        """
        return self._filepath

    @abc.abstractmethod
    def base_file_name(self) -> str:
        """
        Return the base name of the input file, without its extension.

        Returns:
            str: The base file name without extension.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError

    def open(self, mode: str = "rt",
             encoding: Optional[str] = None) -> TextIO:
        """
        Open the input file for reading.

        Args:
            mode (str): The file open mode.
            encoding: The encoding format.

        Returns:
            TextIO: A file object for reading the input file.
        """
        if self._filepath.suffix.startswith(".gz"):
            return cast(TextIO, gzip.open(self._filepath, mode=mode,
                                          encoding=encoding))

        # Default to text file
        return cast(TextIO, open(self._filepath,
                                 mode=mode, encoding=encoding))

    # pylint: disable=no-member
    def articles(self) -> Iterable["Article"]:
        """
        Return all articles for the document found in the input file.

        Nothing is yielded (and an error is logged) when ``doc()``
        returns None.

        Yields:
            Article: An article object.
        """
        doc = self.doc()
        if doc is None:
            logging.error("Document not found or is None for filepath: %s",
                          self.filepath)
            return
        yield from doc.articles

    @abc.abstractmethod
    def doc(self) -> Optional["Document"]:
        """
        Return the document found in the input file.

        Returns:
            Optional[Document]: A document object, or None if the file
            could not be read or parsed.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError
|
||
# def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]: | ||
# document = self.doc() | ||
# if filter.filter_document(document): | ||
# if document.articles() is not None: | ||
# for article in document.articles(): | ||
# if filter.filter_article(article): | ||
# yield article |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
from interest.preprocessor.parser import XMLExtractor | ||
# from interest.preprocessor.parser import XMLExtractor |
Oops, something went wrong.