From 1f9433a50cfe0c509b842f45fd71754b2bfa5d6c Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Tue, 25 Jun 2024 14:51:28 +0200 Subject: [PATCH 1/4] change repo_name --- .github/workflows/python-package.yml | 2 +- dataQuest/__init__.py | 0 dataQuest/article_final_selection/__init__.py | 0 .../article_selector.py | 56 +++++ .../process_article.py | 94 ++++++++ .../process_articles.py | 102 +++++++++ dataQuest/filter/__init__.py | 7 + dataQuest/filter/delpher_kranten.py | 118 ++++++++++ dataQuest/filter/document.py | 146 ++++++++++++ dataQuest/filter/document_filter.py | 208 ++++++++++++++++++ dataQuest/filter/input_file.py | 119 ++++++++++ dataQuest/models/base.py | 20 ++ dataQuest/models/tfidf.py | 101 +++++++++ dataQuest/output_generator/text_formater.py | 117 ++++++++++ dataQuest/preprocessor/__init__.py | 1 + dataQuest/preprocessor/parser.py | 207 +++++++++++++++++ dataQuest/preprocessor/text_cleaner.py | 129 +++++++++++ dataQuest/settings.py | 10 + dataQuest/temporal_categorization/__init__.py | 8 + .../timestamped_data.py | 123 +++++++++++ dataQuest/utils.py | 174 +++++++++++++++ scripts/convert_input_files.py | 2 +- scripts/step1_filter_articles.py | 8 +- scripts/step2_categorize_by_timestamp.py | 4 +- scripts/step3_select_final_articles.py | 6 +- scripts/step4_generate_output.py | 10 +- 26 files changed, 1756 insertions(+), 16 deletions(-) create mode 100644 dataQuest/__init__.py create mode 100644 dataQuest/article_final_selection/__init__.py create mode 100644 dataQuest/article_final_selection/article_selector.py create mode 100644 dataQuest/article_final_selection/process_article.py create mode 100644 dataQuest/article_final_selection/process_articles.py create mode 100644 dataQuest/filter/__init__.py create mode 100644 dataQuest/filter/delpher_kranten.py create mode 100644 dataQuest/filter/document.py create mode 100644 dataQuest/filter/document_filter.py create mode 100644 dataQuest/filter/input_file.py create mode 100644 dataQuest/models/base.py create mode 100644 dataQuest/models/tfidf.py create mode 100644 dataQuest/output_generator/text_formater.py create mode 100644 dataQuest/preprocessor/__init__.py create mode 100644 dataQuest/preprocessor/parser.py create mode 100644 dataQuest/preprocessor/text_cleaner.py create mode 100644 dataQuest/settings.py create mode 100644 dataQuest/temporal_categorization/__init__.py create mode 100644 dataQuest/temporal_categorization/timestamped_data.py create mode 100644 dataQuest/utils.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4a5cb18..4fc8315 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -18,7 +18,7 @@ on: # Replace package-name with your package name env: - PACKAGE_NAME: interest + PACKAGE_NAME: dataQuest jobs: build: diff --git a/dataQuest/__init__.py b/dataQuest/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataQuest/article_final_selection/__init__.py b/dataQuest/article_final_selection/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataQuest/article_final_selection/article_selector.py b/dataQuest/article_final_selection/article_selector.py new file mode 100644 index 0000000..c94ab9b --- /dev/null +++ b/dataQuest/article_final_selection/article_selector.py @@ -0,0 +1,56 @@ +"""Module containing the ArticleSelector class for selecting articles based on +similarity scores.""" + +from typing import List, Dict, Union + + +class ArticleSelector: + """Class for selecting articles based on 
similarity scores and + configuration parameters.""" + # pylint: disable=too-few-public-methods + + def __init__(self, similarity_scores: List[float], + config: Dict[str, Union[str, float, int]]): + """Initializes the ArticleSelector object. + + Args: + similarity_scores (List[float]): A list of similarity scores + between keywords and articles. + config (Dict[str, Union[str, float, int]]): A dictionary containing + configuration parameters for selecting articles. + """ + self.similarity_scores = similarity_scores + self.config = config + + def select_articles(self) -> List[int]: + """Selects articles based on the configured selection method and value. + + Returns: + List[int]: A list of indices of selected articles. + """ + sorted_indices = sorted( + range(len(self.similarity_scores)), + key=lambda i: self.similarity_scores[i], + reverse=True + ) + + selected_indices: List[int] = [] + if self.config["type"] == "threshold": + threshold = float(self.config["value"]) + selected_indices.extend( + i for i, score in enumerate(self.similarity_scores) + if score >= threshold + ) + elif self.config["type"] == "num_articles": + num_articles = int(self.config["value"]) + selected_indices.extend(sorted_indices[:num_articles]) + + elif self.config["type"] == "percentage": + percentage = float(self.config["value"]) + num_articles = int(len(self.similarity_scores) * + (percentage / 100.0)) + num_articles = len(self.similarity_scores) if num_articles == 0\ + else num_articles + selected_indices.extend(sorted_indices[:num_articles]) + + return selected_indices diff --git a/dataQuest/article_final_selection/process_article.py b/dataQuest/article_final_selection/process_article.py new file mode 100644 index 0000000..f78036a --- /dev/null +++ b/dataQuest/article_final_selection/process_article.py @@ -0,0 +1,94 @@ +""" Module for processing articles from gzip files.""" +import gzip +import json +import logging +from typing import List, Union, Tuple +from dataQuest.preprocessor.text_cleaner import TextCleaner + +text_cleaner = TextCleaner() + + +def clean(text: Union[str, List[str]]) -> str: + """ + Clean the input text using TextCleaner. + + Args: + text (str): The input text to clean. + + Returns: + str: The cleaned text. + """ + return text_cleaner.preprocess(text) + +# pylint: disable=too-few-public-methods + + +class ArticleProcessor: + """ + Process individual articles from gzip files. + + This class handles the processing of individual articles from + gzip files. + It reads the content of the article, cleans it using TextCleaner, and + determines whether the article contains any keywords of interests in + the title. + """ + def __init__(self, gzip_file_path: str, article_id: int): + """ + Initialize ArticleProcessor with the gzip file path and article ID. + + Args: + gzip_file_path (str): The path to the gzip file. + article_id (int): The ID of the article. + """ + self._file_path = gzip_file_path + self._article_id = article_id + self._title: Union[str, None] = '' + self._body: Union[str, list, None] = '' + self.selected: bool = False + + def read_article_from_gzip(self) -> ( + Tuple)[Union[str, None], Union[List[str], None], Union[str, None]]: + """ + Read article content from a gzip file. + + Returns: + Tuple[Union[str, None], Union[list, None], Union[str, None]]: + A tuple containing the title, body, and date of the article. 
+ """ + try: + with gzip.open(self._file_path, 'rt') as f: + data = json.load(f) + metadata = data.get('newsletter_metadata', {}) + date = metadata.get('date', {}) + articles = data.get('articles', {}) + article = articles.get(str(self._article_id), {}) + title = article.get('title', {}) + body = article.get('body', {}) + return title, body, date + except Exception as e: # pylint: disable=broad-except + logging.error("Error reading article %s from %s: %s", + str(self._article_id), self._file_path, e) + return None, None, None + + def process_article(self, clean_keywords: List[str]) -> str: + """ + Process the article content. + + Args: + clean_keywords (List[str]): A list of clean keywords. + + Returns: + str: The processed article body. + """ + self._title, self._body, _ = self.read_article_from_gzip() + if (self._title is None) or (self._body is None): + return "" + clean_title = clean(self._title) + title_with_keyword = any(keyword in clean_title + for keyword in clean_keywords) + if title_with_keyword: + self.selected = True + return "" + + return clean(self._body) diff --git a/dataQuest/article_final_selection/process_articles.py b/dataQuest/article_final_selection/process_articles.py new file mode 100644 index 0000000..d1f1419 --- /dev/null +++ b/dataQuest/article_final_selection/process_articles.py @@ -0,0 +1,102 @@ +""" +This module contains functions for selecting articles based on keywords +and similarity scores. +""" +from typing import List, Tuple, Dict, Union +import pandas as pd +from sklearn.metrics.pairwise import cosine_similarity +from dataQuest.models.tfidf import TfidfEmbedder +from dataQuest.article_final_selection.process_article import ArticleProcessor +from dataQuest.article_final_selection.process_article import clean +from dataQuest.article_final_selection.article_selector import ArticleSelector + + +def process_articles(articles_filepath: str, clean_keywords: List[str]) -> ( + Tuple)[List[str], List[int]]: + """ + Process articles from a CSV file. + + Args: + articles_filepath (str): The path to the CSV file containing articles. + clean_keywords (List[str]): A list of clean keywords. + + Returns: + Tuple[List[str], List[int]]: A tuple containing the processed article + bodies and selected indices. + """ + articles_df = pd.read_csv(articles_filepath) + article_bodies: List[str] = [] + selected_indices: List[int] = [] + for index, row in articles_df.iterrows(): + article_processor = ArticleProcessor(row['file_path'], + row['article_id']) + processed_article_body = article_processor.process_article( + clean_keywords) + if article_processor.selected: + selected_indices.append(int(str(index))) + elif processed_article_body != "": + article_bodies.append(processed_article_body) + return article_bodies, selected_indices + + +def apply_tfidf_similarity(documents: List[str], keywords: List[str]) -> ( + List)[float]: + """ + Apply TF-IDF similarity between documents and keywords. + + Args: + documents (List[str]): A list of document bodies. + keywords (List[str]): A list of keywords. + + Returns: + List[float]: A list of similarity scores. 
+ """ + model = TfidfEmbedder(ngram_max=1, norm="l1", sublinear_tf=False, min_df=1, + max_df=1.0) + keywords_list = [" ".join(keywords)] + model.fit(documents) + embeddings_documents = model.transform(documents).tocsr() + embeddings_keywords = model.transform(keywords_list).tocsr() + similarity_scores = cosine_similarity(embeddings_keywords, + embeddings_documents) + return similarity_scores[0] + + +def select_top_articles(similarity_scores: List[float], + config: Dict[str, Union[str, float, int]]) \ + -> List[int]: + """ + Select top articles based on similarity scores and configuration. + + Args: + similarity_scores (List[float]): A list of similarity scores. + config (Dict[str, str]): Configuration for selecting articles. + + Returns: + List[int]: A list of selected article indices. + """ + selector = ArticleSelector(similarity_scores, config) + selected_indices = selector.select_articles() + return selected_indices + + +def select_articles(articles_filepath: str, keywords: List[str], + config: Dict[str, Union[str, float, int]]) -> List[int]: + """ + Select articles based on keywords, similarity scores, and configuration. + + Args: + articles_filepath (str): The path to the CSV file containing articles. + keywords (List[str]): A list of keywords. + config (Dict[str, str]): Configuration for selecting articles. + + Returns: + List[int]: A list of selected article indices. + """ + clean_keywords = [clean(keyword) for keyword in keywords] + article_bodies, selected_indices = process_articles(articles_filepath, + clean_keywords) + similarity_scores = apply_tfidf_similarity(article_bodies, clean_keywords) + indices = select_top_articles(similarity_scores, config) + selected_indices.extend(indices) + return selected_indices diff --git a/dataQuest/filter/__init__.py b/dataQuest/filter/__init__.py new file mode 100644 index 0000000..1351c2a --- /dev/null +++ b/dataQuest/filter/__init__.py @@ -0,0 +1,7 @@ +"""define input-file type""" +from dataQuest.filter.delpher_kranten import KrantenFile + +INPUT_FILE_TYPES = { + "delpher_kranten": KrantenFile + +} diff --git a/dataQuest/filter/delpher_kranten.py b/dataQuest/filter/delpher_kranten.py new file mode 100644 index 0000000..f3544ff --- /dev/null +++ b/dataQuest/filter/delpher_kranten.py @@ -0,0 +1,118 @@ +""" +Delpher Kranten Module + +This module provides classes and functions for handling Delpher Kranten files. +""" + +import json +import logging +import os +from typing import Optional +from dataQuest.filter.document import Document, Article +from dataQuest.filter.input_file import InputFile + + +class KrantenFile(InputFile): + """ + An InputFile implementation for Delpher Kranten. + + Input is a zip file which includes one JSON file. The JSON file contains + metadata and articles from one issue of a newspaper. + + Attributes: + METADATA_FIELD (str): The key for metadata field in JSON data. + TITLE_FIELD (str): The key for title field in metadata. + DATE_FIELD (str): The key for date field in metadata. + LANGUAGE_FIELD (str): The key for language field in metadata. + ARTICLES_FIELD (str): The key for articles field in JSON data. + ARTICLE_TITLE_FIELD (str): The key for title field in an article. + ARTICLE_BODY_FIELD (str): The key for body field in an article. + ENCODING (str): The encoding format for reading the file. + + Methods: + read_json(json_file): Read JSON data from a file and parse it into + a Document object. + base_file_name(): Extract the base file name without extension from + the filepath. 
+ doc(): Read the directory and parse the JSON file into a Document + object. + """ + + METADATA_FIELD = "newsletter_metadata" + TITLE_FIELD = "title" + DATE_FIELD = "date" + LANGUAGE_FIELD = "language" + ARTICLES_FIELD = "articles" + ARTICLE_TITLE_FIELD = "title" + ARTICLE_BODY_FIELD = "body" + ENCODING = "utf-8" + + def read_json(self, json_file) -> Optional[Document]: + """ + Read JSON data from a file and parse it into a Document object. + + Args: + json_file: A file object containing JSON data. + + Returns: + Optional[Document]: A Document object parsed from + the JSON data, or None if parsing fails. + """ + try: + json_data = json.load(json_file) + metadata = json_data[self.METADATA_FIELD] + document_title = metadata[self.TITLE_FIELD] + publish_date = metadata[self.DATE_FIELD] + language = metadata[self.LANGUAGE_FIELD] + + articles_data = json_data[self.ARTICLES_FIELD] + + articles = [] + for article_id, article in articles_data.items(): + article_title = article[self.ARTICLE_TITLE_FIELD] + article_body = article[self.ARTICLE_BODY_FIELD] + article = Article(article_id=article_id, title=article_title, + body=article_body) + articles.append(article) + + document = Document(title=document_title, + publish_date=publish_date, + language=language, + articles=articles) + return document + + except (json.JSONDecodeError, KeyError) as e: + logging.error("Error parsing JSON data: %s", e) + return None + + def base_file_name(self) -> str: + """ + Extract the base file name without extension from the filepath. + + Returns: + str: The base file name without extension. + """ + file_name_json = os.path.splitext(os.path.basename(self.filepath))[0] + base_file_name = os.path.splitext(file_name_json)[0] + return base_file_name + + def doc(self) -> Optional[Document]: + """ + Read the directory and parse the JSON file into a Document + object. + + Returns: + Optional[Document]: A Document object parsed from the + JSON data, or None if parsing fails. + """ + try: + logging.info("Reading directory '%s'...", self._filepath) + fh = self.open(encoding=self.ENCODING) + document = self.read_json(fh) + fh.close() + return document + + except OSError as e: + logging.error("Error processing gzip file '%s': %s", + self._filepath, e) + return None diff --git a/dataQuest/filter/document.py b/dataQuest/filter/document.py new file mode 100644 index 0000000..eb3b1d3 --- /dev/null +++ b/dataQuest/filter/document.py @@ -0,0 +1,146 @@ +# pylint: disable=too-few-public-methods +""" +This module defines the Document class, which represents a document +containing articles. +""" +import logging +from typing import Optional, List, Union +from datetime import datetime + + +class Article: + """A class representing an article. + + This class represents an article with an ID, title, and body text. + The body text can be provided as a list + of paragraphs, which will be joined into a single string. + + Attributes: + id (str): The unique identifier of the article. + title (str): The title of the article. + body (str): The body text of the article, represented as + a single string. + """ + def __init__(self, article_id: str, title: str, + body: Union[str, List[str]]) -> None: + """Initialize an Article object with the given ID, title, and body. + + Args: + id (str): The unique identifier of the article. + title (str): The title of the article. + body (Union[str, List[str]): The body text of the article, + provided as a list of paragraphs. 
+ """ + self.id = article_id + self.title = title + if isinstance(body, list): + if any(item is None for item in body): + logging.warning("There is a None value in body") + self.text = "" + else: + article_body = '\n'.join(body) + self.text = article_body + else: + self.text = body + + +class Document: + """ + Represents a document containing articles. + + Args: + title (str): The title of the document. + publish_date (str): The publication date of the document in + the format 'YYYY-MM-DD'. + language (str): The language of the document. + articles (List[Article]): A list of articles included in + the document. + + Attributes: + _title (str): The title of the document. + _publish_date (str): The publication date of the document in + the format 'YYYY-MM-DD'. + _year (Optional[int]): The year of publication, extracted from + publish_date. + _language (str): The language of the document. + _articles (List[Article]): A list of articles included in the + document. + + Properties: + title (str): Getter for the title of the document. + publish_date (str): Getter for the publication date of the + document. + year (Optional[int]): Getter for the year of publication. + decade (Optional[int]): Getter for the decade of publication. + language (str): Getter for the language of the document. + articles (List[Article]): Getter for the list of articles + included in the document. + """ + def __init__(self, title: str, publish_date: str, language: str, + articles: List[Article]) -> None: + self._year: Optional[int] = None + self._articles = articles + self._title = title + self._publish_date = publish_date + self._language = language + + @property + def title(self) -> str: + """ + Getter for the title of the document. + + Returns: + str: The title of the document. + """ + return self._title + + @property + def publish_date(self) -> str: + """ + Getter for the publish_date of the document. + + Returns: + str: The publish_date of the document. + """ + return self._publish_date + + @property + def year(self) -> Optional[int]: + """ + Getter for the year of publication. + + Returns: + Optional[int]: The year of publication extracted + from publish_date, or None if it cannot be determined. + """ + if self._year is not None: + return self._year + try: + date_obj = datetime.strptime(self._publish_date, '%Y-%m-%d') + self._year = date_obj.year + return self._year + except ValueError: + return None + + @property + def decade(self) -> Optional[int]: + """ + Getter for the decade of publication. + + Returns: + Optional[int]: The decade of publication extracted from + publish_date, + or None if it cannot be determined. + """ + _ = self.year + return int(self._year / 10) * 10 if self._year is not None else None + + @property + def articles(self) -> List[Article]: + """ + Getter for the list of articles included in the document. + + Returns: + List[Article]: The list of articles included in the document. + """ + return self._articles diff --git a/dataQuest/filter/document_filter.py b/dataQuest/filter/document_filter.py new file mode 100644 index 0000000..ee5bb8c --- /dev/null +++ b/dataQuest/filter/document_filter.py @@ -0,0 +1,208 @@ +""" +Document Filter Module +This module provides classes for filtering documents and articles. +""" +from abc import ABC, abstractmethod +from typing import List +from dataQuest.filter.document import Document, Article + + +class DocumentFilter(ABC): + """ + Abstract base class for document filters. 
+ + Methods: + filter_document(document: Document) -> bool: Abstract method + to filter documents. + filter_article(article: Article) -> bool: Method to filter + articles. + """ + @abstractmethod + def filter_document(self, document: Document) -> bool: + """ + Abstract method to filter documents. + + Args: + document (Document): The document to be filtered. + + Returns: + bool: True if the document passes the filter, + False otherwise. + """ + return NotImplemented + + def filter_article(self, _article: Article) -> bool: + """ + Method to filter articles. + + By default, returns True, allowing all articles to + pass through. + + Args: + _article (Article): The article to be filtered. + + Returns: + bool: True if the article passes the filter, + False otherwise. + """ + return True + + +class TitleFilter(DocumentFilter): + """ + Filter documents by title. + + Attributes: + title (str): The title to filter by. + """ + def __init__(self, title: str): + self.title = title + + def filter_document(self, document: Document) -> bool: + """ + Filter documents by title. + + Args: + document (Document): The document to be filtered. + + Returns: + bool: True if the document's title contains the specified + title, False otherwise. + """ + return self.title in document.title + + +class YearFilter(DocumentFilter): + """ + Filter documents by year. + + Attributes: + year (int): The year to filter by. + """ + def __init__(self, year: int): + self.year = year + + def filter_document(self, document: Document) -> bool: + """ + Filter documents by year. + + Args: + document (Document): The document to be filtered. + + Returns: + bool: True if the document's year matches the specified + year, False otherwise. + """ + return document.year == self.year + + +class DecadeFilter(DocumentFilter): + """ + Filter documents by decade. + + Attributes: + decade (int): The decade to filter by. + """ + def __init__(self, decade: int): + self.decade = decade + + def filter_document(self, document: Document) -> bool: + """ + Filter documents by decade. + + Args: + document (Document): The document to be filtered. + + Returns: + bool: True if the document's decade matches the + specified decade, False otherwise. + """ + return document.decade == self.decade + + +class KeywordsFilter(DocumentFilter): + """ + Filter documents and articles by keywords. + + Attributes: + keywords (List[str]): The list of keywords to filter by. + """ + def __init__(self, keywords: List[str]): + self.keywords = keywords + + def filter_document(self, document: Document) -> bool: + """ + Filter documents by keywords. + + Args: + document (Document): The document to be filtered. + + Returns: + bool: Always returns True. + """ + return True + + def filter_article(self, article: Article) -> bool: + """ + Filter articles by keywords. + + Args: + article (Article): The article to be filtered. + + Returns: + bool: True if the article's title or text contains any + of the specified keywords, False otherwise. + """ + return any(keyword in article.title or keyword in article.text for + keyword in self.keywords) + + +class CompoundFilter(DocumentFilter): + """ + Compound filter combining multiple filters. + + Attributes: + filters (List[DocumentFilter]): The list of filters to apply. + """ + def __init__(self, filters: List[DocumentFilter]): + self.filters = filters + + def filter_document(self, document: Document) -> bool: + """ + Filter documents by applying all filters. + + Args: + document (Document): The document to be filtered. 
+ + Returns: + bool: True if the document passes all filters, + False otherwise. + """ + return all(filter_.filter_document(document) + for filter_ in self.filters) + + def filter_article(self, article: Article) -> bool: + """ + Filter articles by applying all filters. + + Args: + article (Article): The article to be filtered. + + Returns: + bool: True if the article passes all filters, + False otherwise. + """ + return all(filter_.filter_article(article) for filter_ in self.filters) + + def include_keyword_filter(self) -> bool: + """ + Check if the compound filter includes a KeywordsFilter. + + Returns: + bool: True if the compound filter includes a + KeywordsFilter, False otherwise. + """ + for filter_ in self.filters: + if isinstance(filter_, KeywordsFilter): + return True + return False diff --git a/dataQuest/filter/input_file.py b/dataQuest/filter/input_file.py new file mode 100644 index 0000000..fc799fb --- /dev/null +++ b/dataQuest/filter/input_file.py @@ -0,0 +1,119 @@ +""" +Input File Module +This module provides an abstract class for representing various input files. +""" + +import abc +import gzip +import logging +from pathlib import Path +from typing import Iterable, TextIO, cast, Optional +from dataQuest.filter.document import Document, Article +from dataQuest.filter.document_filter import DocumentFilter + + +class InputFile(abc.ABC): + """ + Abstract class for representing various input files. + + Attributes: + _filepath (Path): The file path of the input file. + + Methods: + __init__(filepath): Initialize the InputFile with a file path. + filepath(): Get the file path of the input file. + base_file_name(): Output a list of documents in the input file. + open(mode, encoding): Open the input file for reading. + articles(): Return all articles for the document found in the + input file. + doc(): Output a list of documents in the input file. + """ + + def __init__(self, filepath: Path) -> None: + """ + Initialize the InputFile with a file path. + + Args: + filepath (Path): The file path of the input file. + """ + self._filepath = filepath + + @property + def filepath(self) -> Path: + """ + Get the file path of the input file. + + Returns: + Path: The file path of the input file. + """ + return self._filepath + + @abc.abstractmethod + def base_file_name(self) -> str: + """ + Output a list of documents in the input file. + + This can be a singleton list if an input file contains only + one document. + + Returns: + str: The base file name without extension. + """ + return NotImplemented + + def open(self, mode: str = "rt", encoding=None) -> TextIO: + """ + Open the input file for reading. + + Args: + mode (str): The file open mode. + encoding: The encoding format. + + Returns: + TextIO: A file object for reading the input file. + """ + if self._filepath.suffix.startswith(".gz"): + return cast(TextIO, gzip.open(self._filepath, mode=mode, + encoding=encoding)) + + # Default to text file + return cast(TextIO, open(self._filepath, + mode=mode, encoding=encoding)) + + # pylint: disable=no-member + def articles(self) -> Iterable[Article]: + """ + Return all articles for the document found in the input file. + + Yields: + Article: An article object. + """ + doc = self.doc() + if doc is not None: + yield from doc.articles + else: + logging.error("Document not found or is None for filepath: %s", + self.filepath) + return + + @abc.abstractmethod + def doc(self) -> Optional[Document]: + """ + Output a list of documents in the input file. 
+ + This can be a singleton list if an input file contains only + one document. + + Returns: + Document: A document object. + """ + return NotImplemented + + def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]: + document = self.doc() + if document is not None: + if filter.filter_document(document): + if document.articles is not None: + for article in document.articles: + if filter.filter_article(article): + yield article diff --git a/dataQuest/models/base.py b/dataQuest/models/base.py new file mode 100644 index 0000000..64832a1 --- /dev/null +++ b/dataQuest/models/base.py @@ -0,0 +1,20 @@ +"""Base class for document embeddings.""" + +from abc import ABC, abstractmethod +from typing import Union, Sequence +import scipy +from numpy import typing as npt +import numpy as np + + +class BaseEmbedder(ABC): + """Base class for creating document embeddings.""" + + @abstractmethod + def fit(self, documents: Sequence[str]) -> None: + """Train the model on documents.""" + + @abstractmethod + def transform(self, documents: Union[str, Sequence[str]]) -> ( + Union)[scipy.sparse.spmatrix, npt.NDArray[np.float_]]: + """Get the embedding for a document.""" diff --git a/dataQuest/models/tfidf.py b/dataQuest/models/tfidf.py new file mode 100644 index 0000000..8583f07 --- /dev/null +++ b/dataQuest/models/tfidf.py @@ -0,0 +1,101 @@ +"""Sklearn TF-IDF class.""" + +from typing import Sequence, Union, Optional +import warnings + +import scipy +from sklearn.feature_extraction.text import TfidfVectorizer + +from dataQuest.models.base import BaseEmbedder +from dataQuest.utils import load_spacy_model +from dataQuest.settings import SPACY_MODEL + + +class TfidfEmbedder(BaseEmbedder): + # pylint: disable=too-many-instance-attributes + """ + Sklearn TF-IDF class. + + Arguments + --------- + ngram_max: + Maximum n-gram, higher numbers mean bigger embeddings. + norm: + Which kind of normalization is used: "l1", "l2" or None. + sublinear_tf: + Apply sublinear term-frequency scaling. + min_df: + Minimum document frequency of word to be included in the embedding. + max_df: + Maximum document frequency of word to be included in the embedding. + """ + + # pylint: disable=too-many-arguments + + def __init__( + self, ngram_max: int = 1, norm: Optional[str] = "l1", + sublinear_tf: bool = False, min_df: int = 1, + max_df: float = 1.0, spacy_model=SPACY_MODEL): + """Initialize the TF-IDF embedder.""" + self.nlp = ( + load_spacy_model(spacy_model) + if isinstance(spacy_model, str) + else spacy_model + ) + self.stopword_list = self.nlp.Defaults.stop_words + self.stop_words = list(self.stopword_list) + self.ngram_max = ngram_max + + self.norm = norm + self.sublinear_tf = sublinear_tf + self.min_df = min_df + self.max_df = max_df + if self.norm == "None": + self.norm = None + + self._model: Optional[TfidfVectorizer] = None + + def fit(self, documents: Sequence[str]) -> None: + """ + Fit the TF-IDF model on the given documents. + + Args: + documents (Sequence[str]): A sequence of document strings. 
+ """ + min_df = min(self.min_df, len(documents)) + max_df = max(min_df/len(documents), self.max_df) + + def _tokenizer(text): + doc = self.nlp(text) + tokens = [token.lemma_.lower() for token in doc + if not token.is_stop and not token.is_punct] + return tokens + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + self._model = TfidfVectorizer( + ngram_range=(1, self.ngram_max), + stop_words=self.stop_words, + tokenizer=_tokenizer, # self.stem_tokenizer, + min_df=min_df, + norm=self.norm, + sublinear_tf=self.sublinear_tf, + max_df=max_df) + self._model.fit(documents) + + def transform(self, documents: Union[str, Sequence[str]]) -> Union[ + scipy.sparse.spmatrix]: + """ + Transform the input documents into TF-IDF embeddings. + + Args: + documents (Union[str, Sequence[str]]): A single document string or + a sequence of document strings. + + Returns: + Union[scipy.sparse.spmatrix]: The TF-IDF embeddings of the input + documents. + """ + if self._model is None: + raise ValueError("Fit TF-IDF model before transforming data.") + return self._model.transform(documents).tocsr() diff --git a/dataQuest/output_generator/text_formater.py b/dataQuest/output_generator/text_formater.py new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/dataQuest/output_generator/text_formater.py @@ -0,0 +1,117 @@ +""" This module defines a TextFormatter class for formatting text based on +specified output units. """ +from typing import List, Union +import logging +from dataQuest.settings import SPACY_MODEL +from dataQuest.utils import load_spacy_model + +PARAGRAPH_FORMATTER = 'paragraph' +FULLTEXT_FORMATTER = 'full_text' +SEGMENTED_TEXT_FORMATTER = 'segmented_text' + + +class TextFormatter: + # pylint: disable=R0903 + """Class for formatting text based on specified output units. """ + + def __init__(self, output_unit: str, sentences_per_segment: int, + spacy_model=SPACY_MODEL): # : Union[str, Language] + """ + Initializes the TextFormatter object. + + Args: + output_unit (str): The type of output unit ('paragraph', + 'full_text', 'segmented_text'). + sentences_per_segment (int): Number of sentences per + segment when output_unit is 'segmented_text'. + spacy_model (Union[str, Language], optional): Spacy model + or model name used for text processing. Defaults to the global + SPACY_MODEL value. + """ + self.nlp = ( + load_spacy_model(spacy_model) + if isinstance(spacy_model, str) + else spacy_model + ) + self.sentences_per_segment = sentences_per_segment + self.formatter = output_unit + self.is_fulltext = self._is_fulltext() + self.texts: List[str] = [] + + def format_output(self, texts: Union[None, List[str]]) -> ( + Union)[str, List[str], List[List[str]], None]: + """ + Formats input texts based on the specified output unit. + + Args: + texts (List[str]): List of input texts to be formatted. + + Returns: + Union[str, List[str], List[List[str]]]: Formatted output text + based on the selected output_unit. For 'full_text', returns a + single string. For 'paragraph' and 'segmented_text', returns a + list of segmented text lists. + + Raises: + ValueError: If input 'texts' is not a list of strings. + ValueError: If an unsupported formatter type is specified. 
+ """ + try: + if (not isinstance(texts, list) or (texts is None) or + not all(isinstance(text, str) for text in texts)): + raise ValueError("Input 'texts' must be a list of strings.") + + self.texts = texts + + if self.formatter == PARAGRAPH_FORMATTER: + return self._format_paragraph() + if self.formatter == FULLTEXT_FORMATTER: + return self._format_fulltext() + if self.formatter == SEGMENTED_TEXT_FORMATTER: + return self._format_segmented_text() + + except ValueError as e: + logging.error("Unsupported formatter %s: %s", self.formatter, e) + return None + return None + + def _format_paragraph(self) -> List[str]: + """Formats texts as a single paragraph. + + Returns: + List[List[str]]: List of input texts, segmented in paragraphs. + """ + return self.texts + + def _format_fulltext(self) -> str: + """Formats texts as full text with newline separators. + + Returns: + str: Newline-separated string of input texts. + """ + return '\n'.join(self.texts) + + def _format_segmented_text(self) -> List[List[str]]: + """Formats texts as segmented text based on sentences_per_segment. + + Returns: + List[List[str]]: Flattened list of segmented text strings. + """ + segmented_texts = [] + for text in self.texts: + doc = self.nlp(text) + sentences = [sent.text for sent in doc.sents] + + for i in range(0, len(sentences), self.sentences_per_segment): + segment = sentences[i:i + self.sentences_per_segment] + segmented_texts.append(segment) + + return segmented_texts + + def _is_fulltext(self) -> bool: + """Checks if the formatter type is 'full_text'. + + Returns: + bool: True if formatter is 'full_text', False otherwise. + """ + return self.formatter == FULLTEXT_FORMATTER diff --git a/dataQuest/preprocessor/__init__.py b/dataQuest/preprocessor/__init__.py new file mode 100644 index 0000000..f6b7579 --- /dev/null +++ b/dataQuest/preprocessor/__init__.py @@ -0,0 +1 @@ +# from dataQuest.preprocessor.parser import XMLExtractor diff --git a/dataQuest/preprocessor/parser.py b/dataQuest/preprocessor/parser.py new file mode 100644 index 0000000..efb7dd3 --- /dev/null +++ b/dataQuest/preprocessor/parser.py @@ -0,0 +1,207 @@ + +import os +import tarfile +import gzip +import json +import xml.etree.ElementTree as ET +from typing import Dict, Union, Any, Optional, List +import logging + + +class XMLExtractor: + """Class for extracting XML content and metadata from nested .tgz files.""" # noqa: E501 + def __init__(self, root_dir: str, output_dir: str): + """ + Initializes the XMLExtractor object. + + Parameters: + root_dir (str): The root directory containing .tgz files. + output_dir (str): The output directory for saving extracted JSON files. # noqa: E501 + """ + self.root_dir = root_dir + self.output_dir = output_dir + self.fields = [ + "title", "language", "issuenumber", "date", "identifier", + "temporal", "recordRights", "publisher", "spatial", "source", + "recordIdentifier", "type", "isPartOf" + ] + + def extract_xml_string(self) -> None: + """ + Extracts XML content and metadata from .tgz files in the root directory. # noqa: E501 + """ + for folder_name in os.listdir(self.root_dir): + folder_path = os.path.join(self.root_dir, folder_name) + if not os.path.isdir(folder_path): + continue + if not folder_name.isdigit(): # Exclude in_progress, manifests, and ocr_complete folders and log files. # noqa: E501 + continue + self.process_folder(folder_name, folder_path) + + def process_folder(self, folder_name: str, folder_path: str) -> None: + """ + Processes .tgz files within a folder. 
+ + Parameters: + folder_name (str): Name of the folder being processed. + folder_path (str): Path to the folder being processed. + """ + for tgz_filename in os.listdir(folder_path): + if not tgz_filename.endswith('.tgz'): + continue + tgz_file_path = os.path.join(folder_path, tgz_filename) + base_name = os.path.splitext(tgz_filename)[0] + output_folder = os.path.join(self.output_dir, folder_name) + os.makedirs(output_folder, exist_ok=True) + try: + with tarfile.open(tgz_file_path, "r:gz") as outer_tar: + news_dict = self.process_tar(outer_tar) + except tarfile.TarError as e: + logging.error(f"Error extracting {tgz_filename}: {e}") + continue + output_file = os.path.join(output_folder, f"{base_name}.json.gz") + self.save_as_json_compressed(news_dict, output_file) + # self.save_as_json(news_dict, output_file) + + def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: # noqa: E501 + """ + Processes a .tgz file and extracts XML content and metadata. + + Parameters: + outer_tar (tarfile.TarFile): The .tgz file being processed. + + Returns: + Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. # noqa: E501 + """ + news_dict: Dict[str, Any] = {"newsletter_metadata": {}, "articles": {}} + id = 0 + for entry in outer_tar: + try: + if entry.name.endswith(".xml"): + file = outer_tar.extractfile(entry) + if file is not None: + content = file.read() + xml_content = content.decode('utf-8', 'ignore') + article = self.extract_article(xml_content, entry.name) + id += 1 + news_dict["articles"][id] = article + + elif entry.name.endswith(".gz"): + gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) # noqa: E501 + with outer_tar.extractfile(gz_member) as gz_file: # type: ignore # noqa: E501 + with gzip.open(gz_file, 'rt') as xml_file: + xml_string = xml_file.read() + if isinstance(xml_string, bytes): + xml_string = xml_string.decode('utf-8') + newsletter_metadata = self.extract_meta(xml_string) + news_dict["newsletter_metadata"] = newsletter_metadata # noqa: E501 + else: + continue + except Exception as e: + logging.error(f"Error processing file {entry.name}: {e}") + return news_dict + + @staticmethod + def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: # noqa: E501 + """ + Saves data as compressed JSON using gzip. + + Parameters: + data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. # noqa: E501 + output_file (str): Path to the output JSON file. + """ + try: + with gzip.open(output_file, 'wt') as json_file: + json.dump(data, json_file, indent=4) + except Exception as e: + logging.error(f"Error saving compressed JSON to {output_file}: {e}") # noqa: E501 + + # @staticmethod + # def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: # noqa: E501 + # """ + # Saves data as JSON to a specified file. + + # Parameters: + # data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. # noqa: E501 + # output_file (str): Path to the output JSON file. 
+ # """ + # try: + # with open(output_file, 'w') as json_file: + # json.dump(data, json_file, indent=4) + # except Exception as e: + # logging.error(f"Error saving JSON to {output_file}: {e}") + + @staticmethod + def extract_article(xml_content: str, file_name: str) -> Dict[str, Union[str, List[Optional[str]]]]: # noqa: E501 + """ + Extracts article title and body from XML content. + + Parameters: + xml_content (str): XML content of the article. + file_name (str): Name of the XML file. + + Returns: + Dict[Optional[str], list[str]]: A dictionary containing the extracted title and body of the article. + body contains a list of paragraphs. # noqa: E501 + """ + try: + root = ET.fromstring(xml_content) + except ET.ParseError: + logging.error(f"Failed to parse XML from file: {file_name}") + return {} + + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] # noqa: E501 + if len(title_values) > 1: + logging.warning("More than one titles are extracted for the article.") # noqa: E501 + if not title_values: + logging.warning("No title is extracted for the article.") + title = "" + else: + title = title_values[0] if title_values[0] is not None else "" + # title = title_values[0] + + body_values = [element.text for element in root.iter() if element.tag.endswith('p')] # noqa: E501 + if not body_values: + logging.warning("No body is extracted.") + body = [] + # elif len(body_values) > 1: + # logging.warning("There are more than one paragraphs in the article.") # noqa: E501 + # body = ' '.join(body_values) + else: + # body = body_values[0] + body = body_values + + return {"title": title, "body": body} + + def extract_meta(self, xml_string: str) -> Dict[str, Union[str, None]]: + """ + Extracts metadata from XML string. + + Parameters: + xml_string (str): XML string containing metadata. + + Returns: + Dict[str, Union[str, None]]: A dictionary containing the extracted metadata. # noqa: E501 + """ + newsletter_metadata: Dict[str, Union[str, None]] = {} + + try: + root = ET.fromstring(xml_string) + except ET.ParseError: + logging.error("Failed to parse XML from file") + return newsletter_metadata + + for field in self.fields: + field_values = [element.text for element in root.iter() if element.tag.endswith(field)] # noqa: E501 + if len(field_values) > 1: + logging.warning(f"More than one {field}s are extracted from metadata.") # noqa: E501 + if not field_values: + logging.warning(f"No {field} is extracted.") + newsletter_metadata[field] = None + else: + filtered_field_values = [value for value in field_values if value is not None] # noqa: E501 + newsletter_metadata[field] = filtered_field_values[0] if field != "spatial" else ", ".join(filtered_field_values) # noqa: E501 + + # newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501 + + return newsletter_metadata diff --git a/dataQuest/preprocessor/text_cleaner.py b/dataQuest/preprocessor/text_cleaner.py new file mode 100644 index 0000000..33cd27b --- /dev/null +++ b/dataQuest/preprocessor/text_cleaner.py @@ -0,0 +1,129 @@ +""" +This module provides a TextCleaner class for preprocessing text +data using various cleaning techniques. +""" +import re +from typing import Union, List +from dataQuest.settings import SPACY_MODEL +from dataQuest.utils import load_spacy_model + + +def merge_texts_list(text: Union[str, List[str]]) -> str: + """ + Merge a list of texts into a single string by joining them with spaces. 
+ + Args: + text (Union[str, List[str]]): The input text or list of texts to merge. + + Returns: + str: The merged text if input is a list of strings, otherwise returns + the input text unchanged. + """ + if isinstance(text, list): + merged_text = ' '.join(text) + return merged_text + return text + + +class TextCleaner: + """A class for cleaning text data using various preprocessing + techniques.""" + + def __init__(self, spacy_model=SPACY_MODEL) -> None: + """Initialize the TextCleaner instance. + + Args: + spacy_model (str or spacy.Language, optional): The SpaCy + model to use for text processing. + Defaults to the model specified in the settings. + """ + + self.nlp = ( + load_spacy_model(spacy_model) + if isinstance(spacy_model, str) + else spacy_model + ) + self.stopword_list = self.nlp.Defaults.stop_words + self.stopwords = set(self.stopword_list) + self.text = "" + + def get_lower_lemma_tokens(self) -> None: + """ + Get lowercased lemmatized tokens from the text. + + This method processes the text stored in the instance variable + `self.text`,tokenizes it using the SpaCy pipeline `self.nlp`, + and then lemmatizes each token, converting it to lowercase. + Stop words and punctuation tokens are filtered out. + """ + doc = self.nlp(self.text) + self.text = " ".join([token.lemma_.lower() for token in doc + if not token.is_stop and not token.is_punct]) + + def get_words(self): + """Tokenize words in the text.""" + doc = self.nlp(self.text) + self.text = " ".join([token.text for token in doc]) + + def lower(self): + """Transform the text to lower case.""" + self.text = self.text.lower() + + def remove_stopwords(self): + """Remove the stopwords from the text.""" + doc = self.nlp(self.text) + self.text = " ".join([token.text for token in doc if token.text + not in self.stopwords]) + + def remove_numeric(self): + """Remove numbers from the text.""" + self.text = re.sub(r'\d+', '', self.text) + + def remove_non_ascii(self): + """Remove non ASCII characters from the text.""" + self.text = re.sub(r'[^\x00-\x7f]', '', self.text) + + def remove_extra_whitespace_tabs(self): + """Remove extra whitespaces and tabs from the text.""" + self.text = re.sub(r'\s+', ' ', self.text) + + def remove_one_char(self): + """Remove single characters from the text.""" + self.text = " ".join([w for w in self.text.split() if len(w) > 1]) + + def keep_standard_chars(self): + """Keep only standard characters in the text.""" + self.text = re.sub(r'[^-0-9\w,. ?!()%/]', '', self.text) + + def preprocess(self, text): + """Preprocess the given text using a series of cleaning steps. + + Args: + text ( List[str]): The text to preprocess. + + Returns: + str: The preprocessed text. + """ + self.text = merge_texts_list(text) + self.get_lower_lemma_tokens() + self.remove_numeric() + self.remove_extra_whitespace_tabs() + self.remove_one_char() + return self.text + + def clean(self, text): + """Clean the given text by removing non-standard characters and + extra whitespace. + + Args: + text (str): The text to clean. + + Returns: + str: The cleaned text. + """ + self.text = merge_texts_list(text) + self.text = text + self.get_words() + self.keep_standard_chars() + self.remove_extra_whitespace_tabs() + return self.text diff --git a/dataQuest/settings.py b/dataQuest/settings.py new file mode 100644 index 0000000..391f9b4 --- /dev/null +++ b/dataQuest/settings.py @@ -0,0 +1,10 @@ +""" +Module containing configuration settings for the project. 
+""" +import os + +SPACY_MODEL = os.getenv("SPACY_MODEL", "nl_core_news_sm") +"""Spacy model to use for sentence splitting.""" + +ENCODING = os.getenv("ENCODING", "utf-8") +"""Encoding used for reading and writing files.""" diff --git a/dataQuest/temporal_categorization/__init__.py b/dataQuest/temporal_categorization/__init__.py new file mode 100644 index 0000000..ca3bb25 --- /dev/null +++ b/dataQuest/temporal_categorization/__init__.py @@ -0,0 +1,8 @@ +"""Mapping from string format descriptions to corresponding classes.""" +from dataQuest.temporal_categorization.timestamped_data \ + import (YearPeriodData, DecadePeriodData) + +PERIOD_TYPES = { + "decade": DecadePeriodData, + "year": YearPeriodData +} diff --git a/dataQuest/temporal_categorization/timestamped_data.py b/dataQuest/temporal_categorization/timestamped_data.py new file mode 100644 index 0000000..7a75488 --- /dev/null +++ b/dataQuest/temporal_categorization/timestamped_data.py @@ -0,0 +1,123 @@ +""" +This module provides classes and utilities for working with data +that includes timestamps. +""" +import json +from datetime import datetime +from pathlib import Path + + +class TimestampedData: + """ + Represents data with a timestamp. + + Attributes: + DATE_FIELD (str): The field name for the timestamp in the data. + _filename (Path): The path to the file containing the data. + _data (dict): The loaded JSON data. + _timestamp (datetime): The timestamp extracted from the data. + + Methods: + __init__(self, filename): Initializes the TimestampedData object. + filename(self) -> Path: Returns the filename path. + _load_data(self): Loads data from the file. + _get_timestamp(self): Extracts the timestamp from the data. + categorize(self): Abstract method for categorizing data by timestamp. + """ + + DATE_FIELD = "Date" + + def __init__(self, filename: Path): + """ + Initializes the TimestampedData object. + + Args: + filename (Path): The path to the file containing the data. + """ + self._filename = filename + self._data = self._load_data() + self._timestamp = self._get_timestamp() + + @property + def filename(self) -> Path: + """ + Returns the filename path. + + Returns: + Path: The filename path. + """ + return self._filename + + def _load_data(self): + """ + Loads data from the file. + + Returns: + dict: The loaded JSON data. + """ + with open(self._filename, 'r', encoding='utf-8') as file: + return json.load(file) + + def data(self): + """ + Returns the json data + + Returns: + dict: The loaded JSON data. + """ + return self._data + + def _get_timestamp(self): + """ + Extracts the timestamp from the data. + + Returns: + datetime: The extracted timestamp. + """ + return datetime.strptime(self._data[self.DATE_FIELD], '%Y-%m-%d') + + def categorize(self): + """ + Abstract method for categorizing data by timestamp. + + Raises: + NotImplementedError: Subclasses must implement categorize method. + """ + raise NotImplementedError("Subclass must implement categorize method") + + +class YearPeriodData(TimestampedData): + """ + Represents data categorized by year. + + Methods: + categorize(self): Categorizes data by year. + """ + + def categorize(self): + """ + Categorizes data by year. + + Returns: + int: The year of the timestamp. + """ + return self._timestamp.year + + +class DecadePeriodData(TimestampedData): + """ + Represents data categorized by decade. + + Methods: + categorize(self): Categorizes data by decade. + """ + + def categorize(self): + """ + Categorizes data by decade. + + Returns: + int: The decade of the timestamp. 
+ """ + year = self._timestamp.year + return (year // 10) * 10 diff --git a/dataQuest/utils.py b/dataQuest/utils.py new file mode 100644 index 0000000..7d8fd65 --- /dev/null +++ b/dataQuest/utils.py @@ -0,0 +1,174 @@ +""" +Module containing utility functions for the project. +""" +import os +from pathlib import Path +from typing import List, Dict, Any, Optional +from functools import cache +import json +import spacy +import spacy.cli +from dataQuest.filter.document_filter import (YearFilter, + TitleFilter, + DocumentFilter) +from dataQuest.filter.document_filter import (CompoundFilter, + DecadeFilter, + KeywordsFilter) +from dataQuest.settings import ENCODING + + +@cache +def load_spacy_model(model_name: str, retry: bool = True) \ + -> Optional[spacy.Language]: + """Load and store a sentencize-only SpaCy model + + Downloads the model if necessary. + + Args: + model_name (str): The name of the SpaCy model to load. + retry (bool, optional): Whether to retry downloading the model + if loading fails initially. Defaults to True. + + Returns: + spacy.Language: The SpaCy model object for the given name. + """ + + try: + nlp = spacy.load(model_name, disable=["tagger", "parser", "ner"]) + except OSError as exc: + if retry: + spacy.cli.download(model_name) + return load_spacy_model(model_name, False) + raise exc + nlp.add_pipe("sentencizer") + return nlp + + +def load_filters_from_config(config_file: Path) -> CompoundFilter: + """Load document filters from a configuration file. + + Args: + config_file (Path): Path to the configuration file containing + filter settings. + + Returns: + CompoundFilter: A compound filter containing individual document + filters loaded from the configuration. + """ + with open(config_file, 'r', encoding=ENCODING) as f: + config: Dict[str, List[Dict[str, Any]]] = json.load(f) + + filters: List[DocumentFilter] = [] + for filter_config in config['filters']: + filter_type = filter_config['type'] + if filter_type == 'TitleFilter': + filters.append(TitleFilter(filter_config['title'])) + elif filter_type == 'YearFilter': + filters.append(YearFilter(filter_config['year'])) + elif filter_type == 'DecadeFilter': + filters.append(DecadeFilter(filter_config['decade'])) + elif filter_type == 'KeywordsFilter': + filters.append(KeywordsFilter(filter_config['keywords'])) + + return CompoundFilter(filters) + + +def get_keywords_from_config(config_file: Path) -> List[str]: + """ + Extract keywords from a JSON configuration file. + + Args: + config_file (Path): The path to the JSON configuration file. + + Returns: + List[str]: The list of keywords extracted from the configuration + file. + + Raises: + FileNotFoundError: If the config file is not found or cannot be + opened. + KeyError: If the required keys are not found in the configuration + file. + TypeError: If the data in the configuration file is not in the + expected format. + """ + try: + with open(config_file, 'r', encoding=ENCODING) as f: + config: Dict[str, List[Dict[str, Any]]] = json.load(f) + + for filter_config in config['filters']: + filter_type = filter_config['type'] + if filter_type == 'KeywordsFilter': + return filter_config['keywords'] + return [] + except FileNotFoundError as exc: + raise FileNotFoundError("Config file not found") from exc + except KeyError as exc: + raise KeyError("Keywords not found in config file") from exc + + +def read_config(config_file: Path, item_key: str) -> Dict[str, str]: + """ + Get the value of the given key item from a JSON file. 
+ + Args: + config_file (Path): The path to the JSON config file. + item_key (str): Key item defined in config file. + Returns: + Dict[str, str]: The article selector configuration. + + Raises: + KeyError: If the key item is not found in the config file. + FileNotFoundError: If the config file is not found. + """ + try: + with open(config_file, 'r', encoding=ENCODING) as f: + config: Dict[str, str] = json.load(f)[item_key] + if not config: + raise ValueError("Config is empty") + return config + except FileNotFoundError as exc: + raise FileNotFoundError("Config file not found") from exc + except KeyError as exc: + raise KeyError("Key item %s not found in config file") from exc + + +def save_filtered_articles(input_file: Any, article_id: str, + output_dir: str) -> None: + """Save filtered articles data to a JSON file. + + Args: + input_file: The input file object. + article_id (str): The ID of the article. + output_dir (str): The directory where the JSON file will be saved. + + Returns: + None + """ + data = { + "file_path": str(input_file.filepath), + "article_id": str(article_id), + "Date": str(input_file.doc().publish_date), + "Title": input_file.doc().title, + } + + output_fp = os.path.join(output_dir, input_file.base_file_name() + '.json') + print('output_fp', output_fp) + with open(output_fp, "w", encoding=ENCODING) as json_file: + json.dump(data, json_file, indent=4) + + +def get_file_name_without_extension(full_path: str) -> str: + """ + Extracts the file name without extension from a full path. + + Args: + full_path (str): The full path of the file. + + Returns: + str: The file name without extension. + + """ + base_name = os.path.basename(full_path) + file_name_without_ext = os.path.splitext(base_name)[0] + return file_name_without_ext diff --git a/scripts/convert_input_files.py b/scripts/convert_input_files.py index b6d2dea..023d152 100644 --- a/scripts/convert_input_files.py +++ b/scripts/convert_input_files.py @@ -1,4 +1,4 @@ -from interest.preprocessor.parser import XMLExtractor +from dataQuest.preprocessor.parser import XMLExtractor from argparse import ArgumentParser from pathlib import Path import logging diff --git a/scripts/step1_filter_articles.py b/scripts/step1_filter_articles.py index 99d59a0..fa638f2 100644 --- a/scripts/step1_filter_articles.py +++ b/scripts/step1_filter_articles.py @@ -9,10 +9,10 @@ from tqdm import tqdm -from interest.filter import INPUT_FILE_TYPES -from interest.filter.input_file import InputFile -from interest.utils import load_filters_from_config -from interest.utils import save_filtered_articles +from dataQuest.filter import INPUT_FILE_TYPES +from dataQuest.filter.input_file import InputFile +from dataQuest.utils import load_filters_from_config +from dataQuest.utils import save_filtered_articles if __name__ == "__main__": parser = argparse.ArgumentParser("Filter articles from input files.") diff --git a/scripts/step2_categorize_by_timestamp.py b/scripts/step2_categorize_by_timestamp.py index 0979642..586eb3d 100644 --- a/scripts/step2_categorize_by_timestamp.py +++ b/scripts/step2_categorize_by_timestamp.py @@ -9,8 +9,8 @@ from pathlib import Path import pandas as pd from tqdm import tqdm # type: ignore -from interest.temporal_categorization import PERIOD_TYPES -from interest.temporal_categorization.timestamped_data import TimestampedData +from dataQuest.temporal_categorization import PERIOD_TYPES +from dataQuest.temporal_categorization.timestamped_data import TimestampedData OUTPUT_FILE_NAME = 'articles' FILENAME_COLUMN = 'file_path' diff 
--git a/scripts/step3_select_final_articles.py b/scripts/step3_select_final_articles.py index 37f723c..82a71f9 100644 --- a/scripts/step3_select_final_articles.py +++ b/scripts/step3_select_final_articles.py @@ -4,9 +4,9 @@ from typing import List from pathlib import Path import pandas as pd -from interest.utils import get_keywords_from_config -from interest.utils import read_config -from interest.article_final_selection.process_articles import select_articles +from dataQuest.utils import get_keywords_from_config +from dataQuest.utils import read_config +from dataQuest.article_final_selection.process_articles import select_articles ARTICLE_SELECTOR_FIELD = "article_selector" diff --git a/scripts/step4_generate_output.py b/scripts/step4_generate_output.py index 161140c..5a62e5a 100644 --- a/scripts/step4_generate_output.py +++ b/scripts/step4_generate_output.py @@ -7,11 +7,11 @@ from typing import Union import pandas as pd from pandas import DataFrame -from interest.settings import SPACY_MODEL -from interest.article_final_selection.process_article import ArticleProcessor -from interest.utils import read_config, get_file_name_without_extension -from interest.output_generator.text_formater import (TextFormatter, - SEGMENTED_TEXT_FORMATTER) +from dataQuest.settings import SPACY_MODEL +from dataQuest.article_final_selection.process_article import ArticleProcessor +from dataQuest.utils import read_config, get_file_name_without_extension +from dataQuest.output_generator.text_formater import (TextFormatter, + SEGMENTED_TEXT_FORMATTER) FILE_PATH_FIELD = "file_path" From bb52e561d73f940703808f6a46f2a021d533adc9 Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Tue, 25 Jun 2024 14:53:47 +0200 Subject: [PATCH 2/4] remove files in old directory --- interest/__init__.py | 0 interest/article_final_selection/__init__.py | 0 .../article_selector.py | 56 ----- .../process_article.py | 94 -------- .../process_articles.py | 102 --------- interest/filter/__init__.py | 7 - interest/filter/delpher_kranten.py | 118 ---------- interest/filter/document.py | 146 ------------ interest/filter/document_filter.py | 208 ------------------ interest/filter/input_file.py | 119 ---------- interest/models/base.py | 20 -- interest/models/tfidf.py | 101 --------- interest/output_generator/text_formater.py | 117 ---------- interest/preprocessor/__init__.py | 1 - interest/preprocessor/parser.py | 207 ----------------- interest/preprocessor/text_cleaner.py | 129 ----------- interest/settings.py | 10 - interest/temporal_categorization/__init__.py | 8 - .../timestamped_data.py | 123 ----------- interest/utils.py | 174 --------------- 20 files changed, 1740 deletions(-) delete mode 100644 interest/__init__.py delete mode 100644 interest/article_final_selection/__init__.py delete mode 100644 interest/article_final_selection/article_selector.py delete mode 100644 interest/article_final_selection/process_article.py delete mode 100644 interest/article_final_selection/process_articles.py delete mode 100644 interest/filter/__init__.py delete mode 100644 interest/filter/delpher_kranten.py delete mode 100644 interest/filter/document.py delete mode 100644 interest/filter/document_filter.py delete mode 100644 interest/filter/input_file.py delete mode 100644 interest/models/base.py delete mode 100644 interest/models/tfidf.py delete mode 100644 interest/output_generator/text_formater.py delete mode 100644 interest/preprocessor/__init__.py delete mode 100644 interest/preprocessor/parser.py delete mode 100644 interest/preprocessor/text_cleaner.py 
delete mode 100644 interest/settings.py delete mode 100644 interest/temporal_categorization/__init__.py delete mode 100644 interest/temporal_categorization/timestamped_data.py delete mode 100644 interest/utils.py diff --git a/interest/__init__.py b/interest/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/interest/article_final_selection/__init__.py b/interest/article_final_selection/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/interest/article_final_selection/article_selector.py b/interest/article_final_selection/article_selector.py deleted file mode 100644 index c94ab9b..0000000 --- a/interest/article_final_selection/article_selector.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Module containing the ArticleSelector class for selecting articles based on -similarity scores.""" - -from typing import List, Dict, Union - - -class ArticleSelector: - """Class for selecting articles based on similarity scores and - configuration parameters.""" - # pylint: disable=too-few-public-methods - - def __init__(self, similarity_scores: List[float], - config: Dict[str, Union[str, float, int]]): - """Initializes the ArticleSelector object. - - Args: - similarity_scores (List[float]): A list of similarity scores - between keywords and articles. - config (Dict[str, Union[str, float, int]]): A dictionary containing - configuration parameters for selecting articles. - """ - self.similarity_scores = similarity_scores - self.config = config - - def select_articles(self) -> List[int]: - """Selects articles based on the configured selection method and value. - - Returns: - List[int]: A list of indices of selected articles. - """ - sorted_indices = sorted( - range(len(self.similarity_scores)), - key=lambda i: self.similarity_scores[i], - reverse=True - ) - - selected_indices: List[int] = [] - if self.config["type"] == "threshold": - threshold = float(self.config["value"]) - selected_indices.extend( - i for i, score in enumerate(self.similarity_scores) - if score >= threshold - ) - elif self.config["type"] == "num_articles": - num_articles = int(self.config["value"]) - selected_indices.extend(sorted_indices[:num_articles]) - - elif self.config["type"] == "percentage": - percentage = float(self.config["value"]) - num_articles = int(len(self.similarity_scores) * - (percentage / 100.0)) - num_articles = len(self.similarity_scores) if num_articles == 0\ - else num_articles - selected_indices.extend(sorted_indices[:num_articles]) - - return selected_indices diff --git a/interest/article_final_selection/process_article.py b/interest/article_final_selection/process_article.py deleted file mode 100644 index 763e166..0000000 --- a/interest/article_final_selection/process_article.py +++ /dev/null @@ -1,94 +0,0 @@ -""" Module for processing articles from gzip files.""" -import gzip -import json -import logging -from typing import List, Union, Tuple -from interest.preprocessor.text_cleaner import TextCleaner - -text_cleaner = TextCleaner() - - -def clean(text: Union[str, List[str]]) -> str: - """ - Clean the input text using TextCleaner. - - Args: - text (str): The input text to clean. - - Returns: - str: The cleaned text. - """ - return text_cleaner.preprocess(text) - -# pylint: disable=too-few-public-methods - - -class ArticleProcessor: - """ - Process individual articles from gzip files. - - This class handles the processing of individual articles from - gzip files. 
- It reads the content of the article, cleans it using TextCleaner, and - determines whether the article contains any keywords of interests in - the title. - """ - def __init__(self, gzip_file_path: str, article_id: int): - """ - Initialize ArticleProcessor with the gzip file path and article ID. - - Args: - gzip_file_path (str): The path to the gzip file. - article_id (int): The ID of the article. - """ - self._file_path = gzip_file_path - self._article_id = article_id - self._title: Union[str, None] = '' - self._body: Union[str, list, None] = '' - self.selected: bool = False - - def read_article_from_gzip(self) -> ( - Tuple)[Union[str, None], Union[List[str], None], Union[str, None]]: - """ - Read article content from a gzip file. - - Returns: - Tuple[Union[str, None], Union[list, None], Union[str, None]]: - A tuple containing the title, body, and date of the article. - """ - try: - with gzip.open(self._file_path, 'rt') as f: - data = json.load(f) - metadata = data.get('newsletter_metadata', {}) - date = metadata.get('date', {}) - articles = data.get('articles', {}) - article = articles.get(str(self._article_id), {}) - title = article.get('title', {}) - body = article.get('body', {}) - return title, body, date - except Exception as e: # pylint: disable=broad-except - logging.error("Error reading article %s from %s: %s", - str(self._article_id), self._file_path, e) - return None, None, None - - def process_article(self, clean_keywords: List[str]) -> str: - """ - Process the article content. - - Args: - clean_keywords (List[str]): A list of clean keywords. - - Returns: - str: The processed article body. - """ - self._title, self._body, _ = self.read_article_from_gzip() - if (self._title is None) or (self._body is None): - return "" - clean_title = clean(self._title) - title_with_keyword = any(keyword in clean_title - for keyword in clean_keywords) - if title_with_keyword: - self.selected = True - return "" - - return clean(self._body) diff --git a/interest/article_final_selection/process_articles.py b/interest/article_final_selection/process_articles.py deleted file mode 100644 index c768271..0000000 --- a/interest/article_final_selection/process_articles.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -This module contains functions for selecting articles based on keywords -and similarity scores. -""" -from typing import List, Tuple, Dict, Union -import pandas as pd -from sklearn.metrics.pairwise import cosine_similarity -from interest.models.tfidf import TfidfEmbedder -from interest.article_final_selection.process_article import ArticleProcessor -from interest.article_final_selection.process_article import clean -from interest.article_final_selection.article_selector import ArticleSelector - - -def process_articles(articles_filepath: str, clean_keywords: List[str]) -> ( - Tuple)[List[str], List[int]]: - """ - Process articles from a CSV file. - - Args: - articles_filepath (str): The path to the CSV file containing articles. - clean_keywords (List[str]): A list of clean keywords. - - Returns: - Tuple[List[str], List[int]]: A tuple containing the processed article - bodies and selected indices. 
- """ - articles_df = pd.read_csv(articles_filepath) - article_bodies: List[str] = [] - selected_indices: List[int] = [] - for index, row in articles_df.iterrows(): - article_processor = ArticleProcessor(row['file_path'], - row['article_id']) - processed_article_body = article_processor.process_article( - clean_keywords) - if article_processor.selected: - selected_indices.append(int(str(index))) - elif processed_article_body != "": - article_bodies.append(processed_article_body) - return article_bodies, selected_indices - - -def apply_tfidf_similarity(documents: List[str], keywords: List[str]) -> ( - List)[float]: - """ - Apply TF-IDF similarity between documents and keywords. - - Args: - documents (List[str]): A list of document bodies. - keywords (List[str]): A list of keywords. - - Returns: - List[float]: A list of similarity scores. - """ - model = TfidfEmbedder(ngram_max=1, norm="l1", sublinear_tf=False, min_df=1, - max_df=1.0) - keywords_list = [" ".join(keywords)] - model.fit(documents) - embeddings_documents = model.transform(documents).tocsr() - embeddings_keywords = model.transform(keywords_list).tocsr() - similarity_scores = cosine_similarity(embeddings_keywords, - embeddings_documents) - return similarity_scores[0] - - -def select_top_articles(similarity_scores: List[float], - config: Dict[str, Union[str, float, int]]) \ - -> List[int]: - """ - Select top articles based on similarity scores and configuration. - - Args: - similarity_scores (List[float]): A list of similarity scores. - config (Dict[str, str]): Configuration for selecting articles. - - Returns: - List[int]: A list of selected article indices. - """ - selector = ArticleSelector(similarity_scores, config) - selected_indices = selector.select_articles() - return selected_indices - - -def select_articles(articles_filepath: str, keywords: List[str], - config: Dict[str, Union[str, float, int]]) -> List[int]: - """ - Select articles based on keywords, similarity scores, and configuration. - - Args: - articles_filepath (str): The path to the CSV file containing articles. - keywords (List[str]): A list of keywords. - config (Dict[str, str]): Configuration for selecting articles. - - Returns: - List[int]: A list of selected article indices. - """ - clean_keywords = [clean(keyword) for keyword in keywords] - article_bodies, selected_indices = process_articles(articles_filepath, - clean_keywords) - similarity_scores = apply_tfidf_similarity(article_bodies, clean_keywords) - indices = select_top_articles(similarity_scores, config) - selected_indices.extend(indices) - return selected_indices diff --git a/interest/filter/__init__.py b/interest/filter/__init__.py deleted file mode 100644 index 5618aa7..0000000 --- a/interest/filter/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""define input-file type""" -from interest.filter.delpher_kranten import KrantenFile - -INPUT_FILE_TYPES = { - "delpher_kranten": KrantenFile - -} diff --git a/interest/filter/delpher_kranten.py b/interest/filter/delpher_kranten.py deleted file mode 100644 index ec2dc85..0000000 --- a/interest/filter/delpher_kranten.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -Delpher Kranten Module - -This module provides classes and functions for handling Delpher Kranten files. -""" - -import json -import logging -import os -from typing import Optional -from interest.filter.document import Document, Article -from interest.filter.input_file import InputFile - - -class KrantenFile(InputFile): - """ - An InputFile implementation for Delpher Kranten. 
- - Input is a zip file which includes one JSON file. The JSON file contains - metadata and articles from one issue of a newspaper. - - Attributes: - METADATA_FIELD (str): The key for metadata field in JSON data. - TITLE_FIELD (str): The key for title field in metadata. - DATE_FIELD (str): The key for date field in metadata. - LANGUAGE_FIELD (str): The key for language field in metadata. - ARTICLES_FIELD (str): The key for articles field in JSON data. - ARTICLE_TITLE_FIELD (str): The key for title field in an article. - ARTICLE_BODY_FIELD (str): The key for body field in an article. - ENCODING (str): The encoding format for reading the file. - - Methods: - read_json(json_file): Read JSON data from a file and parse it into - a Document object. - base_file_name(): Extract the base file name without extension from - the filepath. - doc(): Read the directory and parse the JSON file into a Document - object. - """ - - METADATA_FIELD = "newsletter_metadata" - TITLE_FIELD = "title" - DATE_FIELD = "date" - LANGUAGE_FIELD = "language" - ARTICLES_FIELD = "articles" - ARTICLE_TITLE_FIELD = "title" - ARTICLE_BODY_FIELD = "body" - ENCODING = "utf-8" - - def read_json(self, json_file) -> Optional[Document]: - """ - Read JSON data from a file and parse it into a Document object. - - Args: - json_file: A file object containing JSON data. - - Returns: - Optional[Document]: A Document object parsed from - the JSON data, or None if parsing fails. - """ - try: - json_data = json.load(json_file) - metadata = json_data[self.METADATA_FIELD] - document_title = metadata[self.TITLE_FIELD] - publish_date = metadata[self.DATE_FIELD] - language = metadata[self.LANGUAGE_FIELD] - - articles_data = json_data[self.ARTICLES_FIELD] - - articles = [] - for article_id, article in articles_data.items(): - article_title = article[self.ARTICLE_TITLE_FIELD] - article_body = article[self.ARTICLE_BODY_FIELD] - article = Article(article_id=article_id, title=article_title, - body=article_body) - articles.append(article) - - document = Document(title=document_title, - publish_date=publish_date, - language=language, - articles=articles) - return document - - except (json.JSONDecodeError, KeyError) as e: - logging.error("Error parsing JSON data: %s", e) - return None - - def base_file_name(self) -> str: - """ - Extract the base file name without extension from the filepath. - - Returns: - str: The base file name without extension. - """ - file_name_json = os.path.splitext(os.path.basename(self.filepath))[0] - base_file_name = os.path.splitext(file_name_json)[0] - return base_file_name - - def doc(self) -> Optional[Document]: - """ - Read the directory and parse the JSON file into a Document - object. - - Returns: - Optional[Document]: A Document object parsed from the - JSON data, or None if parsing fails. - """ - try: - logging.info("Reading directory '%s'...", self._filepath) - fh = self.open(encoding=self.ENCODING) - document = self.read_json(fh) - fh.close() - return document - - except OSError as e: - logging.error("Error processing gzip file '%s': %s", - self._filepath, e) - return None diff --git a/interest/filter/document.py b/interest/filter/document.py deleted file mode 100644 index eb3b1d3..0000000 --- a/interest/filter/document.py +++ /dev/null @@ -1,146 +0,0 @@ -# pylint: disable=too-few-public-methods -""" -This module defines the Document class, which represents a document -containing articles. 
-""" -import logging -from typing import Optional, List, Union -from datetime import datetime - - -class Article: - """A class representing an article. - - This class represents an article with an ID, title, and body text. - The body text can be provided as a list - of paragraphs, which will be joined into a single string. - - Attributes: - id (str): The unique identifier of the article. - title (str): The title of the article. - body (str): The body text of the article, represented as - a single string. - """ - def __init__(self, article_id: str, title: str, - body: Union[str, List[str]]) -> None: - """Initialize an Article object with the given ID, title, and body. - - Args: - id (str): The unique identifier of the article. - title (str): The title of the article. - body (Union[str, List[str]): The body text of the article, - provided as a list of paragraphs. - """ - self.id = article_id - self.title = title - if isinstance(body, list): - if any(item is None for item in body): - logging.warning("There is a None value in body") - self.text = "" - else: - article_body = '\n'.join(body) - self.text = article_body - else: - self.text = body - - -class Document: - """ - Represents a document containing articles. - - Args: - title (str): The title of the document. - publish_date (str): The publication date of the document in - the format 'YYYY-MM-DD'. - language (str): The language of the document. - articles (List[Article]): A list of articles included in - the document. - - Attributes: - _title (str): The title of the document. - _publish_date (str): The publication date of the document in - the format 'YYYY-MM-DD'. - _year (Optional[int]): The year of publication, extracted from - publish_date. - _language (str): The language of the document. - _articles (List[Article]): A list of articles included in the - document. - - Properties: - title (str): Getter for the title of the document. - publish_date (str): Getter for the publication date of the - document. - year (Optional[int]): Getter for the year of publication. - decade (Optional[int]): Getter for the decade of publication. - language (str): Getter for the language of the document. - articles (List[Article]): Getter for the list of articles - included in the document. - """ - def __init__(self, title: str, publish_date: str, language: str, - articles: List[Article]) -> None: - self._year: Optional[int] = None - self._articles = articles - self._title = title - self._publish_date = publish_date - self._language = language - - @property - def title(self) -> str: - """ - Getter for the title of the document. - - Returns: - str: The title of the document. - """ - return self._title - - @property - def publish_date(self) -> str: - """ - Getter for the publish_date of the document. - - Returns: - str: The publish_date of the document. - """ - return self._publish_date - - @property - def year(self) -> Optional[int]: - """ - Getter for the year of publication. - - Returns: - Optional[int]: The year of publication extracted - from publish_date, or None if it cannot be determined. - """ - if self._year is not None: - return self._year - try: - date_obj = datetime.strptime(self._publish_date, '%Y-%m-%d') - self._year = date_obj.year - return self._year - except ValueError: - return None - - @property - def decade(self) -> Optional[int]: - """ - Getter for the decade of publication. - - Returns: - Optional[int]: The decade of publication extracted from - publish_date, - or None if it cannot be determined. 
- """ - _ = self.year - return int(self._year / 10) * 10 if self._year is not None else None - - @property - def articles(self) -> List[Article]: - """ - Getter for the list of articles included in the document. - - Returns: - List[Article]: The list of articles included in the document. - """ - return self._articles diff --git a/interest/filter/document_filter.py b/interest/filter/document_filter.py deleted file mode 100644 index 19f5412..0000000 --- a/interest/filter/document_filter.py +++ /dev/null @@ -1,208 +0,0 @@ -""" -Document Filter Module -This module provides classes for filtering documents and articles. -""" -from abc import ABC, abstractmethod -from typing import List -from interest.filter.document import Document, Article - - -class DocumentFilter(ABC): - """ - Abstract base class for document filters. - - Methods: - filter_document(document: Document) -> bool: Abstract method - to filter documents. - filter_article(article: Article) -> bool: Method to filter - articles. - """ - @abstractmethod - def filter_document(self, document: Document) -> bool: - """ - Abstract method to filter documents. - - Args: - document (Document): The document to be filtered. - - Returns: - bool: True if the document passes the filter, - False otherwise. - """ - return NotImplemented - - def filter_article(self, _article: Article) -> bool: - """ - Method to filter articles. - - By default, returns True, allowing all articles to - pass through. - - Args: - _article (Article): The article to be filtered. - - Returns: - bool: True if the article passes the filter, - False otherwise. - """ - return True - - -class TitleFilter(DocumentFilter): - """ - Filter documents by title. - - Attributes: - title (str): The title to filter by. - """ - def __init__(self, title: str): - self.title = title - - def filter_document(self, document: Document) -> bool: - """ - Filter documents by title. - - Args: - document (Document): The document to be filtered. - - Returns: - bool: True if the document's title contains the specified - title, False otherwise. - """ - return self.title in document.title - - -class YearFilter(DocumentFilter): - """ - Filter documents by year. - - Attributes: - year (int): The year to filter by. - """ - def __init__(self, year: int): - self.year = year - - def filter_document(self, document: Document) -> bool: - """ - Filter documents by year. - - Args: - document (Document): The document to be filtered. - - Returns: - bool: True if the document's year matches the specified - year, False otherwise. - """ - return document.year == self.year - - -class DecadeFilter(DocumentFilter): - """ - Filter documents by decade. - - Attributes: - decade (int): The decade to filter by. - """ - def __init__(self, decade: int): - self.decade = decade - - def filter_document(self, document: Document) -> bool: - """ - Filter documents by decade. - - Args: - document (Document): The document to be filtered. - - Returns: - bool: True if the document's decade matches the - specified decade, False otherwise. - """ - return document.decade == self.decade - - -class KeywordsFilter(DocumentFilter): - """ - Filter documents and articles by keywords. - - Attributes: - keywords (List[str]): The list of keywords to filter by. - """ - def __init__(self, keywords: List[str]): - self.keywords = keywords - - def filter_document(self, document: Document) -> bool: - """ - Filter documents by keywords. - - Args: - document (Document): The document to be filtered. - - Returns: - bool: Always returns True. 
- """ - return True - - def filter_article(self, article: Article) -> bool: - """ - Filter articles by keywords. - - Args: - article (Article): The article to be filtered. - - Returns: - bool: True if the article's title or text contains any - of the specified keywords, False otherwise. - """ - return any(keyword in article.title or keyword in article.text for - keyword in self.keywords) - - -class CompoundFilter(DocumentFilter): - """ - Compound filter combining multiple filters. - - Attributes: - filters (List[DocumentFilter]): The list of filters to apply. - """ - def __init__(self, filters: List[DocumentFilter]): - self.filters = filters - - def filter_document(self, document: Document) -> bool: - """ - Filter documents by applying all filters. - - Args: - document (Document): The document to be filtered. - - Returns: - bool: True if the document passes all filters, - False otherwise. - """ - return all(filter_.filter_document(document) - for filter_ in self.filters) - - def filter_article(self, article: Article) -> bool: - """ - Filter articles by applying all filters. - - Args: - article (Article): The article to be filtered. - - Returns: - bool: True if the article passes all filters, - False otherwise. - """ - return all(filter_.filter_article(article) for filter_ in self.filters) - - def include_keyword_filter(self) -> bool: - """ - Check if the compound filter includes a KeywordsFilter. - - Returns: - bool: True if the compound filter includes a - KeywordsFilter, False otherwise. - """ - for filter_ in self.filters: - if isinstance(filter_, KeywordsFilter): - return True - return False diff --git a/interest/filter/input_file.py b/interest/filter/input_file.py deleted file mode 100644 index dcb7504..0000000 --- a/interest/filter/input_file.py +++ /dev/null @@ -1,119 +0,0 @@ -""" -Input File Module -This module provides an abstract class for representing various input files. -""" - -import abc -import gzip -import logging -from pathlib import Path -from typing import Iterable, TextIO, cast, Optional -from interest.filter.document import Document, Article -from interest.filter.document_filter import DocumentFilter - - -class InputFile(abc.ABC): - """ - Abstract class for representing various input files. - - Attributes: - _filepath (Path): The file path of the input file. - - Methods: - __init__(filepath): Initialize the InputFile with a file path. - filepath(): Get the file path of the input file. - base_file_name(): Output a list of documents in the input file. - open(mode, encoding): Open the input file for reading. - articles(): Return all articles for the document found in the - input file. - doc(): Output a list of documents in the input file. - """ - - def __init__(self, filepath: Path) -> None: - """ - Initialize the InputFile with a file path. - - Args: - filepath (Path): The file path of the input file. - """ - self._filepath = filepath - - @property - def filepath(self) -> Path: - """ - Get the file path of the input file. - - Returns: - Path: The file path of the input file. - """ - return self._filepath - - @abc.abstractmethod - def base_file_name(self) -> str: - """ - Output a list of documents in the input file. - - This can be a singleton list if an input file contains only - one document. - - Returns: - str: The base file name without extension. - """ - return NotImplemented - - def open(self, mode: str = "rt", encoding=None) -> TextIO: - """ - Open the input file for reading. - - Args: - mode (str): The file open mode. - encoding: The encoding format. 
- - Returns: - TextIO: A file object for reading the input file. - """ - if self._filepath.suffix.startswith(".gz"): - return cast(TextIO, gzip.open(self._filepath, mode=mode, - encoding=encoding)) - - # Default to text file - return cast(TextIO, open(self._filepath, - mode=mode, encoding=encoding)) - - # pylint: disable=no-member - def articles(self) -> Iterable[Article]: - """ - Return all articles for the document found in the input file. - - Yields: - Article: An article object. - """ - doc = self.doc() - if doc is not None: - yield from doc.articles - else: - logging.error("Document not found or is None for filepath: %s", - self.filepath) - return - - @abc.abstractmethod - def doc(self) -> Optional[Document]: - """ - Output a list of documents in the input file. - - This can be a singleton list if an input file contains only - one document. - - Returns: - Document: A document object. - """ - return NotImplemented - - def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]: - document = self.doc() - if document is not None: - if filter.filter_document(document): - if document.articles is not None: - for article in document.articles: - if filter.filter_article(article): - yield article diff --git a/interest/models/base.py b/interest/models/base.py deleted file mode 100644 index 64832a1..0000000 --- a/interest/models/base.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Base class for document embeddings.""" - -from abc import ABC, abstractmethod -from typing import Union, Sequence -import scipy -from numpy import typing as npt -import numpy as np - - -class BaseEmbedder(ABC): - """Base class for creating document embeddings.""" - - @abstractmethod - def fit(self, documents: Sequence[str]) -> None: - """Train the model on documents.""" - - @abstractmethod - def transform(self, documents: Union[str, Sequence[str]]) -> ( - Union)[scipy.sparse.spmatrix, npt.NDArray[np.float_]]: - """Get the embedding for a document.""" diff --git a/interest/models/tfidf.py b/interest/models/tfidf.py deleted file mode 100644 index c443843..0000000 --- a/interest/models/tfidf.py +++ /dev/null @@ -1,101 +0,0 @@ -"""Sklearn TF-IDF class.""" - -from typing import Sequence, Union, Optional -import warnings - -import scipy -from sklearn.feature_extraction.text import TfidfVectorizer - -from interest.models.base import BaseEmbedder -from interest.utils import load_spacy_model -from interest.settings import SPACY_MODEL - - -class TfidfEmbedder(BaseEmbedder): - # pylint: disable=too-many-instance-attributes - """ - Sklearn TF-IDF class. - - Arguments - --------- - ngram_max: - Maximum n-gram, higher numbers mean bigger embeddings. - norm: - Which kind of normalization is used: "l1", "l2" or None. - sublinear_tf: - Apply sublinear term-frequency scaling. - min_df: - Minimum document frequency of word to be included in the embedding. - max_df: - Maximum document frequency of word to be included in the embedding. 
- """ - - # pylint: disable=too-many-arguments - - def __init__( - self, ngram_max: int = 1, norm: Optional[str] = "l1", - sublinear_tf: bool = False, min_df: int = 1, - max_df: float = 1.0, spacy_model=SPACY_MODEL): - """Initialize the TF-IDF embedder.""" - self.nlp = ( - load_spacy_model(spacy_model) - if isinstance(spacy_model, str) - else spacy_model - ) - self.stopword_list = self.nlp.Defaults.stop_words - self.stop_words = list(self.stopword_list) - self.ngram_max = ngram_max - - self.norm = norm - self.sublinear_tf = sublinear_tf - self.min_df = min_df - self.max_df = max_df - if self.norm == "None": - self.norm = None - - self._model: Optional[TfidfVectorizer] = None - - def fit(self, documents: Sequence[str]) -> None: - """ - Fit the TF-IDF model on the given documents. - - Args: - documents (Sequence[str]): A sequence of document strings. - """ - min_df = min(self.min_df, len(documents)) - max_df = max(min_df/len(documents), self.max_df) - - def _tokenizer(text): - doc = self.nlp(text) - tokens = [token.lemma_.lower() for token in doc - if not token.is_stop and not token.is_punct] - return tokens - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore") - self._model = TfidfVectorizer( - ngram_range=(1, self.ngram_max), - stop_words=self.stop_words, - tokenizer=_tokenizer, # self.stem_tokenizer, - min_df=min_df, - norm=self.norm, - sublinear_tf=self.sublinear_tf, - max_df=max_df) - self._model.fit(documents) - - def transform(self, documents: Union[str, Sequence[str]]) -> Union[ - scipy.sparse.spmatrix]: - """ - Transform the input documents into TF-IDF embeddings. - - Args: - documents (Union[str, Sequence[str]]): A single document string or - a sequence of document strings. - - Returns: - Union[scipy.sparse.spmatrix]: The TF-IDF embeddings of the input - documents. - """ - if self._model is None: - raise ValueError("Fit TF-IDF model before transforming data.") - return self._model.transform(documents).tocsr() diff --git a/interest/output_generator/text_formater.py b/interest/output_generator/text_formater.py deleted file mode 100644 index 93bb85b..0000000 --- a/interest/output_generator/text_formater.py +++ /dev/null @@ -1,117 +0,0 @@ -""" This module defines a TextFormatter class for formatting text based on -specified output units. """ -from typing import List, Union -import logging -from interest.settings import SPACY_MODEL -from interest.utils import load_spacy_model - -PARAGRAPH_FORMATTER = 'paragraph' -FULLTEXT_FORMATTER = 'full_text' -SEGMENTED_TEXT_FORMATTER = 'segmented_text' - - -class TextFormatter: - # pylint: disable=R0903 - """Class for formatting text based on specified output units. """ - - def __init__(self, output_unit: str, sentences_per_segment: int, - spacy_model=SPACY_MODEL): # : Union[str, Language] - """ - Initializes the TextFormatter object. - - Args: - output_unit (str): The type of output unit ('paragraph', - 'full_text', 'segmented_text'). - sentences_per_segment (int): Number of sentences per - segment when output_unit is 'segmented_text'. - spacy_model (Union[str, Language], optional): Spacy model - or model name used for text processing. Defaults to the global - SPACY_MODEL value. 
- """ - self.nlp = ( - load_spacy_model(spacy_model) - if isinstance(spacy_model, str) - else spacy_model - ) - self.sentences_per_segment = sentences_per_segment - self.formatter = output_unit - self.is_fulltext = self._is_fulltext() - self.texts: List[str] = [] - - def format_output(self, texts: Union[None, List[str]]) -> ( - Union)[str, List[str], List[List[str]], None]: - """ - Formats input texts based on the specified output unit. - - Args: - texts (List[str]): List of input texts to be formatted. - - Returns: - Union[str, List[str], List[List[str]]]: Formatted output text - based on the selected output_unit. For 'full_text', returns a - single string. For 'paragraph' and 'segmented_text', returns a - list of segmented text lists. - - Raises: - ValueError: If input 'texts' is not a list of strings. - ValueError: If an unsupported formatter type is specified. - """ - try: - if (not isinstance(texts, list) or (texts is None) or - not all(isinstance(text, str) for text in texts)): - raise ValueError("Input 'texts' must be a list of strings.") - - self.texts = texts - - if self.formatter == PARAGRAPH_FORMATTER: - return self._format_paragraph() - if self.formatter == FULLTEXT_FORMATTER: - return self._format_fulltext() - if self.formatter == SEGMENTED_TEXT_FORMATTER: - return self._format_segmented_text() - - except ValueError as e: - logging.error("Unsupported formatter %s: %s", self.formatter, e) - return None - return None - - def _format_paragraph(self) -> List[str]: - """Formats texts as a single paragraph. - - Returns: - List[List[str]]: List of input texts, segmented in paragraphs. - """ - return self.texts - - def _format_fulltext(self) -> str: - """Formats texts as full text with newline separators. - - Returns: - str: Newline-separated string of input texts. - """ - return '\n'.join(self.texts) - - def _format_segmented_text(self) -> List[List[str]]: - """Formats texts as segmented text based on sentences_per_segment. - - Returns: - List[List[str]]: Flattened list of segmented text strings. - """ - segmented_texts = [] - for text in self.texts: - doc = self.nlp(text) - sentences = [sent.text for sent in doc.sents] - - for i in range(0, len(sentences), self.sentences_per_segment): - segment = sentences[i:i + self.sentences_per_segment] - segmented_texts.append(segment) - - return segmented_texts - - def _is_fulltext(self) -> bool: - """Checks if the formatter type is 'full_text'. - - Returns: - bool: True if formatter is 'full_text', False otherwise. - """ - return self.formatter == FULLTEXT_FORMATTER diff --git a/interest/preprocessor/__init__.py b/interest/preprocessor/__init__.py deleted file mode 100644 index 3cec932..0000000 --- a/interest/preprocessor/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# from interest.preprocessor.parser import XMLExtractor diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py deleted file mode 100644 index efb7dd3..0000000 --- a/interest/preprocessor/parser.py +++ /dev/null @@ -1,207 +0,0 @@ - -import os -import tarfile -import gzip -import json -import xml.etree.ElementTree as ET -from typing import Dict, Union, Any, Optional, List -import logging - - -class XMLExtractor: - """Class for extracting XML content and metadata from nested .tgz files.""" # noqa: E501 - def __init__(self, root_dir: str, output_dir: str): - """ - Initializes the XMLExtractor object. - - Parameters: - root_dir (str): The root directory containing .tgz files. - output_dir (str): The output directory for saving extracted JSON files. 
# noqa: E501 - """ - self.root_dir = root_dir - self.output_dir = output_dir - self.fields = [ - "title", "language", "issuenumber", "date", "identifier", - "temporal", "recordRights", "publisher", "spatial", "source", - "recordIdentifier", "type", "isPartOf" - ] - - def extract_xml_string(self) -> None: - """ - Extracts XML content and metadata from .tgz files in the root directory. # noqa: E501 - """ - for folder_name in os.listdir(self.root_dir): - folder_path = os.path.join(self.root_dir, folder_name) - if not os.path.isdir(folder_path): - continue - if not folder_name.isdigit(): # Exclude in_progress, manifests, and ocr_complete folders and log files. # noqa: E501 - continue - self.process_folder(folder_name, folder_path) - - def process_folder(self, folder_name: str, folder_path: str) -> None: - """ - Processes .tgz files within a folder. - - Parameters: - folder_name (str): Name of the folder being processed. - folder_path (str): Path to the folder being processed. - """ - for tgz_filename in os.listdir(folder_path): - if not tgz_filename.endswith('.tgz'): - continue - tgz_file_path = os.path.join(folder_path, tgz_filename) - base_name = os.path.splitext(tgz_filename)[0] - output_folder = os.path.join(self.output_dir, folder_name) - os.makedirs(output_folder, exist_ok=True) - try: - with tarfile.open(tgz_file_path, "r:gz") as outer_tar: - news_dict = self.process_tar(outer_tar) - except tarfile.TarError as e: - logging.error(f"Error extracting {tgz_filename}: {e}") - continue - output_file = os.path.join(output_folder, f"{base_name}.json.gz") - self.save_as_json_compressed(news_dict, output_file) - # self.save_as_json(news_dict, output_file) - - def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: # noqa: E501 - """ - Processes a .tgz file and extracts XML content and metadata. - - Parameters: - outer_tar (tarfile.TarFile): The .tgz file being processed. - - Returns: - Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. # noqa: E501 - """ - news_dict: Dict[str, Any] = {"newsletter_metadata": {}, "articles": {}} - id = 0 - for entry in outer_tar: - try: - if entry.name.endswith(".xml"): - file = outer_tar.extractfile(entry) - if file is not None: - content = file.read() - xml_content = content.decode('utf-8', 'ignore') - article = self.extract_article(xml_content, entry.name) - id += 1 - news_dict["articles"][id] = article - - elif entry.name.endswith(".gz"): - gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) # noqa: E501 - with outer_tar.extractfile(gz_member) as gz_file: # type: ignore # noqa: E501 - with gzip.open(gz_file, 'rt') as xml_file: - xml_string = xml_file.read() - if isinstance(xml_string, bytes): - xml_string = xml_string.decode('utf-8') - newsletter_metadata = self.extract_meta(xml_string) - news_dict["newsletter_metadata"] = newsletter_metadata # noqa: E501 - else: - continue - except Exception as e: - logging.error(f"Error processing file {entry.name}: {e}") - return news_dict - - @staticmethod - def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: # noqa: E501 - """ - Saves data as compressed JSON using gzip. - - Parameters: - data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. # noqa: E501 - output_file (str): Path to the output JSON file. 
- """ - try: - with gzip.open(output_file, 'wt') as json_file: - json.dump(data, json_file, indent=4) - except Exception as e: - logging.error(f"Error saving compressed JSON to {output_file}: {e}") # noqa: E501 - - # @staticmethod - # def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: # noqa: E501 - # """ - # Saves data as JSON to a specified file. - - # Parameters: - # data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. # noqa: E501 - # output_file (str): Path to the output JSON file. - # """ - # try: - # with open(output_file, 'w') as json_file: - # json.dump(data, json_file, indent=4) - # except Exception as e: - # logging.error(f"Error saving JSON to {output_file}: {e}") - - @staticmethod - def extract_article(xml_content: str, file_name: str) -> Dict[str, Union[str, List[Optional[str]]]]: # noqa: E501 - """ - Extracts article title and body from XML content. - - Parameters: - xml_content (str): XML content of the article. - file_name (str): Name of the XML file. - - Returns: - Dict[Optional[str], list[str]]: A dictionary containing the extracted title and body of the article. - body contains a list of paragraphs. # noqa: E501 - """ - try: - root = ET.fromstring(xml_content) - except ET.ParseError: - logging.error(f"Failed to parse XML from file: {file_name}") - return {} - - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] # noqa: E501 - if len(title_values) > 1: - logging.warning("More than one titles are extracted for the article.") # noqa: E501 - if not title_values: - logging.warning("No title is extracted for the article.") - title = "" - else: - title = title_values[0] if title_values[0] is not None else "" - # title = title_values[0] - - body_values = [element.text for element in root.iter() if element.tag.endswith('p')] # noqa: E501 - if not body_values: - logging.warning("No body is extracted.") - body = [] - # elif len(body_values) > 1: - # logging.warning("There are more than one paragraphs in the article.") # noqa: E501 - # body = ' '.join(body_values) - else: - # body = body_values[0] - body = body_values - - return {"title": title, "body": body} - - def extract_meta(self, xml_string: str) -> Dict[str, Union[str, None]]: - """ - Extracts metadata from XML string. - - Parameters: - xml_string (str): XML string containing metadata. - - Returns: - Dict[str, Union[str, None]]: A dictionary containing the extracted metadata. 
# noqa: E501 - """ - newsletter_metadata: Dict[str, Union[str, None]] = {} - - try: - root = ET.fromstring(xml_string) - except ET.ParseError: - logging.error("Failed to parse XML from file") - return newsletter_metadata - - for field in self.fields: - field_values = [element.text for element in root.iter() if element.tag.endswith(field)] # noqa: E501 - if len(field_values) > 1: - logging.warning(f"More than one {field}s are extracted from metadata.") # noqa: E501 - if not field_values: - logging.warning(f"No {field} is extracted.") - newsletter_metadata[field] = None - else: - filtered_field_values = [value for value in field_values if value is not None] # noqa: E501 - newsletter_metadata[field] = filtered_field_values[0] if field != "spatial" else ", ".join(filtered_field_values) # noqa: E501 - - # newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501 - - return newsletter_metadata diff --git a/interest/preprocessor/text_cleaner.py b/interest/preprocessor/text_cleaner.py deleted file mode 100644 index ca96945..0000000 --- a/interest/preprocessor/text_cleaner.py +++ /dev/null @@ -1,129 +0,0 @@ -""" -This module provides a TextCleaner class for preprocessing text -data using various cleaning techniques. -""" -import re -from typing import Union, List -from interest.settings import SPACY_MODEL -from interest.utils import load_spacy_model - - -def merge_texts_list(text: Union[str, List[str]]) -> str: - """ - Merge a list of texts into a single string by joining them with spaces. - - Args: - text (Union[str, List[str]]): The input text or list of texts to merge. - - Returns: - str: The merged text if input is a list of strings, otherwise returns - the input text unchanged. - """ - if isinstance(text, list): - merged_text = ' '.join(text) - return merged_text - return text - - -class TextCleaner: - """A class for cleaning text data using various preprocessing - techniques.""" - - def __init__(self, spacy_model=SPACY_MODEL) -> None: - """Initialize the TextCleaner instance. - - Args: - spacy_model (str or spacy.Language, optional): The SpaCy - model to use for text processing. - Defaults to the model specified in the settings. - """ - - self.nlp = ( - load_spacy_model(spacy_model) - if isinstance(spacy_model, str) - else spacy_model - ) - self.stopword_list = self.nlp.Defaults.stop_words - self.stopwords = set(self.stopword_list) - self.text = "" - - def get_lower_lemma_tokens(self) -> None: - """ - Get lowercased lemmatized tokens from the text. - - This method processes the text stored in the instance variable - `self.text`,tokenizes it using the SpaCy pipeline `self.nlp`, - and then lemmatizes each token, converting it to lowercase. - Stop words and punctuation tokens are filtered out. 
- """ - doc = self.nlp(self.text) - self.text = " ".join([token.lemma_.lower() for token in doc - if not token.is_stop and not token.is_punct]) - - def get_words(self): - """Tokenize words in the text.""" - doc = self.nlp(self.text) - self.text = " ".join([token.text for token in doc]) - - def lower(self): - """Transform the text to lower case.""" - self.text = self.text.lower() - - def remove_stopwords(self): - """Remove the stopwords from the text.""" - doc = self.nlp(self.text) - self.text = " ".join([token.text for token in doc if token.text - not in self.stopwords]) - - def remove_numeric(self): - """Remove numbers from the text.""" - self.text = re.sub(r'\d+', '', self.text) - - def remove_non_ascii(self): - """Remove non ASCII characters from the text.""" - self.text = re.sub(r'[^\x00-\x7f]', '', self.text) - - def remove_extra_whitespace_tabs(self): - """Remove extra whitespaces and tabs from the text.""" - self.text = re.sub(r'\s+', ' ', self.text) - - def remove_one_char(self): - """Remove single characters from the text.""" - self.text = " ".join([w for w in self.text.split() if len(w) > 1]) - - def keep_standard_chars(self): - """Keep only standard characters in the text.""" - self.text = re.sub(r'[^-0-9\w,. ?!()%/]', '', self.text) - - def preprocess(self, text): - """Preprocess the given text using a series of cleaning steps. - - Args: - text ( List[str]): The text to preprocess. - - Returns: - str: The preprocessed text. - """ - self.text = merge_texts_list(text) - self.get_lower_lemma_tokens() - self.remove_numeric() - self.remove_extra_whitespace_tabs() - self.remove_one_char() - return self.text - - def clean(self, text): - """Clean the given text by removing non-standard characters and - extra whitespace. - - Args: - text (str): The text to clean. - - Returns: - str: The cleaned text. - """ - self.text = merge_texts_list(text) - self.text = text - self.get_words() - self.keep_standard_chars() - self.remove_extra_whitespace_tabs() - return self.text diff --git a/interest/settings.py b/interest/settings.py deleted file mode 100644 index 391f9b4..0000000 --- a/interest/settings.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -Module containing configuration settings for the project. -""" -import os - -SPACY_MODEL = os.getenv("SPACY_MODEL", "nl_core_news_sm") -"""Spacy model to use for sentence splitting.""" - -ENCODING = os.getenv("ENCODING", "utf-8") -"""Encoding used for reading and writing files.""" diff --git a/interest/temporal_categorization/__init__.py b/interest/temporal_categorization/__init__.py deleted file mode 100644 index 60ec91d..0000000 --- a/interest/temporal_categorization/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Mapping from string format descriptions to corresponding classes.""" -from interest.temporal_categorization.timestamped_data \ - import (YearPeriodData, DecadePeriodData) - -PERIOD_TYPES = { - "decade": DecadePeriodData, - "year": YearPeriodData -} diff --git a/interest/temporal_categorization/timestamped_data.py b/interest/temporal_categorization/timestamped_data.py deleted file mode 100644 index 7a75488..0000000 --- a/interest/temporal_categorization/timestamped_data.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -This module provides classes and utilities for working with data -that includes timestamps. -""" -import json -from datetime import datetime -from pathlib import Path - - -class TimestampedData: - """ - Represents data with a timestamp. - - Attributes: - DATE_FIELD (str): The field name for the timestamp in the data. 
- _filename (Path): The path to the file containing the data. - _data (dict): The loaded JSON data. - _timestamp (datetime): The timestamp extracted from the data. - - Methods: - __init__(self, filename): Initializes the TimestampedData object. - filename(self) -> Path: Returns the filename path. - _load_data(self): Loads data from the file. - _get_timestamp(self): Extracts the timestamp from the data. - categorize(self): Abstract method for categorizing data by timestamp. - """ - - DATE_FIELD = "Date" - - def __init__(self, filename: Path): - """ - Initializes the TimestampedData object. - - Args: - filename (Path): The path to the file containing the data. - """ - self._filename = filename - self._data = self._load_data() - self._timestamp = self._get_timestamp() - - @property - def filename(self) -> Path: - """ - Returns the filename path. - - Returns: - Path: The filename path. - """ - return self._filename - - def _load_data(self): - """ - Loads data from the file. - - Returns: - dict: The loaded JSON data. - """ - with open(self._filename, 'r', encoding='utf-8') as file: - return json.load(file) - - def data(self): - """ - Returns the json data - - Returns: - dict: The loaded JSON data. - """ - return self._data - - def _get_timestamp(self): - """ - Extracts the timestamp from the data. - - Returns: - datetime: The extracted timestamp. - """ - return datetime.strptime(self._data[self.DATE_FIELD], '%Y-%m-%d') - - def categorize(self): - """ - Abstract method for categorizing data by timestamp. - - Raises: - NotImplementedError: Subclasses must implement categorize method. - """ - raise NotImplementedError("Subclass must implement categorize method") - - -class YearPeriodData(TimestampedData): - """ - Represents data categorized by year. - - Methods: - categorize(self): Categorizes data by year. - """ - - def categorize(self): - """ - Categorizes data by year. - - Returns: - int: The year of the timestamp. - """ - return self._timestamp.year - - -class DecadePeriodData(TimestampedData): - """ - Represents data categorized by decade. - - Methods: - categorize(self): Categorizes data by decade. - """ - - def categorize(self): - """ - Categorizes data by decade. - - Returns: - int: The decade of the timestamp. - """ - year = self._timestamp.year - return (year // 10) * 10 diff --git a/interest/utils.py b/interest/utils.py deleted file mode 100644 index c601162..0000000 --- a/interest/utils.py +++ /dev/null @@ -1,174 +0,0 @@ -""" -Module containing utility functions for the project. -""" -import os -from pathlib import Path -from typing import List, Dict, Any, Optional -from functools import cache -import json -import spacy -import spacy.cli -from interest.filter.document_filter import (YearFilter, - TitleFilter, - DocumentFilter) -from interest.filter.document_filter import (CompoundFilter, - DecadeFilter, - KeywordsFilter) -from interest.settings import ENCODING - - -@cache -def load_spacy_model(model_name: str, retry: bool = True) \ - -> Optional[spacy.Language]: - """Load and store a sentencize-only SpaCy model - - Downloads the model if necessary. - - Args: - model_name (str): The name of the SpaCy model to load. - retry (bool, optional): Whether to retry downloading the model - if loading fails initially. Defaults to True. - - Returns: - spacy.Language: The SpaCy model object for the given name. 
- """ - - try: - nlp = spacy.load(model_name, disable=["tagger", "parser", "ner"]) - except OSError as exc: - if retry: - spacy.cli.download(model_name) - return load_spacy_model(model_name, False) - raise exc - nlp.add_pipe("sentencizer") - return nlp - - -def load_filters_from_config(config_file: Path) -> CompoundFilter: - """Load document filters from a configuration file. - - Args: - config_file (Path): Path to the configuration file containing - filter settings. - - Returns: - CompoundFilter: A compound filter containing individual document - filters loaded from the configuration. - """ - with open(config_file, 'r', encoding=ENCODING) as f: - config: Dict[str, List[Dict[str, Any]]] = json.load(f) - - filters: List[DocumentFilter] = [] - for filter_config in config['filters']: - filter_type = filter_config['type'] - if filter_type == 'TitleFilter': - filters.append(TitleFilter(filter_config['title'])) - elif filter_type == 'YearFilter': - filters.append(YearFilter(filter_config['year'])) - elif filter_type == 'DecadeFilter': - filters.append(DecadeFilter(filter_config['decade'])) - elif filter_type == 'KeywordsFilter': - filters.append(KeywordsFilter(filter_config['keywords'])) - - return CompoundFilter(filters) - - -def get_keywords_from_config(config_file: Path) -> List[str]: - """ - Extract keywords from a JSON configuration file. - - Args: - config_file (Path): The path to the JSON configuration file. - - Returns: - List[str]: The list of keywords extracted from the configuration - file. - - Raises: - FileNotFoundError: If the config file is not found or cannot be - opened. - KeyError: If the required keys are not found in the configuration - file. - TypeError: If the data in the configuration file is not in the - expected format. - """ - try: - with open(config_file, 'r', encoding=ENCODING) as f: - config: Dict[str, List[Dict[str, Any]]] = json.load(f) - - for filter_config in config['filters']: - filter_type = filter_config['type'] - if filter_type == 'KeywordsFilter': - return filter_config['keywords'] - return [] - except FileNotFoundError as exc: - raise FileNotFoundError("Config file not found") from exc - except KeyError as exc: - raise KeyError("Keywords not found in config file") from exc - - -def read_config(config_file: Path, item_key: str) -> Dict[str, str]: - """ - Get the value of the given key item from a JSON file. - - Args: - config_file (Path): The path to the JSON config file. - item_key (str): Key item defined in config file. - Returns: - Dict[str, str]: The article selector configuration. - - Raises: - KeyError: If the key item is not found in the config file. - FileNotFoundError: If the config file is not found. - """ - try: - with open(config_file, 'r', encoding=ENCODING) as f: - config: Dict[str, str] = json.load(f)[item_key] - if not config: - raise ValueError("Config is empty") - return config - except FileNotFoundError as exc: - raise FileNotFoundError("Config file not found") from exc - except KeyError as exc: - raise KeyError("Key item %s not found in config file") from exc - - -def save_filtered_articles(input_file: Any, article_id: str, - output_dir: str) -> None: - """Save filtered articles data to a JSON file. - - Args: - input_file: The input file object. - article_id (str): The ID of the article. - output_dir (str): The directory where the JSON file will be saved. 
- - Returns: - None - """ - data = { - "file_path": str(input_file.filepath), - "article_id": str(article_id), - "Date": str(input_file.doc().publish_date), - "Title": input_file.doc().title, - } - - output_fp = os.path.join(output_dir, input_file.base_file_name() + '.json') - print('output_fp', output_fp) - with open(output_fp, "w", encoding=ENCODING) as json_file: - json.dump(data, json_file, indent=4) - - -def get_file_name_without_extension(full_path: str) -> str: - """ - Extracts the file name without extension from a full path. - - Args: - full_path (str): The full path of the file. - - Returns: - str: The file name without extension. - - """ - base_name = os.path.basename(full_path) - file_name_without_ext = os.path.splitext(base_name)[0] - return file_name_without_ext From 76b70b7e30fae4c3f586960da5bc302d13581ed5 Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Tue, 25 Jun 2024 16:15:29 +0200 Subject: [PATCH 3/4] modify project name --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 0116b7c..a949991 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# INTEREST +# dataQuest The code in this repository implements a pipeline to extract specific articles from a large corpus. @@ -10,7 +10,7 @@ Articles can be filtered based on individual or multiple features such as title, ## Getting Started Clone this repository to your working station to obtain examples and python scripts: ``` -git clone https://github.com/UtrechtUniversity/historical-news-sentiment.git +git clone https://github.com/UtrechtUniversity/dataQuest.git ``` ### Prerequisites @@ -20,10 +20,10 @@ To install and run this project you need to have the following prerequisites ins ``` ### Installation -#### Option 1 - Install interest package -To run the project, ensure to install the interest package that is part of this project. +#### Option 1 - Install dataQuest package +To run the project, ensure to install the dataQuest package that is part of this project. ``` -pip install interest +pip install dataQuest ``` #### Option 2 - Run from source code If you want to run the scripts without installation you need to: @@ -42,7 +42,7 @@ pip install . On Linux and Mac OS, you might have to set the PYTHONPATH environment variable to point to this directory. ```commandline -export PYTHONPATH="current working directory/historical-news-sentiment:${PYTHONPATH}" +export PYTHONPATH="current working directory/dataQuest:${PYTHONPATH}" ``` ### Built with These packages are automatically installed in the step above: @@ -85,7 +85,7 @@ Below is a snapshot of the JSON file format: In our use case, the harvested KB data is in XML format. We have provided the following script to transform the original data into the expected format. ``` -from interest.preprocessor.parser import XMLExtractor +from dataQuest.preprocessor.parser import XMLExtractor extractor = XMLExtractor(Path(input_dir), Path(output_dir)) extractor.extract_xml_string() @@ -99,9 +99,9 @@ python3 convert_input_files.py --input_dir path/to/raw/xml/data --output_dir pat In order to define a corpus with a new data format you should: -- add a new input_file_type to [INPUT_FILE_TYPES](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/interest/filter/__init__.py) -- implement a class that inherits from [input_file.py](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/interest/filter/input_file.py). -This class is customized to read a new data format. 
In our case-study we defined [delpher_kranten.py](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/interest/filter/delpher_kranten.py). +- add a new input_file_type to [INPUT_FILE_TYPES](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/__init__.py) +- implement a class that inherits from [input_file.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/input_file.py). +This class is customized to read a new data format. In our case-study we defined [delpher_kranten.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/delpher_kranten.py). ### 2. Filtering @@ -144,7 +144,7 @@ The output of this script is a JSON file for each selected article in the follow } ``` ### 3. Categorization by timestamp -The output files generated in the previous step are categorized based on a specified [period-type](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/interest/temporal_categorization/__init__.py), +The output files generated in the previous step are categorized based on a specified [period-type](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/temporal_categorization/__init__.py), such as ```year``` or ```decade```. This categorization is essential for subsequent steps, especially if you intend to apply tf-idf or other models to specific periods. In our case, we applied tf-idf per decade. ```commandline @@ -159,7 +159,7 @@ By utilizing tf-idf, the most relevant articles related to the specified topic ( Before applying tf-idf, articles containing any of the specified keywords in their title are selected. -From the rest of articles, to choose the most relevant ones, you can specify one of the following criteria in [config.py](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/config.json): +From the rest of articles, to choose the most relevant ones, you can specify one of the following criteria in [config.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json): - Percentage of selected articles with the top scores - Maximum number of selected articles with the top scores @@ -192,12 +192,12 @@ From the rest of articles, to choose the most relevant ones, you can specify one The following script, add a new column, ```selected``` to the .csv files from the previous step. ```commandline -python3 scripts/3_select_final_articles.py --input_dir "output/output_timestamped/" +python3 scripts/step3_select_final_articles.py --input-dir "output/output_timestamped/" ``` ### 5. Generate output As the final step of the pipeline, the text of the selected articles is saved in a .csv file, which can be used for manual labeling. The user has the option to choose whether the text should be divided into paragraphs or a segmentation of the text. -This feature can be set in [config.py](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/config.json). +This feature can be set in [config.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json). 
```commandline
 "output_unit": "paragraph"
@@ -211,7 +211,7 @@ OR
 ```
 
 ```commandline
-python3 scripts/step4_generate_output.py --input_dir "output/output_timestamped/" --output-dir "output/output_results/" --glob "*.csv"
+python3 scripts/step4_generate_output.py --input-dir "output/output_timestamped/" --output-dir "output/output_results/" --glob "*.csv"
 ```
 ## About the Project
 **Date**: February 2024
@@ -248,5 +248,5 @@ To contribute:
 
 Pim Huijnen - p.huijnen@uu.nl
 
-Project Link: [https://github.com/UtrechtUniversity/historical-news-sentiment](https://github.com/UtrechtUniversity/historical-news-sentiment)
+Project Link: [https://github.com/UtrechtUniversity/dataQuest](https://github.com/UtrechtUniversity/dataQuest)

From 01d74591750213915b0a348547b3e5805b5e6b7d Mon Sep 17 00:00:00 2001
From: parisa-zahedi
Date: Tue, 25 Jun 2024 16:19:22 +0200
Subject: [PATCH 4/4] change project name

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6878f35..23e1c3d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools", "setuptools-scm"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "interest"
+name = "dataQuest"
 description = "A package to extract historical news sentiments"
 authors = [
     {name = "Shiva Nadi", email = "s.nadi@uu.nl"},
@@ -31,7 +31,7 @@ lint = ["flake8"]
 test = ["pytest", "mypy"]
 
 [tool.setuptools]
-packages = ["interest"]
+packages = ["dataQuest"]
 
 [tool.flake8]
 max-line-length = 99
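With the package renamed in this series, downstream imports change from `interest.*` to `dataQuest.*`. Below is a minimal sketch of post-rename usage, assuming the package has been installed from source (`pip install .`) and using hypothetical input/output directories; the `XMLExtractor` call mirrors the README example patched above.

```python
from pathlib import Path

# The old import path was interest.preprocessor.parser; after this patch
# series the same class lives under the dataQuest package.
from dataQuest.preprocessor.parser import XMLExtractor

# Hypothetical directories -- replace with your own corpus locations.
input_dir = Path("data/raw_xml")
output_dir = Path("data/converted_json")

# Convert the harvested XML files into the compressed JSON format that the
# filtering step of the pipeline expects.
extractor = XMLExtractor(input_dir, output_dir)
extractor.extract_xml_string()
```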
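The utility helpers shown as removed lines earlier in this patch (`read_config`, `get_keywords_from_config`) are re-created under the new package name. A minimal sketch of loading pipeline settings with them follows, assuming they are importable as `dataQuest.utils` after the rename; the `"article_selector"` key is a hypothetical example and should match whatever section your `config.json` actually defines.

```python
from pathlib import Path

# Assumes the helpers removed from the old package above are available under
# the renamed package (dataQuest/utils.py).
from dataQuest.utils import get_keywords_from_config, read_config

config_file = Path("config.json")

# Keywords are taken from the KeywordsFilter entry in the "filters" list.
keywords = get_keywords_from_config(config_file)

# "article_selector" is a hypothetical key name used for illustration;
# read_config simply returns the value stored under the given key.
selector_config = read_config(config_file, "article_selector")

print(keywords, selector_config)
```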