Filter (#5)

* define input file classes
* define document class
* apply pylint
* apply flake8
* define filters and read them from config
* filter articles
* apply pylint, and flake8
* fix flake8 errors
* fix mypy errors
* rename filter_articles.py
* apply reviews
* Add .gz format

---------

Co-authored-by: parisa-zahedi <p.zahedi@uu.nl>
Co-authored-by: Shiva Nadi <s.nadi@uu.nl>
UtrechtUniversity · Mar 11, 2024 · 8defb7d · 8defb7d
1 parent 57235ab
commit 8defb7d
Show file tree

Hide file tree

Showing 7 changed files with 379 additions and 12 deletions.
diff --git a/config.json b/config.json
@@ -0,0 +1,8 @@
+{
+  "filters": [
+    {
+      "type": "KeywordsFilter",
+      "keywords": ["Article 1","Zweepen","spoorwegpersoneel"]
+    }
+  ]
+}
diff --git a/interest/document_filter.py b/interest/document_filter.py
@@ -0,0 +1,208 @@
+"""
+Document Filter Module
+This module provides classes for filtering documents and articles.
+"""
+from abc import ABC, abstractmethod
+from typing import List
+from interest.document import Document, Article
+
+
+class DocumentFilter(ABC):
+    """
+    Abstract base class for document filters.
+
+    Subclasses must implement ``filter_document``; ``filter_article``
+    has a permissive default that subclasses may override.
+
+    Methods:
+        filter_document(document: Document) -> bool: Abstract method
+            to filter documents.
+        filter_article(_article: Article) -> bool: Method to filter
+            articles.
+    """
+    @abstractmethod
+    def filter_document(self, document: Document) -> bool:
+        """
+        Abstract method to filter documents.
+
+        Args:
+            document (Document): The document to be filtered.
+
+        Returns:
+            bool: True if the document passes the filter,
+            False otherwise.
+
+        Raises:
+            NotImplementedError: If a subclass delegates to this base
+            implementation instead of supplying its own logic.
+        """
+        # ``NotImplemented`` is the sentinel for binary special methods
+        # and must not be used as a boolean result, so signal the
+        # missing override with the dedicated exception instead.
+        raise NotImplementedError
+
+    def filter_article(self, _article: Article) -> bool:
+        """
+        Method to filter articles.
+
+        By default, returns True, allowing all articles to
+        pass through.
+
+        Args:
+            _article (Article): The article to be filtered. Unused by
+            this default implementation.
+
+        Returns:
+            bool: True if the article passes the filter,
+            False otherwise.
+        """
+        return True
+
+
+class TitleFilter(DocumentFilter):
+    """
+    Document filter that matches on a piece of the title.
+
+    Attributes:
+        title (str): Text that must occur in a document's title.
+    """
+    def __init__(self, title: str):
+        self.title = title
+
+    def filter_document(self, document: Document) -> bool:
+        """
+        Check whether the configured text occurs in the document title.
+
+        Args:
+            document (Document): The document under test.
+
+        Returns:
+            bool: Result of ``self.title in document.title``.
+        """
+        wanted = self.title
+        return wanted in document.title
+
+
+class YearFilter(DocumentFilter):
+    """
+    Document filter that keeps documents from a single year.
+
+    Attributes:
+        year (int): The year a document must have to pass.
+    """
+    def __init__(self, year: int):
+        self.year = year
+
+    def filter_document(self, document: Document) -> bool:
+        """
+        Compare the document's year with the configured year.
+
+        Args:
+            document (Document): The document under test.
+
+        Returns:
+            bool: True when ``document.year`` equals ``self.year``,
+            False otherwise.
+        """
+        document_year = document.year
+        return document_year == self.year
+
+
+class DecadeFilter(DocumentFilter):
+    """
+    Document filter that keeps documents from a single decade.
+
+    Attributes:
+        decade (int): The decade a document must have to pass.
+    """
+    def __init__(self, decade: int):
+        self.decade = decade
+
+    def filter_document(self, document: Document) -> bool:
+        """
+        Compare the document's decade with the configured decade.
+
+        Args:
+            document (Document): The document under test.
+
+        Returns:
+            bool: True when ``document.decade`` equals ``self.decade``,
+            False otherwise.
+        """
+        document_decade = document.decade
+        return document_decade == self.decade
+
+
+class KeywordsFilter(DocumentFilter):
+    """
+    Filter that selects articles mentioning at least one keyword.
+
+    Documents are never rejected by this filter; the keyword check is
+    applied per article.
+
+    Attributes:
+        keywords (List[str]): Keywords to look for.
+    """
+    def __init__(self, keywords: List[str]):
+        self.keywords = keywords
+
+    def filter_document(self, document: Document) -> bool:
+        """
+        Accept every document.
+
+        Args:
+            document (Document): The document under test (not inspected).
+
+        Returns:
+            bool: Always True.
+        """
+        return True
+
+    def filter_article(self, article: Article) -> bool:
+        """
+        Check whether any keyword occurs in the article.
+
+        Args:
+            article (Article): The article under test.
+
+        Returns:
+            bool: True as soon as one keyword is found in the article's
+            title or text; False when none is found (including when
+            the keyword list is empty).
+        """
+        for keyword in self.keywords:
+            if keyword in article.title or keyword in article.text:
+                return True
+        return False
+
+
+class CompoundFilter(DocumentFilter):
+    """
+    Filter that passes an item only if every wrapped filter passes it.
+
+    Attributes:
+        filters (List[DocumentFilter]): The wrapped filters, applied in
+        list order.
+    """
+    def __init__(self, filters: List[DocumentFilter]):
+        self.filters = filters
+
+    def filter_document(self, document: Document) -> bool:
+        """
+        Run the document through every wrapped filter.
+
+        Args:
+            document (Document): The document under test.
+
+        Returns:
+            bool: False at the first filter that rejects the document,
+            True if none rejects it.
+        """
+        for member in self.filters:
+            if not member.filter_document(document):
+                return False
+        return True
+
+    def filter_article(self, article: Article) -> bool:
+        """
+        Run the article through every wrapped filter.
+
+        Args:
+            article (Article): The article under test.
+
+        Returns:
+            bool: False at the first filter that rejects the article,
+            True if none rejects it.
+        """
+        for member in self.filters:
+            if not member.filter_article(article):
+                return False
+        return True
+
+    def include_keyword_filter(self) -> bool:
+        """
+        Report whether a KeywordsFilter is among the wrapped filters.
+
+        Returns:
+            bool: True if at least one wrapped filter is an instance
+            of KeywordsFilter, False otherwise.
+        """
+        return any(isinstance(member, KeywordsFilter)
+                   for member in self.filters)
diff --git a/interest/input_file.py b/interest/input_file.py
@@ -5,12 +5,11 @@
 
 import abc
 import gzip
+import logging
 from pathlib import Path
 from typing import Iterable, TextIO, cast, Optional
 from interest.document import Document, Article
-import logging
-
-# from .document_filter import DocumentFilter
+from interest.document_filter import DocumentFilter
 
 
 class InputFile(abc.ABC):
@@ -110,10 +109,11 @@ def doc(self) -> Optional[Document]:
         """
         return NotImplemented
 
-    # def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]:
-    #     document = self.doc()
-    #     if filter.filter_document(document):
-    #         if document.articles() is not None:
-    #             for article in document.articles():
-    #                 if filter.filter_article(article):
-    #                     yield article
+    def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]:
+        """
+        Yield the articles of this file's document that pass ``filter``.
+
+        Nothing is yielded when ``self.doc()`` returns None, when the
+        document itself is rejected by the filter, or when the document's
+        ``articles`` attribute is None.
+
+        Args:
+            filter (DocumentFilter): Filter applied first to the document
+            and then to each of its articles.
+
+        Yields:
+            Article: Each article accepted by ``filter.filter_article``.
+        """
+        document = self.doc()
+        if document is None or not filter.filter_document(document):
+            return
+        if document.articles is None:
+            return
+        for candidate in document.articles:
+            if filter.filter_article(candidate):
+                yield candidate
diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py
@@ -59,7 +59,7 @@ def process_folder(self, folder_name: str, folder_path: str) -> None:
             except tarfile.TarError as e:
                 logging.error(f"Error extracting {tgz_filename}: {e}")
                 continue
-            output_file = os.path.join(output_folder, f"{base_name}.json")
+            output_file = os.path.join(output_folder, f"{base_name}.json.gz")
             self.save_as_json_compressed(news_dict, output_file)
             # self.save_as_json(news_dict, output_file)
 

diff --git a/interest/utils.py b/interest/utils.py
@@ -0,0 +1,63 @@
+from interest.document_filter import YearFilter, TitleFilter, DocumentFilter
+from interest.document_filter import (CompoundFilter, DecadeFilter,
+                                      KeywordsFilter)
+# from sklearn.feature_extraction.text import CountVectorizer
+import json
+from typing import List
+# import os
+
+# def calculate_word_frequency_per_doc(document):
+#     # Initialize CountVectorizer
+#     vectorizer = CountVectorizer()
+#
+#     # Fit the vectorizer to the document and transform the document
+#     # into a word frequency matrix
+#     word_frequency_matrix = vectorizer.fit_transform([document])
+#
+#     # Get the vocabulary (list of words) and their corresponding indices
+#     vocabulary = vectorizer.get_feature_names_out()
+#
+#     # Get the word frequency vector for the document
+#     word_frequency_vector = word_frequency_matrix.toarray()[0]
+#
+#     # Create a dictionary mapping words to their frequencies
+#     word_frequency_dict = dict(zip(vocabulary,
+#                           word_frequency_vector.tolist()))
+#
+#     return word_frequency_dict
+
+
+def load_filters_from_config(config_file) -> CompoundFilter:
+    """
+    Build a CompoundFilter from a JSON configuration file.
+
+    The file must contain a top-level ``"filters"`` list. Each entry is
+    an object with a ``"type"`` naming the filter class and the single
+    argument that class needs: ``"title"`` for TitleFilter, ``"year"``
+    for YearFilter, ``"decade"`` for DecadeFilter and ``"keywords"``
+    for KeywordsFilter.
+
+    Args:
+        config_file: Path of the JSON configuration file.
+
+    Returns:
+        CompoundFilter: A filter combining all configured filters, in
+        the order they appear in the file.
+
+    Raises:
+        KeyError: If ``"filters"``, ``"type"`` or the argument key of a
+        filter entry is missing.
+        ValueError: If an entry names an unknown filter type.
+    """
+    # Filter type name -> (class to build, config key of its argument).
+    known_filters = {
+        'TitleFilter': (TitleFilter, 'title'),
+        'YearFilter': (YearFilter, 'year'),
+        'DecadeFilter': (DecadeFilter, 'decade'),
+        'KeywordsFilter': (KeywordsFilter, 'keywords'),
+    }
+
+    # JSON is UTF-8 by specification; do not rely on the platform
+    # default encoding, which can mis-decode non-ASCII keywords.
+    with open(config_file, 'r', encoding='utf-8') as f:
+        config = json.load(f)
+
+    filters: List[DocumentFilter] = []
+    for filter_config in config['filters']:
+        filter_type = filter_config['type']
+        if filter_type not in known_filters:
+            # A misspelled type used to be dropped silently, which
+            # disabled the filter without any indication.
+            raise ValueError(
+                f"Unknown filter type {filter_type!r} in {config_file}; "
+                f"expected one of {sorted(known_filters)}")
+        filter_class, argument_key = known_filters[filter_type]
+        filters.append(filter_class(filter_config[argument_key]))
+
+    return CompoundFilter(filters)
+
+
+# def save_filtered_articles(input_file,article_id,word_freq,output_dir)
+# -> None:
+#
+#     data = {
+#         "file_path": str(input_file.filepath),
+#         "article_id": str(article_id),
+#         "Date": str(input_file.doc().publish_date),
+#         "Title": input_file.doc().title,
+#         "word_freq": word_freq
+#     }
+#
+#     output_fp = os.path.join(output_dir, input_file.base_file_name()+'.json')
+#     print('output_fp',output_fp)
+#     with open(output_fp, "w") as json_file:
+#         json.dump(data, json_file, indent=4)
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
 ]
 dynamic = ["version"]
-dependencies = [
+dependencies = ["tqdm"
 ]
 
 [project.optional-dependencies]