Skip to content

Commit

Permalink
Filter (#5)
Browse files Browse the repository at this point in the history
* define input file classes

* define document class

* apply pylint

* apply flake8

* define filters and read them from config

* filter articles

* apply pylint and flake8

* fix flake8 errors

* fix mypy errors

* rename filter_articles.py

* apply reviews

* Add .gz format

---------

Co-authored-by: parisa-zahedi <p.zahedi@uu.nl>
Co-authored-by: Shiva Nadi <s.nadi@uu.nl>
  • Loading branch information
3 people authored Mar 11, 2024
1 parent 57235ab commit 8defb7d
Show file tree
Hide file tree
Showing 7 changed files with 379 additions and 12 deletions.
8 changes: 8 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"filters": [
{
"type": "KeywordsFilter",
"keywords": ["Article 1","Zweepen","spoorwegpersoneel"]
}
]
}
208 changes: 208 additions & 0 deletions interest/document_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
"""
Document Filter Module
This module provides classes for filtering documents and articles.
"""
from abc import ABC, abstractmethod
from typing import List
from interest.document import Document, Article


class DocumentFilter(ABC):
    """
    Abstract base class for document filters.

    Subclasses must implement ``filter_document``. ``filter_article``
    has a permissive default and only needs to be overridden by filters
    that inspect individual articles.

    Methods:
        filter_document(document: Document) -> bool: Abstract method
            to filter documents.
        filter_article(article: Article) -> bool: Method to filter
            articles.
    """
    @abstractmethod
    def filter_document(self, document: Document) -> bool:
        """
        Abstract method to filter documents.

        Args:
            document (Document): The document to be filtered.

        Returns:
            bool: True if the document passes the filter,
                False otherwise.

        Raises:
            NotImplementedError: Always, when this base implementation
                is reached (for example through ``super()``).
        """
        # ``NotImplemented`` is reserved for binary-operator protocols and
        # is truthy (deprecated in boolean context since Python 3.9), so
        # returning it would let an unimplemented filter accept everything.
        raise NotImplementedError(
            f"{type(self).__name__} must implement filter_document()"
        )

    def filter_article(self, _article: Article) -> bool:
        """
        Method to filter articles.

        By default, returns True, allowing all articles to
        pass through.

        Args:
            _article (Article): The article to be filtered (unused by
                the default implementation).

        Returns:
            bool: True if the article passes the filter,
                False otherwise.
        """
        return True


class TitleFilter(DocumentFilter):
    """
    Document filter that matches on the document title.

    Attributes:
        title (str): Text that must occur in a document's title.
    """
    def __init__(self, title: str):
        self.title = title

    def filter_document(self, document: Document) -> bool:
        """
        Check whether the configured title occurs in the document title.

        Args:
            document (Document): The document to be checked.

        Returns:
            bool: True when ``self.title`` is contained in
                ``document.title``, False otherwise.
        """
        wanted = self.title
        return wanted in document.title


class YearFilter(DocumentFilter):
    """
    Document filter that matches on the document year.

    Attributes:
        year (int): The year a document must have to pass.
    """
    def __init__(self, year: int):
        self.year = year

    def filter_document(self, document: Document) -> bool:
        """
        Compare the document's year with the configured year.

        Args:
            document (Document): The document to be checked.

        Returns:
            bool: True when ``document.year`` equals ``self.year``,
                False otherwise.
        """
        document_year = document.year
        return document_year == self.year


class DecadeFilter(DocumentFilter):
    """
    Document filter that matches on the document decade.

    Attributes:
        decade (int): The decade a document must have to pass.
    """
    def __init__(self, decade: int):
        self.decade = decade

    def filter_document(self, document: Document) -> bool:
        """
        Compare the document's decade with the configured decade.

        Args:
            document (Document): The document to be checked.

        Returns:
            bool: True when ``document.decade`` equals ``self.decade``,
                False otherwise.
        """
        document_decade = document.decade
        return document_decade == self.decade


class KeywordsFilter(DocumentFilter):
    """
    Filter that selects articles mentioning at least one keyword.

    Documents are never rejected by this filter; the keyword test is
    applied per article.

    Attributes:
        keywords (List[str]): Keywords to look for.
    """
    def __init__(self, keywords: List[str]):
        self.keywords = keywords

    def filter_document(self, document: Document) -> bool:
        """
        Accept every document.

        Args:
            document (Document): The document to be checked (unused).

        Returns:
            bool: Always True.
        """
        return True

    def filter_article(self, article: Article) -> bool:
        """
        Check whether any keyword occurs in the article.

        Args:
            article (Article): The article to be checked.

        Returns:
            bool: True as soon as one keyword is found in the
                article's title or text, False when none is found.
        """
        for keyword in self.keywords:
            if keyword in article.title or keyword in article.text:
                return True
        return False


class CompoundFilter(DocumentFilter):
    """
    Filter that passes only what every member filter passes.

    Attributes:
        filters (List[DocumentFilter]): The member filters, applied in
            list order.
    """
    def __init__(self, filters: List[DocumentFilter]):
        self.filters = filters

    def filter_document(self, document: Document) -> bool:
        """
        Run the document through every member filter.

        Args:
            document (Document): The document to be checked.

        Returns:
            bool: False at the first member that rejects the
                document, True when all members accept it.
        """
        for member in self.filters:
            if not member.filter_document(document):
                return False
        return True

    def filter_article(self, article: Article) -> bool:
        """
        Run the article through every member filter.

        Args:
            article (Article): The article to be checked.

        Returns:
            bool: False at the first member that rejects the
                article, True when all members accept it.
        """
        for member in self.filters:
            if not member.filter_article(article):
                return False
        return True

    def include_keyword_filter(self) -> bool:
        """
        Report whether a KeywordsFilter is among the member filters.

        Returns:
            bool: True if at least one member is a KeywordsFilter
                instance, False otherwise.
        """
        return any(isinstance(member, KeywordsFilter)
                   for member in self.filters)
20 changes: 10 additions & 10 deletions interest/input_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@

import abc
import gzip
import logging
from pathlib import Path
from typing import Iterable, TextIO, cast, Optional
from interest.document import Document, Article
import logging

# from .document_filter import DocumentFilter
from interest.document_filter import DocumentFilter


class InputFile(abc.ABC):
Expand Down Expand Up @@ -110,10 +109,11 @@ def doc(self) -> Optional[Document]:
"""
return NotImplemented

# def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]:
# document = self.doc()
# if filter.filter_document(document):
# if document.articles() is not None:
# for article in document.articles():
# if filter.filter_article(article):
# yield article
def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]:
    """
    Yield the articles of this file's document that pass the filter.

    Nothing is yielded when ``self.doc()`` returns None, when the
    filter rejects the document, or when the document's ``articles``
    attribute is None.

    Args:
        filter (DocumentFilter): Filter applied first to the document
            and then to each of its articles.

    Yields:
        Article: Each article for which ``filter.filter_article``
            is truthy, in the order the document lists them.
    """
    document = self.doc()
    if document is None or not filter.filter_document(document):
        return
    if document.articles is None:
        return
    for article in document.articles:
        if filter.filter_article(article):
            yield article
2 changes: 1 addition & 1 deletion interest/preprocessor/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def process_folder(self, folder_name: str, folder_path: str) -> None:
except tarfile.TarError as e:
logging.error(f"Error extracting {tgz_filename}: {e}")
continue
output_file = os.path.join(output_folder, f"{base_name}.json")
output_file = os.path.join(output_folder, f"{base_name}.json.gz")
self.save_as_json_compressed(news_dict, output_file)
# self.save_as_json(news_dict, output_file)

Expand Down
63 changes: 63 additions & 0 deletions interest/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from interest.document_filter import YearFilter, TitleFilter, DocumentFilter
from interest.document_filter import (CompoundFilter, DecadeFilter,
KeywordsFilter)
# from sklearn.feature_extraction.text import CountVectorizer
import json
from typing import List
# import os

# def calculate_word_frequency_per_doc(document):
# # Initialize CountVectorizer
# vectorizer = CountVectorizer()
#
# # Fit the vectorizer to the document and transform the document
# # into a word frequency matrix
# word_frequency_matrix = vectorizer.fit_transform([document])
#
# # Get the vocabulary (list of words) and their corresponding indices
# vocabulary = vectorizer.get_feature_names_out()
#
# # Get the word frequency vector for the document
# word_frequency_vector = word_frequency_matrix.toarray()[0]
#
# # Create a dictionary mapping words to their frequencies
# word_frequency_dict = dict(zip(vocabulary,
# word_frequency_vector.tolist()))
#
# return word_frequency_dict


def load_filters_from_config(config_file) -> CompoundFilter:
    """
    Build a CompoundFilter from a JSON configuration file.

    The file must contain a top-level ``"filters"`` list. Each entry is
    an object with a ``"type"`` key naming the filter class and the
    argument key that class needs: ``"title"`` for TitleFilter,
    ``"year"`` for YearFilter, ``"decade"`` for DecadeFilter and
    ``"keywords"`` for KeywordsFilter.

    Args:
        config_file: Path of the JSON configuration file.

    Returns:
        CompoundFilter: A compound filter holding one filter per
            configuration entry, in file order.

    Raises:
        ValueError: If an entry names an unknown filter type.
        KeyError: If ``"filters"``, ``"type"`` or a filter's argument
            key is missing.
    """
    # Read explicitly as UTF-8 so non-ASCII keywords do not depend on
    # the platform's locale encoding.
    with open(config_file, 'r', encoding='utf-8') as f:
        config = json.load(f)

    filters: List[DocumentFilter] = []
    for filter_config in config['filters']:
        filter_type = filter_config['type']
        if filter_type == 'TitleFilter':
            filters.append(TitleFilter(filter_config['title']))
        elif filter_type == 'YearFilter':
            filters.append(YearFilter(filter_config['year']))
        elif filter_type == 'DecadeFilter':
            filters.append(DecadeFilter(filter_config['decade']))
        elif filter_type == 'KeywordsFilter':
            filters.append(KeywordsFilter(filter_config['keywords']))
        else:
            # A misspelt type must not silently disable a filter and
            # widen the selection.
            raise ValueError(f"Unknown filter type: {filter_type!r}")

    return CompoundFilter(filters)


# def save_filtered_articles(input_file,article_id,word_freq,output_dir)
# -> None:
#
# data = {
# "file_path": str(input_file.filepath),
# "article_id": str(article_id),
# "Date": str(input_file.doc().publish_date),
# "Title": input_file.doc().title,
# "word_freq": word_freq
# }
#
# output_fp = os.path.join(output_dir, input_file.base_file_name()+'.json')
# print('output_fp',output_fp)
# with open(output_fp, "w") as json_file:
# json.dump(data, json_file, indent=4)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ classifiers = [
"License :: OSI Approved :: MIT License",
]
dynamic = ["version"]
dependencies = [
dependencies = ["tqdm"
]

[project.optional-dependencies]
Expand Down
Loading

0 comments on commit 8defb7d

Please sign in to comment.