generated from UtrechtUniversity/re-python-package-setuptools
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* define input file classes * define document class * apply pylint * apply flake8 * define filters and read them from config * filter articles * apply pylint, and flake8 * fix flake8 errors * fix mypy errors * rename filter_articles.py * apply reviews * Add .gz format --------- Co-authored-by: parisa-zahedi <p.zahedi@uu.nl> Co-authored-by: Shiva Nadi <s.nadi@uu.nl>
- Loading branch information
1 parent
57235ab
commit 8defb7d
Showing
7 changed files
with
379 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"filters": [ | ||
{ | ||
"type": "KeywordsFilter", | ||
"keywords": ["Article 1","Zweepen","spoorwegpersoneel"] | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
""" | ||
Document Filter Module | ||
This module provides classes for filtering documents and articles. | ||
""" | ||
from abc import ABC, abstractmethod | ||
from typing import List | ||
from interest.document import Document, Article | ||
|
||
|
||
class DocumentFilter(ABC):
    """
    Abstract base class for document filters.

    Subclasses must implement ``filter_document``. ``filter_article`` has a
    permissive default and only needs overriding by filters that inspect
    individual articles.

    Methods:
        filter_document(document: Document) -> bool: Abstract method
            to filter documents.
        filter_article(article: Article) -> bool: Method to filter
            articles.
    """

    @abstractmethod
    def filter_document(self, document: Document) -> bool:
        """
        Abstract method to filter documents.

        Args:
            document (Document): The document to be filtered.

        Returns:
            bool: True if the document passes the filter,
                False otherwise.

        Raises:
            NotImplementedError: Always, when the base implementation is
                invoked (for example through ``super()``).
        """
        # ``NotImplemented`` is the sentinel reserved for binary special
        # methods and is truthy, so returning it here would make the base
        # implementation look like "document passes". Fail loudly instead.
        raise NotImplementedError

    def filter_article(self, _article: Article) -> bool:
        """
        Method to filter articles.

        By default, returns True, allowing all articles to
        pass through.

        Args:
            _article (Article): The article to be filtered (unused by the
                default implementation).

        Returns:
            bool: True if the article passes the filter,
                False otherwise.
        """
        return True
|
||
|
||
class TitleFilter(DocumentFilter):
    """
    Document filter that matches on the document title.

    Attributes:
        title (str): The text that must occur in a document's title.
    """

    def __init__(self, title: str):
        self.title = title

    def filter_document(self, document: Document) -> bool:
        """
        Test whether the configured title occurs in the document's title.

        Args:
            document (Document): The document to be checked; its ``title``
                attribute is read.

        Returns:
            bool: Result of ``self.title in document.title``.
        """
        wanted = self.title
        return wanted in document.title
|
||
|
||
class YearFilter(DocumentFilter):
    """
    Document filter that matches on the document year.

    Attributes:
        year (int): The year a document must have to pass.
    """

    def __init__(self, year: int):
        self.year = year

    def filter_document(self, document: Document) -> bool:
        """
        Compare the document's year with the configured year.

        Args:
            document (Document): The document to be checked; its ``year``
                attribute is read.

        Returns:
            bool: True if ``document.year`` equals ``self.year``,
                False otherwise.
        """
        doc_year = document.year
        return doc_year == self.year
|
||
|
||
class DecadeFilter(DocumentFilter):
    """
    Document filter that matches on the document decade.

    Attributes:
        decade (int): The decade a document must have to pass.
    """

    def __init__(self, decade: int):
        self.decade = decade

    def filter_document(self, document: Document) -> bool:
        """
        Compare the document's decade with the configured decade.

        Args:
            document (Document): The document to be checked; its ``decade``
                attribute is read.

        Returns:
            bool: True if ``document.decade`` equals ``self.decade``,
                False otherwise.
        """
        doc_decade = document.decade
        return doc_decade == self.decade
|
||
|
||
class KeywordsFilter(DocumentFilter):
    """
    Filter that accepts every document and selects articles by keyword.

    Attributes:
        keywords (List[str]): The keywords to look for in articles.
    """

    def __init__(self, keywords: List[str]):
        self.keywords = keywords

    def filter_document(self, document: Document) -> bool:
        """
        Accept the document unconditionally.

        Keyword matching is done per article, so nothing is checked at
        document level.

        Args:
            document (Document): The document to be filtered (not inspected).

        Returns:
            bool: Always True.
        """
        return True

    def filter_article(self, article: Article) -> bool:
        """
        Check whether any keyword occurs in the article's title or text.

        Args:
            article (Article): The article to be checked; its ``title`` and
                ``text`` attributes are read.

        Returns:
            bool: True as soon as one keyword is found in the title or the
                text, False if none is found (including when there are no
                keywords).
        """
        for word in self.keywords:
            # The title is tested first; the text is only consulted when
            # the title does not contain the keyword.
            if word in article.title or word in article.text:
                return True
        return False
|
||
|
||
class CompoundFilter(DocumentFilter):
    """
    Filter that combines several filters; an item must pass all of them.

    Attributes:
        filters (List[DocumentFilter]): The member filters, applied in
            list order.
    """

    def __init__(self, filters: List[DocumentFilter]):
        self.filters = filters

    def filter_document(self, document: Document) -> bool:
        """
        Run the document through every member filter.

        Args:
            document (Document): The document to be filtered.

        Returns:
            bool: False as soon as one member rejects the document,
                True if every member accepts it (or there are no members).
        """
        for member in self.filters:
            if not member.filter_document(document):
                return False
        return True

    def filter_article(self, article: Article) -> bool:
        """
        Run the article through every member filter.

        Args:
            article (Article): The article to be filtered.

        Returns:
            bool: False as soon as one member rejects the article,
                True if every member accepts it (or there are no members).
        """
        for member in self.filters:
            if not member.filter_article(article):
                return False
        return True

    def include_keyword_filter(self) -> bool:
        """
        Report whether one of the member filters is a KeywordsFilter.

        Returns:
            bool: True if at least one member is a KeywordsFilter
                instance, False otherwise.
        """
        return any(isinstance(member, KeywordsFilter)
                   for member in self.filters)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
from interest.document_filter import YearFilter, TitleFilter, DocumentFilter | ||
from interest.document_filter import (CompoundFilter, DecadeFilter, | ||
KeywordsFilter) | ||
# from sklearn.feature_extraction.text import CountVectorizer | ||
import json | ||
from typing import List | ||
# import os | ||
|
||
# def calculate_word_frequency_per_doc(document): | ||
# # Initialize CountVectorizer | ||
# vectorizer = CountVectorizer() | ||
# | ||
# # Fit the vectorizer to the document and transform the document | ||
# # into a word frequency matrix | ||
# word_frequency_matrix = vectorizer.fit_transform([document]) | ||
# | ||
# # Get the vocabulary (list of words) and their corresponding indices | ||
# vocabulary = vectorizer.get_feature_names_out() | ||
# | ||
# # Get the word frequency vector for the document | ||
# word_frequency_vector = word_frequency_matrix.toarray()[0] | ||
# | ||
# # Create a dictionary mapping words to their frequencies | ||
# word_frequency_dict = dict(zip(vocabulary, | ||
# word_frequency_vector.tolist())) | ||
# | ||
# return word_frequency_dict | ||
|
||
|
||
def load_filters_from_config(config_file) -> CompoundFilter:
    """
    Build a CompoundFilter from a JSON configuration file.

    The file must hold an object with a top-level "filters" list. Each
    entry is an object whose "type" names the filter to create and which
    carries the argument for that filter:

        "TitleFilter"    -> "title"
        "YearFilter"     -> "year"
        "DecadeFilter"   -> "decade"
        "KeywordsFilter" -> "keywords"

    Args:
        config_file: Path of the JSON configuration file, as accepted
            by ``open``.

    Returns:
        CompoundFilter: A compound filter wrapping the configured filters
            in the order they appear in the file.

    Raises:
        ValueError: If an entry names a filter type that is not one of
            the four listed above.
        KeyError: If "filters", an entry's "type", or the argument key
            required by that type is missing.
    """
    # Decode explicitly as UTF-8 so that non-ASCII keywords are read the
    # same way on every platform instead of depending on the locale.
    with open(config_file, 'r', encoding='utf-8') as f:
        config = json.load(f)

    filters: List[DocumentFilter] = []
    for filter_config in config['filters']:
        filter_type = filter_config['type']
        if filter_type == 'TitleFilter':
            filters.append(TitleFilter(filter_config['title']))
        elif filter_type == 'YearFilter':
            filters.append(YearFilter(filter_config['year']))
        elif filter_type == 'DecadeFilter':
            filters.append(DecadeFilter(filter_config['decade']))
        elif filter_type == 'KeywordsFilter':
            filters.append(KeywordsFilter(filter_config['keywords']))
        else:
            # A misspelled type used to be dropped silently, which produced
            # a filter that let everything through. Report it instead.
            raise ValueError(f"Unknown filter type: {filter_type!r}")

    return CompoundFilter(filters)
|
||
|
||
# def save_filtered_articles(input_file,article_id,word_freq,output_dir) | ||
# -> None: | ||
# | ||
# data = { | ||
# "file_path": str(input_file.filepath), | ||
# "article_id": str(article_id), | ||
# "Date": str(input_file.doc().publish_date), | ||
# "Title": input_file.doc().title, | ||
# "word_freq": word_freq | ||
# } | ||
# | ||
# output_fp = os.path.join(output_dir, input_file.base_file_name()+'.json') | ||
# print('output_fp',output_fp) | ||
# with open(output_fp, "w") as json_file: | ||
# json.dump(data, json_file, indent=4) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.