Skip to content

Commit

Permalink
Merge pull request #1 from UtrechtUniversity/parse
Browse files Browse the repository at this point in the history
Parse
  • Loading branch information
parisa-zahedi authored Mar 1, 2024
2 parents b1ed90a + 8fde5fd commit 8abfb9b
Show file tree
Hide file tree
Showing 5 changed files with 226 additions and 2 deletions.
1 change: 1 addition & 0 deletions interest/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from interest.preprocessor.parser import XMLExtractor
1 change: 1 addition & 0 deletions interest/preprocessor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from interest.preprocessor.parser import XMLExtractor
201 changes: 201 additions & 0 deletions interest/preprocessor/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@

import os
import tarfile
import gzip
import json
import xml.etree.ElementTree as ET
from typing import Dict, Union
import logging


class XMLExtractor:
    """Extract article text and newsletter metadata from nested .tgz files.

    The root directory is scanned for numerically named folders. Every
    ``.tgz`` archive found in such a folder is read: each ``.xml`` member is
    parsed as an article and a ``.gz`` member is parsed as the gzip-compressed
    metadata XML. The result for each archive is written as gzip-compressed
    JSON to ``<output_dir>/<folder>/<archive base name>.json``.
    """

    def __init__(
        self,
        root_dir: Union[str, os.PathLike],
        output_dir: Union[str, os.PathLike],
    ):
        """
        Initializes the XMLExtractor object.
        Parameters:
            root_dir (str): The root directory containing .tgz files.
            output_dir (str): The output directory for saving extracted
                JSON files.
        """
        self.root_dir = root_dir
        self.output_dir = output_dir
        # Local (namespace-free) tag names looked up in the metadata XML.
        self.fields = [
            "title", "language", "issuenumber", "date", "identifier",
            "temporal", "recordRights", "publisher", "spatial", "source",
            "recordIdentifier", "type", "isPartOf"
        ]

    def extract_xml_string(self) -> None:
        """
        Extracts XML content and metadata from .tgz files in the root
        directory.
        """
        for folder_name in os.listdir(self.root_dir):
            folder_path = os.path.join(self.root_dir, folder_name)
            if not os.path.isdir(folder_path):
                continue
            # Exclude in_progress, manifests, and ocr_complete folders and
            # log files: only numerically named folders are processed.
            if not folder_name.isdigit():
                continue
            self.process_folder(folder_name, folder_path)

    def process_folder(self, folder_name: str, folder_path: str) -> None:
        """
        Processes .tgz files within a folder.

        An archive that cannot be read (not a tar file, truncated or corrupt
        gzip stream, I/O error) is logged and skipped so that the remaining
        archives are still processed.
        Parameters:
            folder_name (str): Name of the folder being processed.
            folder_path (str): Path to the folder being processed.
        """
        # Local import: zlib is only needed to recognise corrupt streams.
        import zlib

        output_folder = os.path.join(self.output_dir, folder_name)
        for tgz_filename in os.listdir(folder_path):
            if not tgz_filename.endswith('.tgz'):
                continue
            tgz_file_path = os.path.join(folder_path, tgz_filename)
            base_name = os.path.splitext(tgz_filename)[0]
            # Created lazily so folders without archives leave no empty
            # output directory behind.
            os.makedirs(output_folder, exist_ok=True)
            try:
                with tarfile.open(tgz_file_path, "r:gz") as outer_tar:
                    news_dict = self.process_tar(outer_tar)
            except (tarfile.TarError, OSError, EOFError, zlib.error) as e:
                # EOFError / zlib.error are raised by the gzip layer for
                # truncated or corrupt streams and are not TarError
                # subclasses.
                logging.error(f"Error extracting {tgz_filename}: {e}")
                continue
            output_file = os.path.join(output_folder, f"{base_name}.json")
            self.save_as_json_compressed(news_dict, output_file)

    def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]:  # noqa: E501
        """
        Processes a .tgz file and extracts XML content and metadata.

        Members ending in ``.xml`` are parsed as articles and stored under
        consecutive integer ids starting at 1. A member ending in ``.gz`` is
        decompressed and parsed as metadata; the first member that yields
        non-empty metadata is kept. A failure on one member is logged and
        does not stop the processing of the remaining members.
        Parameters:
            outer_tar (tarfile.TarFile): The .tgz file being processed.
        Returns:
            Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A
            dictionary with the keys "newsletter_metadata" and "articles".
        """
        news_dict = {"newsletter_metadata": {}, "articles": {}}
        article_id = 0
        for entry in outer_tar:
            try:
                if entry.name.endswith(".xml"):
                    member_file = outer_tar.extractfile(entry)
                    if member_file is None:  # not a regular file
                        continue
                    with member_file:
                        content = member_file.read()
                    xml_content = content.decode('utf-8', 'ignore')
                    article = self.extract_article(xml_content, entry.name)
                    article_id += 1
                    news_dict["articles"][article_id] = article
                elif entry.name.endswith(".gz"):
                    if news_dict["newsletter_metadata"]:
                        # Metadata already extracted from an earlier member.
                        continue
                    # Read the member being iterated instead of re-scanning
                    # the whole archive for the first .gz member.
                    member_file = outer_tar.extractfile(entry)
                    if member_file is None:
                        continue
                    with member_file, gzip.open(
                        member_file, 'rt', encoding='utf-8'
                    ) as xml_file:
                        xml_string = xml_file.read()
                    news_dict["newsletter_metadata"] = self.extract_meta(
                        xml_string
                    )
            except Exception as e:
                # Best effort: one broken member must not discard the rest.
                logging.error(f"Error processing file {entry.name}: {e}")
        return news_dict

    @staticmethod
    def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None:  # noqa: E501
        """
        Saves data as compressed JSON using gzip.

        Errors while writing are logged, not raised.
        Parameters:
            data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]):
                Data to be saved as JSON.
            output_file (str): Path to the output JSON file.
        """
        try:
            with gzip.open(output_file, 'wt', encoding='utf-8') as json_file:
                json.dump(data, json_file, indent=4)
        except Exception as e:
            logging.error(f"Error saving compressed JSON to {output_file}: {e}")  # noqa: E501

    @staticmethod
    def _texts_for_tag(root: ET.Element, local_name: str) -> list:
        """Return the text of every element whose tag, with any
        ``{namespace}`` prefix removed, equals ``local_name`` exactly."""
        return [
            element.text
            for element in root.iter()
            if isinstance(element.tag, str)
            and element.tag.rsplit('}', 1)[-1] == local_name
        ]

    @staticmethod
    def extract_article(xml_content: str, file_name: str) -> Dict[str, Union[str, list, None]]:  # noqa: E501
        """
        Extracts article title and body from XML content.

        Tags are compared by their exact local name, so e.g. ``subtitle`` is
        not treated as a title and ``group`` is not treated as a paragraph.
        Parameters:
            xml_content (str): XML content of the article.
            file_name (str): Name of the XML file.
        Returns:
            Dict[str, Union[str, list, None]]: A dictionary with "title" (text
            of the first title element, or None) and "body" (list with the
            text of every paragraph element, or None). An empty dictionary is
            returned when the XML cannot be parsed.
        """
        try:
            root = ET.fromstring(xml_content)
        except ET.ParseError:
            logging.error(f"Failed to parse XML from file: {file_name}")
            return {}

        title_values = XMLExtractor._texts_for_tag(root, 'title')
        if len(title_values) > 1:
            logging.warning("More than one titles are extracted for the article.")  # noqa: E501
        if not title_values:
            logging.warning("No title is extracted for the article.")
            title = None
        else:
            title = title_values[0]

        body_values = XMLExtractor._texts_for_tag(root, 'p')
        if not body_values:
            logging.warning("No body is extracted.")
            body = None
        else:
            # All paragraphs are kept, one list item per paragraph.
            body = body_values

        return {"title": title, "body": body}

    def extract_meta(self, xml_string: str) -> Dict[str, Union[str, None]]:
        """
        Extracts metadata from XML string.

        For every name in ``self.fields`` the first element with that exact
        local tag name provides the value; for "spatial" the non-empty values
        of all matching elements are joined with ", ".
        Parameters:
            xml_string (str): XML string containing metadata.
        Returns:
            Dict[str, Union[str, None]]: A dictionary containing the extracted
            metadata (None for fields that are not present). An empty
            dictionary is returned when the XML cannot be parsed.
        """
        newsletter_metadata: Dict[str, Union[str, None]] = {}

        try:
            root = ET.fromstring(xml_string)
        except ET.ParseError:
            logging.error("Failed to parse XML from file")
            return newsletter_metadata

        for field in self.fields:
            field_values = self._texts_for_tag(root, field)
            if len(field_values) > 1:
                logging.warning(f"More than one {field}s are extracted from metadata.")  # noqa: E501
            if not field_values:
                logging.warning(f"No {field} is extracted.")
                newsletter_metadata[field] = None
            elif field == "spatial":
                # Empty elements have text None, which str.join rejects.
                newsletter_metadata[field] = ", ".join(
                    value for value in field_values if value is not None
                )
            else:
                newsletter_metadata[field] = field_values[0]

        return newsletter_metadata

3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ description = "A package to extract historical news sentiments"
authors = [
{name = "Shiva Nadi", email = "s.nadi@uu.nl"},
{name = "Parisa Zahedi", email = "p.zahedi@uu.nl"},
{name = "Matty Vermet", email = "m.s.vermet@uu.nl"}
]
readme = "README.md"
requires-python = ">=3.8"
Expand All @@ -23,8 +24,6 @@ classifiers = [
]
dynamic = ["version"]
dependencies = [
# "numpy ~= 1.23.4",
# "scikit-learn ~= 0.19.1",
]

[project.optional-dependencies]
Expand Down
22 changes: 22 additions & 0 deletions scripts/convert_input_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from interest.preprocessor.parser import XMLExtractor
from argparse import ArgumentParser
from pathlib import Path
import logging


# Configure logging for this script run: records of level DEBUG and above are
# written to the file 'extractor.log' (a relative path).
logging.basicConfig(filename='extractor.log', level=logging.DEBUG)



def parse_arguments():
    """Parse the command line of the conversion script.

    Both ``--input_dir`` and ``--output_dir`` are mandatory options.

    Returns:
        argparse.Namespace: Namespace with the attributes ``input_dir`` and
        ``output_dir`` as given on the command line.
    """
    cli = ArgumentParser(
        description="Convert nested gzip files to compressed json",
        prog="convert_input_files.py",
    )
    for option in ("--input_dir", "--output_dir"):
        cli.add_argument(option, required=True)
    return cli.parse_args()

if __name__ == "__main__":
    # Script entry point: read the two directories from the command line and
    # run the extraction over the whole input tree.
    cli_args = parse_arguments()
    source_root = Path(cli_args.input_dir)
    target_root = Path(cli_args.output_dir)
    XMLExtractor(source_root, target_root).extract_xml_string()

0 comments on commit 8abfb9b

Please sign in to comment.