Skip to content

Commit

Permalink
Create JsonTransformer class
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
* A generic JSON format class is needed as a base class for JSON metadata formats such as Aardvark

How this addresses that need:
* Create JsonTransformer class
* Rename write_timdex_records_to_json to write_timdex_records_to_json_file
* Shift write_deleted_records_to_file from helpers module to Transformer method and rename to write_deleted_records_to_txt_file
* Shift CLI codeblock to Transformer.write_output_files method
* Add corresponding unit tests for write_output_files method

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-83
  • Loading branch information
ehanson8 committed Dec 1, 2023
1 parent c214bfe commit 7519db8
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 36 deletions.
25 changes: 25 additions & 0 deletions tests/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,31 @@ def test_xmltransformer_iterates_successfully_if_get_optional_fields_returns_non
assert len(output_records.deleted_records) == 1


def test_xmltransformer__write_output_files_writes_timdex_records_and_deleted_files(
    tmp_path, oai_pmh_records
):
    """_write_output_files writes both the JSON records file and the deleted
    records txt file when deleted records are present in the source records."""
    output_file = str(tmp_path / "output_file.json")
    transformer = XmlTransformer("cool-repo", oai_pmh_records)
    transformer._write_output_files(output_file)
    # Path.iterdir() yields entries in arbitrary, filesystem-dependent order,
    # so sort the names before asserting to keep this test deterministic.
    output_file_names = sorted(path.name for path in tmp_path.iterdir())
    assert output_file_names == ["output_file.json", "output_file.txt"]


def test_xmltransformer__write_output_files_no_deleted_records_file_if_not_needed(
    tmp_path,
):
    """_write_output_files writes only the JSON records file when the source
    records contain no deleted records (datacite fixture expected to have none)."""
    datacite_records = XmlTransformer.parse_source_file(
        "tests/fixtures/datacite/datacite_records.xml"
    )
    transformer = XmlTransformer("cool-repo", datacite_records)
    transformer._write_output_files(str(tmp_path / "output_file.json"))
    written_files = list(tmp_path.iterdir())
    assert len(written_files) == 1
    assert written_files[0].name == "output_file.json"


def test_xmltransformer_parse_source_file_returns_record_iterator():
records = XmlTransformer.parse_source_file(
"tests/fixtures/datacite/datacite_records.xml"
Expand Down
10 changes: 1 addition & 9 deletions transmogrifier/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import click

from transmogrifier.config import SOURCES, configure_logger, configure_sentry
from transmogrifier.helpers import write_deleted_records_to_file
from transmogrifier.sources.transformer import Transformer

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -42,14 +41,7 @@ def main(source, input_file, output_file, verbose):
logger.info("Running transform for source %s", source)

transformer = Transformer.load(source, input_file)
transformer.write_timdex_records_to_json(output_file)
if transformer.processed_record_count == 0:
raise ValueError("No records processed from input file, needs investigation")
if deleted_records := transformer.deleted_records:
deleted_output_file = output_file.replace("index", "delete").replace(
"json", "txt"
)
write_deleted_records_to_file(deleted_records, deleted_output_file)
transformer._write_output_files(output_file)
logger.info(
(
"Completed transform, total records processed: %d, "
Expand Down
8 changes: 0 additions & 8 deletions transmogrifier/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from datetime import datetime
from typing import Optional

from smart_open import open

from transmogrifier.config import DATE_FORMATS

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -140,12 +138,6 @@ def validate_date_range(
return False


def write_deleted_records_to_file(deleted_records: list[str], output_file_path: str):
    """Write each deleted record ID to the output file, one ID per line.

    Args:
        deleted_records: Record IDs flagged as deleted.
        output_file_path: Path of the file to write.
    """
    with open(output_file_path, "w") as file:
        file.writelines(f"{record_id}\n" for record_id in deleted_records)


class DeletedRecord(Exception):
"""Exception raised for records with a deleted status.
Expand Down
230 changes: 211 additions & 19 deletions transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,20 @@ def __next__(self) -> TimdexRecord:
continue

@final
def write_timdex_records_to_json(self, output_file: str) -> int:
def _write_output_files(self, output_file: str) -> None:
self.write_timdex_records_to_json_file(output_file)
if self.processed_record_count == 0:
raise ValueError(
"No records processed from input file, needs investigation"
)
if deleted_records := self.deleted_records:
deleted_output_file = output_file.replace("index", "delete").replace(
"json", "txt"
)
self.write_deleted_records_to_txt_file(deleted_records, deleted_output_file)

@final
def write_timdex_records_to_json_file(self, output_file: str) -> int:
"""
Write TIMDEX records to JSON file.
Expand Down Expand Up @@ -106,6 +119,15 @@ def write_timdex_records_to_json(self, output_file: str) -> int:
file.write("\n]")
return count

@final
@staticmethod
def write_deleted_records_to_txt_file(
deleted_records: list[str], output_file_path: str
):
with open(output_file_path, "w") as file:
for record_id in deleted_records:
file.write(f"{record_id}\n")

@final
@classmethod
def load(cls, source: str, source_file: str) -> Transformer:
Expand Down Expand Up @@ -293,41 +315,66 @@ def get_optional_fields(self, source_record: JSON | Tag) -> Optional[dict]:
return {}


class JsonTransformer(Transformer):
    """JSON transformer class."""

    @final
    @classmethod
    def parse_source_file(cls, source_file: str) -> Iterator[JSON]:
        """
        Parse JSON file and return source records as JSON objects via an iterator.

        May not be overridden.

        Args:
            source_file: A file containing source records to be transformed.
        """
        # NOTE(review): the source file is expected to contain a top-level JSON
        # array of record objects; json.load materializes the whole array and
        # records are then yielded one at a time. The diff-rendered original
        # still interleaved the replaced XML iterparse lines here; this is the
        # clean JSON implementation.
        with open(source_file, "rb") as file:
            yield from json.load(file)

@final
def get_required_fields(self, source_record: Tag) -> dict:
def transform(self, source_record: JSON) -> Optional[TimdexRecord]:
"""
Get required TIMDEX fields from an XML record.
Transform a JSON record into a TIMDEX record.
May not be overridden.
Args:
source_record: A BeautifulSoup Tag representing a single source record.
source_record: A JSON object representing a source record.
"""
if self.record_is_deleted(source_record):
source_record_id = self.get_source_record_id(source_record)
timdex_record_id = self.get_timdex_record_id(
self.source, source_record_id, source_record
)
raise DeletedRecord(timdex_record_id)
optional_fields = self.get_optional_fields(source_record)
if optional_fields is None:
return None
else:
fields = {
**self.get_required_fields(source_record),
**optional_fields,
}

# If citation field was not present, generate citation from other fields
if fields.get("citation") is None:
fields["citation"] = generate_citation(fields)
if fields.get("content_type") is None:
fields["content_type"] = ["Not specified"]

return TimdexRecord(**fields)

@final
def get_required_fields(self, source_record: JSON) -> dict:
"""
Get required TIMDEX fields from an JSON record.
May not be overridden.
Args:
source_record: A JSON object representing a source record.
"""
source_record_id = self.get_source_record_id(source_record)

Expand All @@ -347,6 +394,123 @@ def get_required_fields(self, source_record: Tag) -> dict:
"title": title,
}

@classmethod
@abstractmethod
def get_main_titles(cls, source_record: JSON) -> list[str]:
"""
Retrieve main title(s) from a JSON record.
Must be overridden by source subclasses.
Args:
source_record: A JSON object representing a source record.
"""
pass

@classmethod
def get_source_link(
cls, source_base_url: str, source_record_id: str, source_record: JSON
) -> str:
"""
Class method to set the source link for the item.
May be overridden by source subclasses if needed.
Default behavior is to concatenate the source base URL + source record id.
Args:
source_base_url: Source base URL.
source_record_id: Record identifier for the source record.
source_record: A JSON object representing a source record.
- not used by default implementation, but could be useful for subclass
overrides
"""
return source_base_url + source_record_id

@classmethod
def get_timdex_record_id(
cls, source: str, source_record_id: str, source_record: JSON
) -> str:
"""
Class method to set the TIMDEX record id.
May be overridden by source subclasses if needed.
Default behavior is to concatenate the source name + source record id.
Args:
source: Source name.
source_record_id: Record identifier for the source record.
source_record: A JSON object representing a source record.
- not used by default implementation, but could be useful for subclass
overrides
"""
return f"{source}:{source_record_id.replace('/', '-')}"

@classmethod
@abstractmethod
def get_source_record_id(cls, source_record: JSON) -> str:
"""
Get or generate a source record ID from a JSON record.
May be overridden by source subclasses if needed.
Args:
source_record: A JSON object representing a source record.
"""
pass

@classmethod
@abstractmethod
def record_is_deleted(cls, source_record: JSON) -> bool:
"""
Determine whether record has a status of deleted.
May be overridden by source subclasses if needed.
Args:
source_record: A JSON object representing a source record.
"""
pass

def get_optional_fields(self, source_record: JSON) -> Optional[dict]:
"""
Retrieve optional TIMDEX fields from a JSON record.
May be overridden by source subclasses.
Args:
source_record: A JSON object representing a source record.
"""
return {}


class XmlTransformer(Transformer):
    """XML transformer class."""

    @final
    @classmethod
    def parse_source_file(cls, source_file: str) -> Iterator[Tag]:
        """
        Parse XML file and return source records as bs4 Tags via an iterator.

        May not be overridden.

        Args:
            source_file: A file containing source records to be transformed.
        """
        # NOTE(review): `open` may be rebound at module scope (e.g. smart_open)
        # — file-level imports are outside this view; confirm before assuming
        # local-filesystem-only paths.
        with open(source_file, "rb") as file:
            # Stream <record> elements in any namespace ("{*}record") so the
            # whole document is never held in memory at once; recover=True lets
            # lxml continue past recoverable parse errors in the input.
            for _, element in etree.iterparse(
                file,
                tag="{*}record",
                encoding="utf-8",
                recover=True,
            ):
                # Re-serialize the element and re-parse with BeautifulSoup so
                # consumers get a bs4 Tag rather than an lxml element.
                record_string = etree.tostring(element, encoding="utf-8")
                record = BeautifulSoup(record_string, "xml")
                yield record
                # Clear the element after the consumer has processed it, to
                # keep iterparse memory usage bounded on large files.
                element.clear()

@final
def transform(self, source_record: Tag) -> Optional[TimdexRecord]:
"""
Expand Down Expand Up @@ -380,6 +544,34 @@ def transform(self, source_record: Tag) -> Optional[TimdexRecord]:

return TimdexRecord(**fields)

@final
def get_required_fields(self, source_record: Tag) -> dict:
"""
Get required TIMDEX fields from an XML record.
May not be overridden.
Args:
source_record: A BeautifulSoup Tag representing a single source record.
"""
source_record_id = self.get_source_record_id(source_record)

# run methods to generate required fields
source_link = self.get_source_link(
self.source_base_url, source_record_id, source_record
)
timdex_record_id = self.get_timdex_record_id(
self.source, source_record_id, source_record
)
title = self.get_valid_title(source_record_id, source_record)

return {
"source": self.source_name,
"source_link": source_link,
"timdex_record_id": timdex_record_id,
"title": title,
}

@classmethod
def get_main_titles(cls, source_record: Tag) -> list[Tag]:
"""
Expand All @@ -388,7 +580,7 @@ def get_main_titles(cls, source_record: Tag) -> list[Tag]:
May be overridden by source subclasses.
Args:
source_record: A BeautifulSoup Tag representing a single XML record
source_record: A BeautifulSoup Tag representing a single XML record.
"""
return []

Expand Down Expand Up @@ -440,7 +632,7 @@ def get_source_record_id(cls, source_record: Tag) -> str:
May be overridden by source subclasses if needed.
Args:
source_record: A BeautifulSoup Tag representing a single XML record
source_record: A BeautifulSoup Tag representing a single XML record.
"""
return str(source_record.header.find("identifier").string)

Expand Down

0 comments on commit 7519db8

Please sign in to comment.