Skip to content

Commit

Permalink
Updates based on further discussion in #56
Browse files Browse the repository at this point in the history
* Shift build_bitstream_dict, match_bitstreams_to_item_identifiers, and match_item_identifiers_to_bitstreams to BaseWorkflow private methods
  • Loading branch information
ehanson8 committed Jan 7, 2025
1 parent 0c860b0 commit 059a282
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 101 deletions.
58 changes: 0 additions & 58 deletions dsc/utilities/__init__.py

This file was deleted.

65 changes: 57 additions & 8 deletions dsc/workflows/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import logging
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import TYPE_CHECKING, Any, final

from dsc.exceptions import (
Expand All @@ -11,13 +12,10 @@
ItemMetadatMissingRequiredFieldError,
)
from dsc.item_submission import ItemSubmission
from dsc.utilities import (
build_bitstream_dict,
match_bitstreams_to_item_identifiers,
match_item_identifiers_to_bitstreams,
)
from dsc.utilities.aws.s3 import S3Client

if TYPE_CHECKING:
from _collections_abc import dict_keys
from collections.abc import Iterator

from mypy_boto3_sqs.type_defs import SendMessageResultTypeDef
Expand Down Expand Up @@ -103,7 +101,7 @@ def reconcile_bitstreams_and_metadata(self) -> tuple[set[str], set[str]]:
without bitstreams. Any discrepancies will be addressed by the engineer and
stakeholders as necessary.
"""
bitstream_dict = build_bitstream_dict(self.s3_bucket, self.batch_path)
bitstream_dict = self._build_bitstream_dict()

# extract item identifiers from batch metadata
item_identifiers = [
Expand All @@ -112,17 +110,68 @@ def reconcile_bitstreams_and_metadata(self) -> tuple[set[str], set[str]]:
]

# reconcile item identifiers against bitstreams
item_identifier_matches = match_item_identifiers_to_bitstreams(
item_identifier_matches = self._match_item_identifiers_to_bitstreams(
bitstream_dict.keys(), item_identifiers
)
file_matches = match_bitstreams_to_item_identifiers(
file_matches = self._match_bitstreams_to_item_identifiers(
bitstream_dict.keys(), item_identifiers
)
logger.info(f"Item identifiers and bitstreams matched: {item_identifier_matches}")
no_bitstreams = set(item_identifiers) - set(item_identifier_matches)
no_item_identifiers = set(bitstream_dict.keys()) - set(file_matches)
return no_bitstreams, no_item_identifiers

def _build_bitstream_dict(self) -> dict:
    """Group the batch's potential bitstreams by item identifier.

    Every file yielded by ``S3Client.files_iter`` for ``self.s3_bucket`` under
    the ``self.batch_path`` prefix is keyed on the portion of its file name that
    precedes the first underscore. A file name with no underscore is used as the
    key in full. Files sharing an identifier are collected in the same list, in
    the order they were yielded.

    Returns:
        The ``defaultdict(list)`` mapping each item identifier to the files
        that belong to it.
    """
    grouped: dict[str, list[str]] = defaultdict(list)
    listing = S3Client().files_iter(bucket=self.s3_bucket, prefix=self.batch_path)
    for s3_file in listing:
        # Only the final path segment carries the identifier.
        base_name = s3_file.rpartition("/")[2]
        # partition() hands back the whole name when there is no underscore.
        identifier = base_name.partition("_")[0]
        grouped[identifier].append(s3_file)
    return grouped

def _match_bitstreams_to_item_identifiers(
self, bitstreams: dict_keys, item_identifiers: list[str]
) -> list[str]:
"""Create list of bitstreams matched to item identifiers.
Args:
bitstreams: A dict of S3 files with base file IDs and full URIs.
item_identifiers: A list of item identifiers retrieved from the batch
metadata.
"""
return [
file_id
for item_identifier in item_identifiers
for file_id in bitstreams
if file_id == item_identifier
]

def _match_item_identifiers_to_bitstreams(
self, bitstreams: dict_keys, item_identifiers: list[str]
) -> list[str]:
"""Create list of item identifers matched to bitstreams.
Args:
bitstreams: A dict of S3 files with base file IDs and full URIs.
item_identifiers: A list of item identifiers retrieved from the batch
metadata.
"""
return [
item_identifier
for file_id in bitstreams
for item_identifier in item_identifiers
if file_id == item_identifier
]

@final
def run(self) -> Iterator[SendMessageResultTypeDef]:
"""Run workflow to submit items to the DSpace Submission Service."""
Expand Down
36 changes: 36 additions & 0 deletions tests/test_base_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,42 @@ def test_base_workflow_reconcile_bitstreams_and_metadata_success(
assert "Item identifiers and bitstreams matched: ['123']" in caplog.text


def test_build_bitstream_dict_success(mocked_s3, s3_client, base_workflow_instance):
    """Files in the batch are grouped under the identifier before the underscore."""
    uploaded_keys = (
        "test/batch-aaa/123_01.pdf",
        "test/batch-aaa/123_02.pdf",
        "test/batch-aaa/456_01.pdf",
        "test/batch-aaa/789_01.jpg",
    )
    for uploaded_key in uploaded_keys:
        s3_client.put_file(file_content="", bucket="dsc", key=uploaded_key)

    expected = {
        "123": ["test/batch-aaa/123_01.pdf", "test/batch-aaa/123_02.pdf"],
        "456": ["test/batch-aaa/456_01.pdf"],
        "789": ["test/batch-aaa/789_01.jpg"],
    }
    assert base_workflow_instance._build_bitstream_dict() == expected  # noqa: SLF001


def test_match_item_identifiers_to_bitstreams_success(base_workflow_instance):
    """Only the identifier that has a bitstream key is reported as matched."""
    bitstream_keys = {"test": "test_01.pdf"}.keys()
    metadata_identifiers = ["test", "tast"]
    matched = base_workflow_instance._match_item_identifiers_to_bitstreams(  # noqa: SLF001
        bitstream_keys, metadata_identifiers
    )
    assert len(matched) == 1
    assert "test" in matched


def test_match_bitstreams_to_item_identifiers_success(base_workflow_instance):
    """Only the bitstream key that has a metadata identifier is reported as matched."""
    bitstream_dict = {"test": "test_01.pdf", "tast": "tast_01.pdf"}
    item_identifiers = ["test"]
    # Pass the keys view, as the method's signature and its production caller
    # do, rather than the dict itself.
    file_matches = (
        base_workflow_instance._match_bitstreams_to_item_identifiers(  # noqa: SLF001
            bitstream_dict.keys(), item_identifiers
        )
    )
    assert len(file_matches) == 1
    assert "test" in file_matches


def test_base_workflow_run_success(
caplog, base_workflow_instance, mocked_s3, mocked_sqs_input, mocked_sqs_output
):
Expand Down
35 changes: 0 additions & 35 deletions tests/test_utilities.py

This file was deleted.

0 comments on commit 059a282

Please sign in to comment.