Skip to content

Commit

Permalink
[wip] bitstreams_in_directory with ID settings
Browse files Browse the repository at this point in the history
  • Loading branch information
jonavellecuerdo committed Mar 19, 2024
1 parent 6130f37 commit 5df4372
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 18 deletions.
4 changes: 3 additions & 1 deletion dsaps/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def main(ctx: click.Context, source_config, url, email, password, verbose):
start_time = time.time()
ctx.obj["client"] = client
ctx.obj["s3_client"] = s3_client
ctx.obj["source_settings"] = CONFIG.source_settings
ctx.obj["start_time"] = start_time


Expand Down Expand Up @@ -135,6 +136,7 @@ def additems(
"""
client = ctx.obj["client"]
s3_client = ctx.obj["s3_client"]
source_settings = ctx.obj["source_settings"]
start_time = ctx.obj["start_time"]
if "collection_uuid" not in ctx.obj and collection_handle is None:
raise click.UsageError(
Expand All @@ -151,7 +153,7 @@ def additems(
mapping = json.load(jsonfile)
collection = Collection.create_metadata_for_items_from_csv(metadata, mapping)
for item in collection.items:
item.bitstreams_in_directory(s3_client, content_directory)
item.bitstreams_in_directory(s3_client, source_settings, content_directory)
collection.uuid = collection_uuid
for item in collection.post_items(client):
logger.info(item.file_identifier)
Expand Down
7 changes: 7 additions & 0 deletions dsaps/helpers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import csv
import os
import re


def create_csv_from_list(list_name, output):
Expand Down Expand Up @@ -76,6 +77,12 @@ def filter_files_by_prefix(file_paths, prefixes):
return filtered_file_paths


def parse_id_from_file_name(file_name: str, source_settings: dict):
    """Extract an identifier from a file name using a configured regex.

    Args:
        file_name: Name of the file to parse.
        source_settings: Settings mapping; its optional "regex" entry is a
            pattern whose first capture group holds the identifier.

    Returns:
        The text of the first capture group, or None when no "regex" is
        configured (or it is empty) or the pattern does not match the
        file name.
    """
    pattern = source_settings.get("regex")
    if not pattern:
        return None
    # search() returns None on a miss; report "no identifier" instead of
    # raising AttributeError from None.group().
    match = re.search(pattern, file_name)
    return match.group(1) if match else None


def match_files_to_metadata(file_list, metadata_ids):
"""Create list of files matched to metadata records."""
file_matches = [
Expand Down
28 changes: 23 additions & 5 deletions dsaps/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from attrs import field

from dsaps.helpers import filter_files_by_prefix
from dsaps.helpers import filter_files_by_prefix, parse_id_from_file_name

logger = structlog.get_logger()
op = operator.attrgetter("name")
Expand Down Expand Up @@ -239,7 +239,12 @@ class Item(BaseRecord):
source_system_identifier: str = field(default=None)

def bitstreams_in_directory(
self, s3_client, bucket: str, prefix="", search_in="", delimiter: str = "-"
self,
s3_client,
source_settings,
bucket: str,
prefix="",
search_in="",
):
"""Create a list of bitstreams from S3 file objects.
Expand All @@ -258,9 +263,22 @@ def bitstreams_in_directory(
for file_path in file_paths:
file_name = file_path.split("/")[-1]
file_directory = "/".join([bucket, *file_path.split("/")[:-1]])
self.bitstreams.append(
Bitstream(name=file_name, file_path=f"{file_directory}/{file_name}")
)

if source_settings.get("id"):
file_identifier = parse_id_from_file_name(
file_name, source_settings["id"]
)

if file_identifier == self.file_identifier:
self.bitstreams.append(
Bitstream(
name=file_name, file_path=f"{file_directory}/{file_name}"
)
)
else:
self.bitstreams.append(
Bitstream(name=file_name, file_path=f"{file_directory}/{file_name}")
)
self.bitstreams.sort(key=lambda x: x.name)

@classmethod
Expand Down
26 changes: 25 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from moto import mock_aws

from dsaps import models
from dsaps.config import Config


# Env fixtures
Expand All @@ -17,12 +18,35 @@ def _test_environment(monkeypatch):
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "testing")
monkeypatch.setenv("AWS_SECURITY_TOKEN", "testing")
monkeypatch.setenv("AWS_SESSION_TOKEN", "testing")
monkeypatch.setenv("SOURCE_CONFIG", "tests/fixtures/config/source.json")
monkeypatch.setenv("SOURCE_CONFIG", "tests/fixtures/config/source_simple.json")
monkeypatch.setenv("DSPACE_URL", "mock://example.com/")
monkeypatch.setenv("DSPACE_EMAIL", "test@test.mock")
monkeypatch.setenv("DSPACE_PASSWORD", "1234")


@pytest.fixture
def simple_config():
    """Provide a Config built from the simple source fixture file."""
    config_path = "tests/fixtures/config/source_simple.json"
    return Config(config_file=config_path)


@pytest.fixture
def complex_config():
    """Provide a Config built from the complex source fixture file."""
    config_path = "tests/fixtures/config/source_complex.json"
    return Config(config_file=config_path)


@pytest.fixture()
def mocked_s3_ddc():
    """Yield a mocked S3 client whose bucket holds a single empty PDF object.

    The client is created inside ``mock_aws()`` and yielded while that
    context is still active.
    """
    bucket_name = "test-bucket-ddc"
    with mock_aws():
        client = boto3.client("s3", region_name="us-east-1")
        client.create_bucket(Bucket=bucket_name)
        client.put_object(
            Body="",
            Bucket=bucket_name,
            Key="aaaa-bbbb-cccc-dddd-eeee-02-000012345.pdf",
        )
        yield client


@pytest.fixture()
def mocked_s3():
with mock_aws():
Expand Down
8 changes: 6 additions & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ def test_additems(runner, mocked_s3, caplog):
result = runner.invoke(
main,
[
"--source-config",
"tests/fixtures/config/source_simple.json",
"--url",
"mock://example.com/",
"--email",
Expand All @@ -33,6 +35,8 @@ def test_additems(runner, mocked_s3, caplog):
result = runner.invoke(
main,
[
"--source-config",
"tests/fixtures/config/source_simple.json",
"--url",
"mock://example.com/",
"--email",
Expand Down Expand Up @@ -64,7 +68,7 @@ def test_newcollection(runner):
main,
[
"--source-config",
"tests/fixtures/config/source.json",
"tests/fixtures/config/source_simple.json",
"--url",
"mock://example.com/",
"--email",
Expand All @@ -88,7 +92,7 @@ def test_reconcile(runner, mocked_s3, output_dir):
main,
[
"--source-config",
"tests/fixtures/config/source.json",
"tests/fixtures/config/source_simple.json",
"--url",
"mock://example.com/",
"--email",
Expand Down
9 changes: 2 additions & 7 deletions tests/test_config.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,9 @@
from dsaps.config import Config

CONFIG = Config(config_file="tests/fixtures/config/source.json")
CONFIG = Config(config_file="tests/fixtures/config/source_simple.json")


def test_load_source_config():
source_config = Config.load_source_config("tests/fixtures/config/source.json")
source_config = Config.load_source_config("tests/fixtures/config/source_simple.json")
assert "settings" in source_config
assert "mapping" in source_config


def test_source_settings_with_id_configs():
source_settings = CONFIG.source_settings
assert "id" in source_settings
23 changes: 21 additions & 2 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,33 @@ def test_collection_post_items(


@mock_aws
def test_item_bitstreams_in_directory(mocked_s3, s3_client, caplog):
def test_item_bitstreams_in_directory(mocked_s3, s3_client, simple_config, caplog):
caplog.set_level("DEBUG")
item = models.Item(file_identifier="test")
item.bitstreams_in_directory(s3_client=s3_client, bucket="s3://test-bucket")
item.bitstreams_in_directory(
s3_client=s3_client,
source_settings=simple_config.source_settings,
bucket="s3://test-bucket",
)
assert item.bitstreams[0].name == "best_01.pdf"
assert item.bitstreams[1].name == "test_01.jpg"


@mock_aws
def test_item_bitstreams_in_directory_aspace(
    mocked_s3_ddc, s3_client, complex_config, caplog
):
    """Check the first bitstream found using the complex source settings."""
    caplog.set_level("DEBUG")
    expected_name = "aaaa-bbbb-cccc-dddd-eeee-02-000012345.pdf"

    item = models.Item(file_identifier="02-000012345")
    item.bitstreams_in_directory(
        s3_client=s3_client,
        source_settings=complex_config.source_settings,
        bucket="s3://test-bucket-ddc",
    )

    first_bitstream = item.bitstreams[0]
    assert first_bitstream.name == expected_name


def test_item_metadata_from_csv_row(aspace_delimited_csv, aspace_mapping):
row = next(aspace_delimited_csv)
item = models.Item.metadata_from_csv_row(row, aspace_mapping)
Expand Down

0 comments on commit 5df4372

Please sign in to comment.