From d66c8b0a80411fc0295dbf659fea1d0c9831032d Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Wed, 15 Jan 2025 12:16:29 -0500 Subject: [PATCH 1/3] Create SCCS workflow Why these changes are being introduced: * Support deposits requested by Scholarly Communications and Collections Strategy (SCCS). How this addresses that need: * Create SCCS workflow and metadata mapping JSON file * Move DemoWorkflow metadata mapping JSON file * Change Workflow.email_recipients to list[str] * Clean up Workflow and SimpleCSV fixtures Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/IN-1098 --- dsc/cli.py | 2 +- dsc/workflows/__init__.py | 3 +- dsc/workflows/base/__init__.py | 2 +- dsc/workflows/demo.py | 2 +- .../workflows/metadata_mapping/demo.json | 14 ++-- dsc/workflows/metadata_mapping/sccs.json | 79 +++++++++++++++++++ dsc/workflows/sccs.py | 13 +++ tests/conftest.py | 6 +- 8 files changed, 107 insertions(+), 14 deletions(-) rename tests/fixtures/demo_metadata_mapping.json => dsc/workflows/metadata_mapping/demo.json (93%) create mode 100644 dsc/workflows/metadata_mapping/sccs.json create mode 100644 dsc/workflows/sccs.py diff --git a/dsc/cli.py b/dsc/cli.py index 650de71..9f5a123 100644 --- a/dsc/cli.py +++ b/dsc/cli.py @@ -70,7 +70,7 @@ def main( workflow = workflow_class( collection_handle=collection_handle, batch_id=batch_id, - email_recipients=tuple(email_recipients.split(",")), + email_recipients=email_recipients.split(","), s3_bucket=s3_bucket, output_queue=output_queue, ) diff --git a/dsc/workflows/__init__.py b/dsc/workflows/__init__.py index a085a0c..8d5e3b2 100644 --- a/dsc/workflows/__init__.py +++ b/dsc/workflows/__init__.py @@ -6,5 +6,6 @@ from dsc.workflows.base import Workflow from dsc.workflows.base.simple_csv import SimpleCSV from dsc.workflows.demo import DemoWorkflow +from dsc.workflows.sccs import SCCS -__all__ = ["DemoWorkflow", "SimpleCSV", "Workflow"] +__all__ = ["SCCS", "DemoWorkflow", "SimpleCSV", 
"Workflow"] diff --git a/dsc/workflows/base/__init__.py b/dsc/workflows/base/__init__.py index 03d27d3..2b0f062 100644 --- a/dsc/workflows/base/__init__.py +++ b/dsc/workflows/base/__init__.py @@ -35,7 +35,7 @@ def __init__( self, collection_handle: str, batch_id: str, - email_recipients: tuple[str, ...], + email_recipients: list[str], s3_bucket: str | None = None, output_queue: str | None = None, ) -> None: diff --git a/dsc/workflows/demo.py b/dsc/workflows/demo.py index 0ca82cb..b39856e 100644 --- a/dsc/workflows/demo.py +++ b/dsc/workflows/demo.py @@ -5,4 +5,4 @@ class DemoWorkflow(SimpleCSV): workflow_name: str = "demo" submission_system: str = "DSpace@MIT" - metadata_mapping_path: str = "tests/fixtures/demo_metadata_mapping.json" + metadata_mapping_path: str = "dsc/workflows/metadata_mapping/demo.json" diff --git a/tests/fixtures/demo_metadata_mapping.json b/dsc/workflows/metadata_mapping/demo.json similarity index 93% rename from tests/fixtures/demo_metadata_mapping.json rename to dsc/workflows/metadata_mapping/demo.json index 6b2f173..1f849c2 100644 --- a/tests/fixtures/demo_metadata_mapping.json +++ b/dsc/workflows/metadata_mapping/demo.json @@ -2,7 +2,14 @@ "item_identifier": { "source_field_name": "item_identifier", "language": null, - "delimiter": "" + "delimiter": "", + "required": true + }, + "dc.title": { + "source_field_name": "dc.title", + "language": "en_US", + "delimiter": "", + "required": true }, "dc.publisher": { "source_field_name": "dc.publisher", @@ -34,11 +41,6 @@ "language": "", "delimiter": "" }, - "dc.title": { - "source_field_name": "dc.title", - "language": "en_US", - "delimiter": "" - }, "dc.relation.journal": { "source_field_name": "dc.relation.journal", "language": "", diff --git a/dsc/workflows/metadata_mapping/sccs.json b/dsc/workflows/metadata_mapping/sccs.json new file mode 100644 index 0000000..d95f3cb --- /dev/null +++ b/dsc/workflows/metadata_mapping/sccs.json @@ -0,0 +1,79 @@ +{ + "item_identifier": { + "source_field_name": 
"item_identifier", + "language": null, + "delimiter": "", + "required": true + }, + "dc.title": { + "source_field_name": "dc.title", + "language": "en_US", + "delimiter": "", + "required": true + }, + "dc.publisher": { + "source_field_name": "dc.publisher", + "language": "en_US", + "delimiter": "" + }, + "dc.identifier.mitlicense": { + "source_field_name": "dc.identifier.mitlicense", + "language": "en_US", + "delimiter": "" + }, + "dc.eprint.version": { + "source_field_name": "dc.eprint.version", + "language": "en_US", + "delimiter": "" + }, + "dc.type": { + "source_field_name": "dc.type", + "language": "en_US", + "delimiter": "" + }, + "dc.source": { + "source_field_name": "dc.source", + "language": "en_US", + "delimiter": "" + }, + "dc.contributor.author": { + "source_field_name": "dc.contributor.author", + "language": "en_US", + "delimiter": "|" + }, + "dc.relation.isversionof": { + "source_field_name": "dc.relation.isversionof", + "language": "", + "delimiter": "" + }, + "dc.relation.journal": { + "source_field_name": "dc.relation.journal", + "language": "", + "delimiter": "" + }, + "dc.identifier.issn": { + "source_field_name": "dc.identifier.issn", + "language": "", + "delimiter": "" + }, + "dc.date.issued": { + "source_field_name": "dc.date.issued", + "language": "", + "delimiter": "" + }, + "dc.rights": { + "source_field_name": "dc.rights", + "language": "en_US", + "delimiter": "" + }, + "dc.rights.uri": { + "source_field_name": "dc.rights.uri", + "language": "", + "delimiter": "" + }, + "dc.description.sponsorship": { + "source_field_name": "dc.description.sponsorship", + "language": "en_US", + "delimiter": "" + } +} \ No newline at end of file diff --git a/dsc/workflows/sccs.py b/dsc/workflows/sccs.py new file mode 100644 index 0000000..25a81c2 --- /dev/null +++ b/dsc/workflows/sccs.py @@ -0,0 +1,13 @@ +from dsc.workflows import SimpleCSV + + +class SCCS(SimpleCSV): + """Workflow for SCCS-requested deposits. 
+ + The deposits managed by this workflow are requested by the Scholarly + Communications and Collections Strategy (SCCS) department + and are for submission to DSpace@MIT. + """ + + workflow_name: str = "sccs" + metadata_mapping_path: str = "dsc/workflows/metadata_mapping/sccs.json" diff --git a/tests/conftest.py b/tests/conftest.py index a4a58e4..85ce873 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,7 +20,6 @@ class TestWorkflow(Workflow): workflow_name: str = "test" submission_system: str = "Test@MIT" - email_recipients: tuple[str] = ("test@test.test",) metadata_mapping_path: str = "tests/fixtures/test_metadata_mapping.json" def item_metadata_iter(self): @@ -56,10 +55,7 @@ class TestSimpleCSV(SimpleCSV): workflow_name = "simple_csv" submission_system: str = "Test@MIT" - email_recipients: tuple[str] = ("test@test.test",) metadata_mapping_path: str = "tests/fixtures/test_metadata_mapping.json" - s3_bucket: str = "dsc" - output_queue: str = "mock-output_queue" @pytest.fixture(autouse=True) @@ -79,6 +75,7 @@ def base_workflow_instance(item_metadata, metadata_mapping, mocked_s3): collection_handle="123.4/5678", batch_id="batch-aaa", email_recipients=["test@test.test"], + output_queue="mock-output_queue", ) @@ -88,6 +85,7 @@ def simple_csv_workflow_instance(metadata_mapping): collection_handle="123.4/5678", batch_id="batch-aaa", email_recipients=["test@test.test"], + output_queue="mock-output_queue", ) From a8879bc92ee6b7762c5d13faa5f1104534ced25f Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Wed, 15 Jan 2025 12:22:38 -0500 Subject: [PATCH 2/3] Rename DemoWorkflow -> Demo --- dsc/workflows/__init__.py | 4 ++-- dsc/workflows/demo.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dsc/workflows/__init__.py b/dsc/workflows/__init__.py index 8d5e3b2..b154351 100644 --- a/dsc/workflows/__init__.py +++ b/dsc/workflows/__init__.py @@ -5,7 +5,7 @@ from dsc.workflows.base import Workflow from dsc.workflows.base.simple_csv import 
SimpleCSV -from dsc.workflows.demo import DemoWorkflow +from dsc.workflows.demo import Demo from dsc.workflows.sccs import SCCS -__all__ = ["SCCS", "DemoWorkflow", "SimpleCSV", "Workflow"] +__all__ = ["SCCS", "Demo", "SimpleCSV", "Workflow"] diff --git a/dsc/workflows/demo.py b/dsc/workflows/demo.py index b39856e..7e0bf00 100644 --- a/dsc/workflows/demo.py +++ b/dsc/workflows/demo.py @@ -1,7 +1,7 @@ from dsc.workflows.base.simple_csv import SimpleCSV -class DemoWorkflow(SimpleCSV): +class Demo(SimpleCSV): workflow_name: str = "demo" submission_system: str = "DSpace@MIT" From 72fef3098e137385a1d680b9ab2fd184ed7a8c97 Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Wed, 15 Jan 2025 16:37:20 -0500 Subject: [PATCH 3/3] Address comments in PR #77 * Make 'delimiter' and 'language' optional configs * Omit null and empty string configs from metadata mapping JSON files --- dsc/workflows/base/__init__.py | 19 ++++++---- dsc/workflows/metadata_mapping/demo.json | 35 +++++------------- dsc/workflows/metadata_mapping/sccs.json | 44 +++++++---------------- tests/fixtures/test_metadata_mapping.json | 4 --- 4 files changed, 34 insertions(+), 68 deletions(-) diff --git a/dsc/workflows/base/__init__.py b/dsc/workflows/base/__init__.py index 2b0f062..4902fbb 100644 --- a/dsc/workflows/base/__init__.py +++ b/dsc/workflows/base/__init__.py @@ -259,12 +259,19 @@ def create_dspace_metadata(self, item_metadata: dict[str, Any]) -> dict[str, Any A metadata mapping is a dict with the format seen below: { - "dc.contributor": { - "source_field_name": "contributor", - "language": None, - "delimiter": "|", + "dc.contributor": { + "source_field_name": "contributor", + "language": "", + "delimiter": "", + "required": true | false + } } + When setting up the metadata mapping JSON file, "language" and "delimiter" + can be omitted from the file if not applicable. 
Required fields ("item_identifier" + and "title") must be set as required (true); if "required" is not listed as a + config, the field defaults to not required (false). + MUST NOT be overridden by workflow subclasses. Args: @@ -281,8 +288,8 @@ def create_dspace_metadata(self, item_metadata: dict[str, Any]) -> dict[str, Any f"{field_mapping["source_field_name"]}'" ) if field_value: - delimiter = field_mapping["delimiter"] - language = field_mapping["language"] + delimiter = field_mapping.get("delimiter") + language = field_mapping.get("language") if delimiter: metadata_entries.extend( [ diff --git a/dsc/workflows/metadata_mapping/demo.json b/dsc/workflows/metadata_mapping/demo.json index 1f849c2..392be94 100644 --- a/dsc/workflows/metadata_mapping/demo.json +++ b/dsc/workflows/metadata_mapping/demo.json @@ -1,35 +1,28 @@ { "item_identifier": { "source_field_name": "item_identifier", - "language": null, - "delimiter": "", "required": true }, "dc.title": { "source_field_name": "dc.title", "language": "en_US", - "delimiter": "", "required": true }, "dc.publisher": { "source_field_name": "dc.publisher", - "language": "en_US", - "delimiter": "" + "language": "en_US" }, "dc.eprint.version": { "source_field_name": "dc.eprint.version", - "language": "en_US", - "delimiter": "" + "language": "en_US" }, "dc.type": { "source_field_name": "dc.type", - "language": "en_US", - "delimiter": "" + "language": "en_US" }, "dc.source": { "source_field_name": "dc.source", - "language": "en_US", - "delimiter": "" + "language": "en_US" }, "dc.contributor.author": { "source_field_name": "dc.contributor.author", @@ -37,28 +30,18 @@ "delimiter": "|" }, "dc.relation.isversionof": { - "source_field_name": "dc.relation.isversionof", - "language": "", - "delimiter": "" + "source_field_name": "dc.relation.isversionof" }, "dc.relation.journal": { - "source_field_name": "dc.relation.journal", - "language": "", - "delimiter": "" + "source_field_name": "dc.relation.journal" }, "dc.identifier.issn": { 
- "source_field_name": "dc.identifier.issn", - "language": "", - "delimiter": "" + "source_field_name": "dc.identifier.issn" }, "dc.date.issued": { - "source_field_name": "dc.date.issued", - "language": "", - "delimiter": "" + "source_field_name": "dc.date.issued" }, "dc.rights.uri": { - "source_field_name": "dc.rights.uri", - "language": "", - "delimiter": "" + "source_field_name": "dc.rights.uri" } } \ No newline at end of file diff --git a/dsc/workflows/metadata_mapping/sccs.json b/dsc/workflows/metadata_mapping/sccs.json index d95f3cb..5a0fd17 100644 --- a/dsc/workflows/metadata_mapping/sccs.json +++ b/dsc/workflows/metadata_mapping/sccs.json @@ -1,40 +1,32 @@ { "item_identifier": { "source_field_name": "item_identifier", - "language": null, - "delimiter": "", "required": true }, "dc.title": { "source_field_name": "dc.title", "language": "en_US", - "delimiter": "", "required": true }, "dc.publisher": { "source_field_name": "dc.publisher", - "language": "en_US", - "delimiter": "" + "language": "en_US" }, "dc.identifier.mitlicense": { "source_field_name": "dc.identifier.mitlicense", - "language": "en_US", - "delimiter": "" + "language": "en_US" }, "dc.eprint.version": { "source_field_name": "dc.eprint.version", - "language": "en_US", - "delimiter": "" + "language": "en_US" }, "dc.type": { "source_field_name": "dc.type", - "language": "en_US", - "delimiter": "" + "language": "en_US" }, "dc.source": { "source_field_name": "dc.source", - "language": "en_US", - "delimiter": "" + "language": "en_US" }, "dc.contributor.author": { "source_field_name": "dc.contributor.author", @@ -42,38 +34,26 @@ "delimiter": "|" }, "dc.relation.isversionof": { - "source_field_name": "dc.relation.isversionof", - "language": "", - "delimiter": "" + "source_field_name": "dc.relation.isversionof" }, "dc.relation.journal": { - "source_field_name": "dc.relation.journal", - "language": "", - "delimiter": "" + "source_field_name": "dc.relation.journal" }, "dc.identifier.issn": { - 
"source_field_name": "dc.identifier.issn", - "language": "", - "delimiter": "" + "source_field_name": "dc.identifier.issn" }, "dc.date.issued": { - "source_field_name": "dc.date.issued", - "language": "", - "delimiter": "" + "source_field_name": "dc.date.issued" }, "dc.rights": { "source_field_name": "dc.rights", - "language": "en_US", - "delimiter": "" + "language": "en_US" }, "dc.rights.uri": { - "source_field_name": "dc.rights.uri", - "language": "", - "delimiter": "" + "source_field_name": "dc.rights.uri" }, "dc.description.sponsorship": { "source_field_name": "dc.description.sponsorship", - "language": "en_US", - "delimiter": "" + "language": "en_US" } } \ No newline at end of file diff --git a/tests/fixtures/test_metadata_mapping.json b/tests/fixtures/test_metadata_mapping.json index 0b3e3dd..38809c6 100644 --- a/tests/fixtures/test_metadata_mapping.json +++ b/tests/fixtures/test_metadata_mapping.json @@ -1,19 +1,15 @@ { "item_identifier": { "source_field_name": "item_identifier", - "language": null, - "delimiter": "", "required": true }, "dc.title": { "source_field_name": "title", "language": "en_US", - "delimiter": "", "required": true }, "dc.contributor": { "source_field_name": "contributor", - "language": null, "delimiter": "|" } } \ No newline at end of file