From aa5f07b773d02f0a77f600f754e8266e01f92798 Mon Sep 17 00:00:00 2001 From: Mat Date: Tue, 23 Jan 2024 16:06:05 +0000 Subject: [PATCH] [DP-2782] Fix data product assets issue and bump the version (#3040) * Don't import openmetadata by default It's incompatible with Python 3.11 which is really annoying. * Fix bug appending assets to a data product The list append() method returns None, so this was clearing the asset list every other time an asset was added. * Bump version --- .../data-platform-catalogue/CHANGELOG.md | 4 +- .../data_platform_catalogue/__init__.py | 1 - .../client/__init__.py | 1 - .../data_platform_catalogue/client/datahub.py | 6 +- .../data-platform-catalogue/pyproject.toml | 4 +- ...tahub_create_two_tables_with_metadata.json | 256 ++++++++++++++++++ .../tests/test_client_datahub.py | 47 ++++ .../tests/test_client_openmetadata.py | 6 +- ...st_integration_with_openmetadata_server.py | 2 +- 9 files changed, 314 insertions(+), 13 deletions(-) create mode 100644 python-libraries/data-platform-catalogue/tests/snapshots/datahub_create_two_tables_with_metadata.json diff --git a/python-libraries/data-platform-catalogue/CHANGELOG.md b/python-libraries/data-platform-catalogue/CHANGELOG.md index 750d886e14..d4b70d8b5f 100644 --- a/python-libraries/data-platform-catalogue/CHANGELOG.md +++ b/python-libraries/data-platform-catalogue/CHANGELOG.md @@ -7,7 +7,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.4.0] 2024-01-19 ### Breaking changes @@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 about where a node in the metadata graph should be located, and what kind of database it comes from. +- Renamed `create_or_update_*` methods to `upsert_*`. 
+ - Extracted `BaseCatalogueClient` base class from `CatalogueClient`. Use this as a type annotation to avoid coupling to the OpenMetadata implementation. diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/__init__.py b/python-libraries/data-platform-catalogue/data_platform_catalogue/__init__.py index 1489b0f37d..6e427eb891 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/__init__.py +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/__init__.py @@ -1,5 +1,4 @@ from .client import DataHubCatalogueClient # noqa: F401 -from .client import OpenMetadataCatalogueClient # noqa: F401 from .client import CatalogueError, ReferencedEntityMissing # noqa: F401 from .entities import DataProductMetadata # noqa: F401 from .entities import CatalogueMetadata, DataLocation, TableMetadata # noqa: F401 diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/__init__.py b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/__init__.py index e5becdb2e4..f19de51a7b 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/__init__.py +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/__init__.py @@ -2,4 +2,3 @@ from .base import CatalogueError # noqa: F401 from .base import ReferencedEntityMissing # noqa: F401 from .datahub import DataHubCatalogueClient # noqa: F401 -from .openmetadata import OpenMetadataCatalogueClient # noqa: F401 diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub.py b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub.py index d7afa9efc4..87966bd6b6 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub.py +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub.py @@ -294,11 +294,11 @@ def upsert_table( data_product_existing_properties is not None and 
data_product_existing_properties.assets is not None ): - assets = data_product_existing_properties.assets.append( - data_product_association - ) + assets = data_product_existing_properties.assets[::] + assets.append(data_product_association) else: assets = [data_product_association] + data_product_properties = DataProductPropertiesClass(assets=assets) metadata_event = MetadataChangeProposalWrapper( diff --git a/python-libraries/data-platform-catalogue/pyproject.toml b/python-libraries/data-platform-catalogue/pyproject.toml index f7addbb5e9..c22ad548a4 100644 --- a/python-libraries/data-platform-catalogue/pyproject.toml +++ b/python-libraries/data-platform-catalogue/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ministryofjustice-data-platform-catalogue" -version = "0.3.1" +version = "0.4.0" description = "Library to integrate the MoJ data platform with the catalogue component." authors = ["MoJ Data Platform Team "] license = "MIT" @@ -10,7 +10,7 @@ packages = [{ include = "data_platform_catalogue" }] [tool.poetry.dependencies] python = "^3.10" openmetadata-ingestion = "~1.2.0.1" -acryl-datahub = {extras = ["datahub-rest"], version = "^0.12.1.3"} +acryl-datahub = { extras = ["datahub-rest"], version = "^0.12.1.3" } freezegun = "^1.4.0" deepdiff = "^6.7.1" diff --git a/python-libraries/data-platform-catalogue/tests/snapshots/datahub_create_two_tables_with_metadata.json b/python-libraries/data-platform-catalogue/tests/snapshots/datahub_create_two_tables_with_metadata.json new file mode 100644 index 0000000000..4d424eef68 --- /dev/null +++ b/python-libraries/data-platform-catalogue/tests/snapshots/datahub_create_two_tables_with_metadata.json @@ -0,0 +1,256 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,my_database.my_table,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "description": "bla bla", + "tags": [] + } + } +}, +{ + "entityType": 
"dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,my_database.my_table,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "my_table", + "platform": "urn:li:dataPlatform:glue", + "version": 1, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "foo", + "nullable": false, + "description": "a", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "bar", + "nullable": false, + "description": "b", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + } + ] + } + } +}, +{ + "entityType": "domain", + "entityUrn": "urn:li:domain:legal-aid", + "changeType": "UPSERT", + "aspectName": "domainProperties", + "aspect": { + "json": { + "name": "legal-aid", + "description": "" + } + } +}, +{ + "entityType": "dataproduct", + "entityUrn": "urn:li:dataProduct:my_data_product", + "changeType": "UPSERT", + "aspectName": "domains", + "aspect": { + "json": { + "domains": [ + "urn:li:domain:legal-aid" + ] + } + } +}, +{ + "entityType": "dataproduct", + "entityUrn": "urn:li:dataProduct:my_data_product", + "changeType": "UPSERT", + "aspectName": "dataProductProperties", + "aspect": { + "json": { + "customProperties": { + "email": "justice@justice.gov.uk", + "retention_period_in_days": "365", + "dpia_required": "False" + }, + "name": "my_data_product", + "description": "bla bla" + } + } +}, +{ + "entityType": "dataproduct", + "entityUrn": "urn:li:dataProduct:my_data_product", + "changeType": "UPSERT", + "aspectName": "dataProductProperties", + "aspect": { + "json": { + 
"customProperties": {}, + "assets": [ + { + "sourceUrn": "urn:li:dataProduct:my_data_product", + "destinationUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,my_database.my_table,PROD)" + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,my_database.my_table2,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "description": "this is a different table", + "tags": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,my_database.my_table2,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "my_table2", + "platform": "urn:li:dataPlatform:glue", + "version": 1, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "boo", + "nullable": false, + "description": "spooky", + "type": { + "type": { + "com.linkedin.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "yar", + "nullable": false, + "description": "shiver my timbers", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + } + ] + } + } +}, +{ + "entityType": "domain", + "entityUrn": "urn:li:domain:legal-aid", + "changeType": "UPSERT", + "aspectName": "domainProperties", + "aspect": { + "json": { + "name": "legal-aid", + "description": "" + } + } +}, +{ + "entityType": "dataproduct", + "entityUrn": "urn:li:dataProduct:my_data_product", + "changeType": "UPSERT", + "aspectName": "domains", + "aspect": { + "json": { + "domains": [ + "urn:li:domain:legal-aid" + ] + } + } +}, +{ + "entityType": 
"dataproduct", + "entityUrn": "urn:li:dataProduct:my_data_product", + "changeType": "UPSERT", + "aspectName": "dataProductProperties", + "aspect": { + "json": { + "customProperties": { + "email": "justice@justice.gov.uk", + "retention_period_in_days": "365", + "dpia_required": "False" + }, + "name": "my_data_product", + "description": "bla bla" + } + } +}, +{ + "entityType": "dataproduct", + "entityUrn": "urn:li:dataProduct:my_data_product", + "changeType": "UPSERT", + "aspectName": "dataProductProperties", + "aspect": { + "json": { + "customProperties": {}, + "assets": [ + { + "sourceUrn": "urn:li:dataProduct:my_data_product", + "destinationUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,my_database.my_table2,PROD)" + } + ] + } + } +} +] diff --git a/python-libraries/data-platform-catalogue/tests/test_client_datahub.py b/python-libraries/data-platform-catalogue/tests/test_client_datahub.py index 583ac556ee..27992b4e74 100644 --- a/python-libraries/data-platform-catalogue/tests/test_client_datahub.py +++ b/python-libraries/data-platform-catalogue/tests/test_client_datahub.py @@ -51,6 +51,18 @@ def table(self): retention_period_in_days=365, ) + @pytest.fixture + def table2(self): + return TableMetadata( + name="my_table2", + description="this is a different table", + column_details=[ + {"name": "boo", "type": "boolean", "description": "spooky"}, + {"name": "yar", "type": "string", "description": "shiver my timbers"}, + ], + retention_period_in_days=1, + ) + @pytest.fixture def datahub_client(self, base_mock_graph) -> DataHubCatalogueClient: return DataHubCatalogueClient( @@ -100,6 +112,41 @@ def test_create_table_with_metadata_datahub( base_mock_graph.sink_to_file(output_file) check_snapshot("datahub_create_table_with_metadata.json", output_file) + def test_create_two_tables_with_metadata( + self, + datahub_client, + table, + table2, + data_product, + base_mock_graph, + tmp_path, + check_snapshot, + ): + """ + Case where we create a dataset, data product and domain 
+ """ + fqn = datahub_client.upsert_table( + metadata=table, + data_product_metadata=data_product, + location=DataLocation("my_database"), + ) + fqn_out = "urn:li:dataset:(urn:li:dataPlatform:glue,my_database.my_table,PROD)" + + assert fqn == fqn_out + + fqn = datahub_client.upsert_table( + metadata=table2, + data_product_metadata=data_product, + location=DataLocation("my_database"), + ) + fqn_out = "urn:li:dataset:(urn:li:dataPlatform:glue,my_database.my_table2,PROD)" + + assert fqn == fqn_out + + output_file = Path(tmp_path / "datahub_create_table_with_metadata.json") + base_mock_graph.sink_to_file(output_file) + check_snapshot("datahub_create_two_tables_with_metadata.json", output_file) + def test_create_table_and_metadata_idempotent_datahub( self, datahub_client, diff --git a/python-libraries/data-platform-catalogue/tests/test_client_openmetadata.py b/python-libraries/data-platform-catalogue/tests/test_client_openmetadata.py index e4d03003e6..d386a8d0b9 100644 --- a/python-libraries/data-platform-catalogue/tests/test_client_openmetadata.py +++ b/python-libraries/data-platform-catalogue/tests/test_client_openmetadata.py @@ -1,8 +1,6 @@ import pytest -from data_platform_catalogue.client import ( - OpenMetadataCatalogueClient, - ReferencedEntityMissing, -) +from data_platform_catalogue.client import ReferencedEntityMissing +from data_platform_catalogue.client.openmetadata import OpenMetadataCatalogueClient from data_platform_catalogue.entities import ( CatalogueMetadata, DataLocation, diff --git a/python-libraries/data-platform-catalogue/tests/test_integration_with_openmetadata_server.py b/python-libraries/data-platform-catalogue/tests/test_integration_with_openmetadata_server.py index 11243a9889..0337d63145 100644 --- a/python-libraries/data-platform-catalogue/tests/test_integration_with_openmetadata_server.py +++ b/python-libraries/data-platform-catalogue/tests/test_integration_with_openmetadata_server.py @@ -11,7 +11,7 @@ import pytest from 
data_platform_catalogue import DataProductMetadata, TableMetadata -from data_platform_catalogue.client import OpenMetadataCatalogueClient +from data_platform_catalogue.client.openmetadata import OpenMetadataCatalogueClient from data_platform_catalogue.entities import DataLocation jwt_token = os.environ.get("JWT_TOKEN")