diff --git a/python-libraries/data-platform-catalogue/CHANGELOG.md b/python-libraries/data-platform-catalogue/CHANGELOG.md index 91bf7ffbcc..a68d1609a8 100644 --- a/python-libraries/data-platform-catalogue/CHANGELOG.md +++ b/python-libraries/data-platform-catalogue/CHANGELOG.md @@ -7,6 +7,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.7.0] 2024-01-25 + +### Added + +- Added filters param to the search function +- Added a `facets` attribute to the search response. This is a dictionary mapping + field names to lists of `FacetOption`, which expose values, display names and the + count of results with that value. + ## [0.6.0] 2024-01-24 ### Added diff --git a/python-libraries/data-platform-catalogue/README.md b/python-libraries/data-platform-catalogue/README.md index 3569fdcb07..4aeca66a97 100644 --- a/python-libraries/data-platform-catalogue/README.md +++ b/python-libraries/data-platform-catalogue/README.md @@ -70,6 +70,26 @@ except CatalogueError: print("oh no") ``` +## Search example + +```python +response = client.search() + +# Total results across all pages +print(response.total_results) + +# Iterate over search results +for item in response.page_results: + print(item) + +# Iterate over facet options +for option in response.facets['domains']: + print(option) + +# Filter by domain +client.search(filters=[MultiSelectFilter("domains", [response.facets['domains'][0].value])]) +``` + ## Catalogue Implementations ### DataHub diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/base.py b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/base.py index 0d534d67f5..8f02d228cf 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/base.py +++ 
b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/base.py @@ -8,7 +8,7 @@ DataProductMetadata, TableMetadata, ) -from ..search_types import ResultType, SearchResponse +from ..search_types import MultiSelectFilter, ResultType, SearchResponse logger = logging.getLogger(__name__) @@ -65,6 +65,7 @@ def search( ResultType.DATA_PRODUCT, ResultType.TABLE, ), + filters: Sequence[MultiSelectFilter] = (), ) -> SearchResponse: """ Wraps the catalogue's search function. diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/datahub_client.py b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/datahub_client.py index 78b162041f..e7402e92ad 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/datahub_client.py +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/datahub_client.py @@ -2,13 +2,6 @@ import datahub.emitter.mce_builder as mce_builder import datahub.metadata.schema_classes as schema_classes -from data_platform_catalogue.client.base import ( - BaseCatalogueClient, - CatalogueError, - ReferencedEntityMissing, - logger, -) -from data_platform_catalogue.search_types import ResultType, SearchResponse from datahub.emitter.mce_builder import make_data_platform_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph @@ -31,6 +24,8 @@ DataProductMetadata, TableMetadata, ) +from ...search_types import MultiSelectFilter, ResultType, SearchResponse +from ..base import BaseCatalogueClient, CatalogueError, ReferencedEntityMissing, logger from .search import SearchClient DATAHUB_DATA_TYPE_MAPPING = { @@ -325,10 +320,15 @@ def search( ResultType.DATA_PRODUCT, ResultType.TABLE, ), + filters: Sequence[MultiSelectFilter] = (), ) -> SearchResponse: """ Wraps the catalogue's search function. 
""" return self.search_client.search( - query=query, count=count, page=page, result_types=result_types + query=query, + count=count, + page=page, + result_types=result_types, + filters=filters, ) diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/graphql/search.graphql b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/graphql/search.graphql index 506ba0fbf5..18835f9787 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/graphql/search.graphql +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/graphql/search.graphql @@ -3,13 +3,45 @@ query Search( $count: Int! $start: Int! $types: [EntityType!] + $filters: [FacetFilterInput!] ) { searchAcrossEntities( - input: { types: $types, query: $query, start: $start, count: $count } + input: { + types: $types + query: $query + start: $start + count: $count + filters: $filters + } ) { start count total + facets { + field + displayName + aggregations { + value + count + entity { + ... on Domain { + properties { + name + } + } + ... on Tag { + properties { + name + } + } + ... 
on GlossaryTerm { + properties { + name + } + } + } + } + } searchResults { insights { text diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/search.py b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/search.py index fdbe48942a..1c0ae129c6 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/search.py +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/search.py @@ -2,15 +2,18 @@ import logging from datetime import datetime from importlib.resources import files -from typing import Any, Sequence +from typing import Any, Literal, Sequence -from data_platform_catalogue.search_types import ( +from datahub.configuration.common import GraphError +from datahub.ingestion.graph.client import DataHubGraph + +from ...search_types import ( + FacetOption, + MultiSelectFilter, ResultType, SearchResponse, SearchResult, ) -from datahub.configuration.common import GraphError -from datahub.ingestion.graph.client import DataHubGraph logger = logging.getLogger(__name__) @@ -33,6 +36,7 @@ def search( ResultType.DATA_PRODUCT, ResultType.TABLE, ), + filters: Sequence[MultiSelectFilter] = (), ) -> SearchResponse: """ Wraps the catalogue's search function. 
@@ -43,8 +47,15 @@ def search( start = int(page) types = self._map_result_types(result_types) + formatted_filters = self._map_filters(filters) - variables = {"count": count, "query": query, "start": start, "types": types} + variables = { + "count": count, + "query": query, + "start": start, + "types": types, + "filters": formatted_filters, + } try: response = self.graph.execute_graphql(self.search_query, variables) @@ -53,6 +64,7 @@ def search( page_results = [] response = response["searchAcrossEntities"] + facets = self._parse_facets(response.get("facets", [])) logger.debug(json.dumps(response, indent=2)) @@ -71,7 +83,7 @@ def search( raise ValueError(f"Unexpected entity type: {entity_type}") return SearchResponse( - total_results=response["total"], page_results=page_results + total_results=response["total"], page_results=page_results, facets=facets ) def _map_result_types(self, result_types: Sequence[ResultType]): @@ -85,6 +97,14 @@ def _map_result_types(self, result_types: Sequence[ResultType]): types.append("DATASET") return types + def _map_filters(self, filters: Sequence[MultiSelectFilter]): + result = [] + for filter in filters: + result.append( + {"field": filter.filter_name, "values": filter.included_values} + ) + return result + def _parse_owner(self, entity: dict[str, Any]): """ Parse ownership information, if it is set. 
def _parse_facets(
    self, facets: list[dict[str, Any]]
) -> dict[
    Literal["domains", "tags", "customProperties", "glossaryTerms"],
    list[FacetOption],
]:
    """
    Parse the facets and aggregate information from the query results.

    Facets whose "field" is not one of the supported names are skipped.
    Each remaining facet maps to one `FacetOption` per aggregation, in the
    order the aggregations were returned.

    The label of an option is the "name" found under the aggregation's
    entity properties. When the entity, its properties or the name are
    missing or null, the raw aggregation value is used as the label instead.

    A null `facets` argument, or a null "aggregations" entry, is treated as
    an empty list.
    """
    supported_fields = ("domains", "tags", "customProperties", "glossaryTerms")

    results = {}
    for facet in facets or []:
        field_name = facet["field"]
        if field_name not in supported_fields:
            continue

        options = []
        for aggregate in facet.get("aggregations") or []:
            value = aggregate["value"]
            # Any level of entity -> properties -> name may be null in the
            # response, so each step falls back to an empty mapping.
            entity = aggregate.get("entity") or {}
            properties = entity.get("properties") or {}
            # `or` rather than a `.get` default: a "name" key that is present
            # but null must also fall back to the raw value.
            label = properties.get("name") or value
            options.append(
                FacetOption(value=value, label=label, count=aggregate["count"])
            )

        results[field_name] = options

    return results
@dataclass
class MultiSelectFilter:
    """
    Values to filter the result set by.

    Pairs the name of a filterable field with the values selected for it.
    """

    # Name of the field to filter on; restricted to the listed facet names.
    filter_name: Literal["domains", "tags", "customProperties", "glossaryTerms"]
    # Values selected for that field.
    # NOTE(review): presumably a result matches when it has any one of these
    # values - confirm against the catalogue's filter semantics.
    included_values: list[Any]


@dataclass
class FacetOption:
    """
    A specific value that may be used to filter the search.
    """

    # Value to pass back when filtering by this option.
    value: str
    # Display name for the option.
    label: str
    # Count of results with this value.
    count: int
def test_filter(searcher, mock_graph):
    """
    Filters given to search are forwarded in the GraphQL query variables.
    """
    datahub_response = {
        "searchAcrossEntities": {
            "start": 0,
            "count": 0,
            "total": 0,
            "searchResults": [],
        }
    }
    mock_graph.execute_graphql = MagicMock(return_value=datahub_response)

    response = searcher.search(filters=[MultiSelectFilter("domains", ["Abc", "Def"])])

    # The canned response is the same whether or not filters are applied, so
    # inspect the outgoing call: the variables are the second positional
    # argument to execute_graphql.
    variables = mock_graph.execute_graphql.call_args.args[1]
    assert variables["filters"] == [{"field": "domains", "values": ["Abc", "Def"]}]

    assert response == SearchResponse(
        total_results=0,
        page_results=[],
    )


def test_facets(searcher, mock_graph):
    """
    Supported facets are parsed into FacetOptions; other facets are dropped.
    """
    datahub_response = {
        "searchAcrossEntities": {
            "start": 0,
            "count": 10,
            "total": 10,
            "searchResults": [],
            "facets": [
                {
                    "field": "_entityType",
                    "displayName": "Type",
                    "aggregations": [
                        {"value": "DATASET", "count": 1505, "entity": None}
                    ],
                },
                {
                    "field": "glossaryTerms",
                    "displayName": "Glossary Term",
                    "aggregations": [
                        {
                            "value": "urn:li:glossaryTerm:Classification.Sensitive",
                            "count": 1,
                            "entity": {"properties": {"name": "Sensitive"}},
                        },
                        {
                            "value": "urn:li:glossaryTerm:Silver",
                            "count": 1,
                            "entity": {"properties": None},
                        },
                    ],
                },
                {
                    "field": "domains",
                    "displayName": "Domain",
                    "aggregations": [
                        {
                            "value": "urn:li:domain:094dc54b-0ebc-40a6-a4cf-e1b75e8b8089",
                            "count": 7,
                            "entity": {"properties": {"name": "Pet Adoptions"}},
                        },
                        {
                            "value": "urn:li:domain:7186eeff-a860-4b0a-989f-69473a0c9c67",
                            "count": 4,
                            "entity": {"properties": {"name": "E-Commerce"}},
                        },
                    ],
                },
            ],
        }
    }

    mock_graph.execute_graphql = MagicMock(return_value=datahub_response)

    response = searcher.search()

    assert response == SearchResponse(
        total_results=10,
        page_results=[],
        facets={
            "glossaryTerms": [
                FacetOption(
                    value="urn:li:glossaryTerm:Classification.Sensitive",
                    label="Sensitive",
                    count=1,
                ),
                FacetOption(
                    value="urn:li:glossaryTerm:Silver",
                    label="urn:li:glossaryTerm:Silver",
                    count=1,
                ),
            ],
            "domains": [
                FacetOption(
                    value="urn:li:domain:094dc54b-0ebc-40a6-a4cf-e1b75e8b8089",
                    label="Pet Adoptions",
                    count=7,
                ),
                FacetOption(
                    value="urn:li:domain:7186eeff-a860-4b0a-989f-69473a0c9c67",
                    label="E-Commerce",
                    count=4,
                ),
            ],
        },
    )
@runs_on_development_server
def test_search_by_domain():
    """
    Filtering data products by a domain that does not exist returns nothing.
    """
    client = DataHubCatalogueClient(jwt_token=jwt_token, api_url=api_url)

    response = client.search(
        filters=[MultiSelectFilter("domains", ["does-not-exist"])],
        result_types=(ResultType.DATA_PRODUCT,),
    )
    assert response.total_results == 0


@runs_on_development_server
def test_domain_facets_are_returned():
    """
    After upserting a data product with a domain, search returns domain facets.
    """
    # Built once; the duplicate construction of the same client was redundant.
    client = DataHubCatalogueClient(jwt_token=jwt_token, api_url=api_url)

    data_product = DataProductMetadata(
        name="lfdskjflkjflkjsdflksfjds",
        description="lfdskjflkjflkjsdflksfjds",
        version="v1.0.0",
        owner="7804c127-d677-4900-82f9-83517e51bb94",
        email="justice@justice.gov.uk",
        retention_period_in_days=365,
        domain="Sample",
        dpia_required=False,
    )
    client.upsert_data_product(data_product)

    response = client.search()
    assert response.facets["domains"]