From 2a8a8353c5f544c1a32dd836bee62c6080f99487 Mon Sep 17 00:00:00 2001 From: Mat Date: Fri, 26 Jan 2024 14:21:19 +0000 Subject: [PATCH] [DP-3074] Add filtering to search function (#3075) * Add parameter for search filters * Allow the search response to contain facet information Facets are the dynamic search filters that show how the search results break down across different dimensions. Returning these allows us to display only options that are relevant to the current result set and indicate the number of results matching each option. For most filters, we are expected to pass a URN as the value, so this is also the easiest way for the frontend to figure out what the possible values are. * Use relative imports throughout --- .../data-platform-catalogue/CHANGELOG.md | 9 ++ .../data-platform-catalogue/README.md | 20 ++++ .../data_platform_catalogue/client/base.py | 3 +- .../client/datahub/datahub_client.py | 16 +-- .../client/datahub/graphql/search.graphql | 34 +++++- .../client/datahub/search.py | 60 +++++++++- .../client/openmetadata.py | 7 +- .../data_platform_catalogue/search_types.py | 27 ++++- .../data-platform-catalogue/pyproject.toml | 2 +- .../tests/test_datahub_search.py | 108 ++++++++++++++++++ .../test_integration_with_datahub_server.py | 35 +++++- 11 files changed, 296 insertions(+), 25 deletions(-) diff --git a/python-libraries/data-platform-catalogue/CHANGELOG.md b/python-libraries/data-platform-catalogue/CHANGELOG.md index 91bf7ffbcc..a68d1609a8 100644 --- a/python-libraries/data-platform-catalogue/CHANGELOG.md +++ b/python-libraries/data-platform-catalogue/CHANGELOG.md @@ -7,6 +7,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.7.0] 2024-01-25 + +### Added + +- Added filters param to the search function +- Return facets attribute to the search response. 
This is a dictionary mapping + fieldnames to lists of `FacetOption`, which expose values, display names and the + count of results with that value. + ## [0.6.0] 2024-01-24 ### Added diff --git a/python-libraries/data-platform-catalogue/README.md b/python-libraries/data-platform-catalogue/README.md index 3569fdcb07..4aeca66a97 100644 --- a/python-libraries/data-platform-catalogue/README.md +++ b/python-libraries/data-platform-catalogue/README.md @@ -70,6 +70,26 @@ except CatalogueError: print("oh no") ``` +## Search example + +```python +response = client.search() + +# Total results across all pages +print(response.total_results) + +# Iterate over search results +for item in response.page_results: + print(item) + +# Iterate over facet options +for option in response.facets['domains']: + print(option) + +# Filter by domain +client.search(filters=[MultiSelectFilter("domains", [response.facets['domains'][0].value])]) +``` + ## Catalogue Implementations ### DataHub diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/base.py b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/base.py index 0d534d67f5..8f02d228cf 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/base.py +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/base.py @@ -8,7 +8,7 @@ DataProductMetadata, TableMetadata, ) -from ..search_types import ResultType, SearchResponse +from ..search_types import MultiSelectFilter, ResultType, SearchResponse logger = logging.getLogger(__name__) @@ -65,6 +65,7 @@ def search( ResultType.DATA_PRODUCT, ResultType.TABLE, ), + filters: Sequence[MultiSelectFilter] = (), ) -> SearchResponse: """ Wraps the catalogue's search function. 
diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/datahub_client.py b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/datahub_client.py index 78b162041f..e7402e92ad 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/datahub_client.py +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/datahub_client.py @@ -2,13 +2,6 @@ import datahub.emitter.mce_builder as mce_builder import datahub.metadata.schema_classes as schema_classes -from data_platform_catalogue.client.base import ( - BaseCatalogueClient, - CatalogueError, - ReferencedEntityMissing, - logger, -) -from data_platform_catalogue.search_types import ResultType, SearchResponse from datahub.emitter.mce_builder import make_data_platform_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph @@ -31,6 +24,8 @@ DataProductMetadata, TableMetadata, ) +from ...search_types import MultiSelectFilter, ResultType, SearchResponse +from ..base import BaseCatalogueClient, CatalogueError, ReferencedEntityMissing, logger from .search import SearchClient DATAHUB_DATA_TYPE_MAPPING = { @@ -325,10 +320,15 @@ def search( ResultType.DATA_PRODUCT, ResultType.TABLE, ), + filters: Sequence[MultiSelectFilter] = (), ) -> SearchResponse: """ Wraps the catalogue's search function. 
""" return self.search_client.search( - query=query, count=count, page=page, result_types=result_types + query=query, + count=count, + page=page, + result_types=result_types, + filters=filters, ) diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/graphql/search.graphql b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/graphql/search.graphql index 506ba0fbf5..18835f9787 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/graphql/search.graphql +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/graphql/search.graphql @@ -3,13 +3,45 @@ query Search( $count: Int! $start: Int! $types: [EntityType!] + $filters: [FacetFilterInput!] ) { searchAcrossEntities( - input: { types: $types, query: $query, start: $start, count: $count } + input: { + types: $types + query: $query + start: $start + count: $count + filters: $filters + } ) { start count total + facets { + field + displayName + aggregations { + value + count + entity { + ... on Domain { + properties { + name + } + } + ... on Tag { + properties { + name + } + } + ... 
on GlossaryTerm { + properties { + name + } + } + } + } + } searchResults { insights { text diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/search.py b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/search.py index fdbe48942a..1c0ae129c6 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/search.py +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/datahub/search.py @@ -2,15 +2,18 @@ import logging from datetime import datetime from importlib.resources import files -from typing import Any, Sequence +from typing import Any, Literal, Sequence -from data_platform_catalogue.search_types import ( +from datahub.configuration.common import GraphError +from datahub.ingestion.graph.client import DataHubGraph + +from ...search_types import ( + FacetOption, + MultiSelectFilter, ResultType, SearchResponse, SearchResult, ) -from datahub.configuration.common import GraphError -from datahub.ingestion.graph.client import DataHubGraph logger = logging.getLogger(__name__) @@ -33,6 +36,7 @@ def search( ResultType.DATA_PRODUCT, ResultType.TABLE, ), + filters: Sequence[MultiSelectFilter] = (), ) -> SearchResponse: """ Wraps the catalogue's search function. 
@@ -43,8 +47,15 @@ def search( start = int(page) types = self._map_result_types(result_types) + formatted_filters = self._map_filters(filters) - variables = {"count": count, "query": query, "start": start, "types": types} + variables = { + "count": count, + "query": query, + "start": start, + "types": types, + "filters": formatted_filters, + } try: response = self.graph.execute_graphql(self.search_query, variables) @@ -53,6 +64,7 @@ def search( page_results = [] response = response["searchAcrossEntities"] + facets = self._parse_facets(response.get("facets", [])) logger.debug(json.dumps(response, indent=2)) @@ -71,7 +83,7 @@ def search( raise ValueError(f"Unexpected entity type: {entity_type}") return SearchResponse( - total_results=response["total"], page_results=page_results + total_results=response["total"], page_results=page_results, facets=facets ) def _map_result_types(self, result_types: Sequence[ResultType]): @@ -85,6 +97,14 @@ def _map_result_types(self, result_types: Sequence[ResultType]): types.append("DATASET") return types + def _map_filters(self, filters: Sequence[MultiSelectFilter]): + result = [] + for filter in filters: + result.append( + {"field": filter.filter_name, "values": filter.included_values} + ) + return result + def _parse_owner(self, entity: dict[str, Any]): """ Parse ownership information, if it is set. 
@@ -180,3 +200,31 @@ def _parse_data_product(self, entity: dict[str, Any], matches) -> SearchResult: tags=tags, last_updated=last_updated, ) + + def _parse_facets( + self, facets: list[dict[str, Any]] + ) -> dict[ + Literal["domains", "tags", "customProperties", "glossaryTerms"], + list[FacetOption], + ]: + """ + Parse the facets and aggregate information from the query results + """ + results = {} + for facet in facets: + field = facet["field"] + if field not in ("domains", "tags", "customProperties", "glossaryTerms"): + continue + + options = [] + for aggregate in facet["aggregations"]: + value = aggregate["value"] + count = aggregate["count"] + entity = aggregate.get("entity") or {} + properties = entity.get("properties") or {} + label = properties.get("name", value) + options.append(FacetOption(value=value, label=label, count=count)) + + results[field] = options + + return results diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/openmetadata.py b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/openmetadata.py index 91ad4d46a2..989072dd93 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/client/openmetadata.py +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/client/openmetadata.py @@ -1,12 +1,6 @@ import json from http import HTTPStatus -from data_platform_catalogue.client.base import ( - BaseCatalogueClient, - CatalogueError, - ReferencedEntityMissing, - logger, -) from metadata.generated.schema.api.data.createDatabase import CreateDatabaseRequest from metadata.generated.schema.api.data.createDatabaseSchema import ( CreateDatabaseSchemaRequest, @@ -40,6 +34,7 @@ DataProductMetadata, TableMetadata, ) +from .base import BaseCatalogueClient, CatalogueError, ReferencedEntityMissing, logger OMD_DATA_TYPE_MAPPING = { "boolean": OpenMetadataDataType.BOOLEAN, diff --git a/python-libraries/data-platform-catalogue/data_platform_catalogue/search_types.py 
b/python-libraries/data-platform-catalogue/data_platform_catalogue/search_types.py index 3baf4b7063..da9ca3ad9a 100644 --- a/python-libraries/data-platform-catalogue/data_platform_catalogue/search_types.py +++ b/python-libraries/data-platform-catalogue/data_platform_catalogue/search_types.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, field from datetime import datetime from enum import Enum, auto -from typing import Any +from typing import Any, Literal class ResultType(Enum): @@ -9,6 +9,27 @@ class ResultType(Enum): TABLE = auto() +@dataclass +class MultiSelectFilter: + """ + Values to filter the result set by + """ + + filter_name: Literal["domains", "tags", "customProperties", "glossaryTerms"] + included_values: list[Any] + + +@dataclass +class FacetOption: + """ + A specific value that may be used to filter the search + """ + + value: str + label: str + count: int + + @dataclass class SearchResult: id: str @@ -25,3 +46,7 @@ class SearchResult: class SearchResponse: total_results: int page_results: list[SearchResult] + facets: dict[ + Literal["domains", "tags", "customProperties", "glossaryTerms"], + list[FacetOption], + ] = field(default_factory=dict) diff --git a/python-libraries/data-platform-catalogue/pyproject.toml b/python-libraries/data-platform-catalogue/pyproject.toml index fb50e4b4b9..d7aa6a5a5f 100644 --- a/python-libraries/data-platform-catalogue/pyproject.toml +++ b/python-libraries/data-platform-catalogue/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ministryofjustice-data-platform-catalogue" -version = "0.6.0" +version = "0.7.0" description = "Library to integrate the MoJ data platform with the catalogue component." 
authors = ["MoJ Data Platform Team "] license = "MIT" diff --git a/python-libraries/data-platform-catalogue/tests/test_datahub_search.py b/python-libraries/data-platform-catalogue/tests/test_datahub_search.py index d3c36dbd8e..acf00bdc46 100644 --- a/python-libraries/data-platform-catalogue/tests/test_datahub_search.py +++ b/python-libraries/data-platform-catalogue/tests/test_datahub_search.py @@ -4,6 +4,8 @@ import pytest from data_platform_catalogue.client.datahub.search import SearchClient from data_platform_catalogue.search_types import ( + FacetOption, + MultiSelectFilter, ResultType, SearchResponse, SearchResult, @@ -307,3 +309,109 @@ def test_result_with_owner(mock_graph, searcher): ) ], ) + + +def test_filter(searcher, mock_graph): + datahub_response = { + "searchAcrossEntities": { + "start": 0, + "count": 0, + "total": 0, + "searchResults": [], + } + } + mock_graph.execute_graphql = MagicMock(return_value=datahub_response) + + response = searcher.search(filters=[MultiSelectFilter("domains", ["Abc", "Def"])]) + + assert response == SearchResponse( + total_results=0, + page_results=[], + ) + + +def test_facets(searcher, mock_graph): + datahub_response = { + "searchAcrossEntities": { + "start": 0, + "count": 10, + "total": 10, + "searchResults": [], + "facets": [ + { + "field": "_entityType", + "displayName": "Type", + "aggregations": [ + {"value": "DATASET", "count": 1505, "entity": None} + ], + }, + { + "field": "glossaryTerms", + "displayName": "Glossary Term", + "aggregations": [ + { + "value": "urn:li:glossaryTerm:Classification.Sensitive", + "count": 1, + "entity": {"properties": {"name": "Sensitive"}}, + }, + { + "value": "urn:li:glossaryTerm:Silver", + "count": 1, + "entity": {"properties": None}, + }, + ], + }, + { + "field": "domains", + "displayName": "Domain", + "aggregations": [ + { + "value": "urn:li:domain:094dc54b-0ebc-40a6-a4cf-e1b75e8b8089", + "count": 7, + "entity": {"properties": {"name": "Pet Adoptions"}}, + }, + { + "value": 
"urn:li:domain:7186eeff-a860-4b0a-989f-69473a0c9c67", + "count": 4, + "entity": {"properties": {"name": "E-Commerce"}}, + }, + ], + }, + ], + } + } + + mock_graph.execute_graphql = MagicMock(return_value=datahub_response) + + response = searcher.search() + + assert response == SearchResponse( + total_results=10, + page_results=[], + facets={ + "glossaryTerms": [ + FacetOption( + value="urn:li:glossaryTerm:Classification.Sensitive", + label="Sensitive", + count=1, + ), + FacetOption( + value="urn:li:glossaryTerm:Silver", + label="urn:li:glossaryTerm:Silver", + count=1, + ), + ], + "domains": [ + FacetOption( + value="urn:li:domain:094dc54b-0ebc-40a6-a4cf-e1b75e8b8089", + label="Pet Adoptions", + count=7, + ), + FacetOption( + value="urn:li:domain:7186eeff-a860-4b0a-989f-69473a0c9c67", + label="E-Commerce", + count=4, + ), + ], + }, + ) diff --git a/python-libraries/data-platform-catalogue/tests/test_integration_with_datahub_server.py b/python-libraries/data-platform-catalogue/tests/test_integration_with_datahub_server.py index 3ad1bbabd4..aa54791c6d 100644 --- a/python-libraries/data-platform-catalogue/tests/test_integration_with_datahub_server.py +++ b/python-libraries/data-platform-catalogue/tests/test_integration_with_datahub_server.py @@ -13,7 +13,7 @@ from data_platform_catalogue import DataProductMetadata, TableMetadata from data_platform_catalogue.client.datahub.datahub_client import DataHubCatalogueClient from data_platform_catalogue.entities import DataLocation -from data_platform_catalogue.search_types import ResultType +from data_platform_catalogue.search_types import MultiSelectFilter, ResultType from datahub.metadata.schema_classes import DatasetPropertiesClass, SchemaMetadataClass jwt_token = os.environ.get("JWT_TOKEN") @@ -91,3 +91,36 @@ def test_search_for_data_product(): ) assert response.total_results >= 1 assert response.page_results[0].id == "urn:li:dataProduct:lfdskjflkjflkjsdflksfjds" + + +@runs_on_development_server +def 
test_search_by_domain(): + client = DataHubCatalogueClient(jwt_token=jwt_token, api_url=api_url) + + response = client.search( + filters=[MultiSelectFilter("domains", ["does-not-exist"])], + result_types=(ResultType.DATA_PRODUCT,), + ) + assert response.total_results == 0 + + +@runs_on_development_server +def test_domain_facets_are_returned(): + client = DataHubCatalogueClient(jwt_token=jwt_token, api_url=api_url) + + client = DataHubCatalogueClient(jwt_token=jwt_token, api_url=api_url) + + data_product = DataProductMetadata( + name="lfdskjflkjflkjsdflksfjds", + description="lfdskjflkjflkjsdflksfjds", + version="v1.0.0", + owner="7804c127-d677-4900-82f9-83517e51bb94", + email="justice@justice.gov.uk", + retention_period_in_days=365, + domain="Sample", + dpia_required=False, + ) + client.upsert_data_product(data_product) + + response = client.search() + assert response.facets["domains"]