Skip to content

Commit

Permalink
[DP-3074] Add filtering to search function (#3075)
Browse files Browse the repository at this point in the history
* Add parameter for search filters

* Allow the search response to contain facet information

Facets are the dynamic search filters that show how the search results
break down across different dimensions. Returning these allows us
to display only options that are relevant to the current result set
and indicate the number of results matching each option.

For most filters, we are expected to pass a URN as the value, so
this is also the easiest way for the frontend to figure out what
the possible values are.

* Use relative imports throughout
  • Loading branch information
MatMoore authored Jan 26, 2024
1 parent 8797ae9 commit 2a8a835
Show file tree
Hide file tree
Showing 11 changed files with 296 additions and 25 deletions.
9 changes: 9 additions & 0 deletions python-libraries/data-platform-catalogue/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.7.0] 2024-01-25

### Added

- Added a `filters` parameter to the search function
- Added a `facets` attribute to the search response. This is a dictionary mapping
field names to lists of `FacetOption`, which expose values, display names and the
count of results with that value.

## [0.6.0] 2024-01-24

### Added
Expand Down
20 changes: 20 additions & 0 deletions python-libraries/data-platform-catalogue/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,26 @@ except CatalogueError:
print("oh no")
```

## Search example

```python
response = client.search()

# Total results across all pages
print(response.total_results)

# Iterate over search results
for item in response.page_results:
print(item)

# Iterate over facet options
for option in response.facets['domains']:
print(option)

# Filter by domain
client.search(filters=[MultiSelectFilter("domains", [response.facets['domains'][0].value])])
```

## Catalogue Implementations

### DataHub
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
DataProductMetadata,
TableMetadata,
)
from ..search_types import ResultType, SearchResponse
from ..search_types import MultiSelectFilter, ResultType, SearchResponse

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -65,6 +65,7 @@ def search(
ResultType.DATA_PRODUCT,
ResultType.TABLE,
),
filters: Sequence[MultiSelectFilter] = (),
) -> SearchResponse:
"""
Wraps the catalogue's search function.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,6 @@

import datahub.emitter.mce_builder as mce_builder
import datahub.metadata.schema_classes as schema_classes
from data_platform_catalogue.client.base import (
BaseCatalogueClient,
CatalogueError,
ReferencedEntityMissing,
logger,
)
from data_platform_catalogue.search_types import ResultType, SearchResponse
from datahub.emitter.mce_builder import make_data_platform_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
Expand All @@ -31,6 +24,8 @@
DataProductMetadata,
TableMetadata,
)
from ...search_types import MultiSelectFilter, ResultType, SearchResponse
from ..base import BaseCatalogueClient, CatalogueError, ReferencedEntityMissing, logger
from .search import SearchClient

DATAHUB_DATA_TYPE_MAPPING = {
Expand Down Expand Up @@ -325,10 +320,15 @@ def search(
ResultType.DATA_PRODUCT,
ResultType.TABLE,
),
filters: Sequence[MultiSelectFilter] = (),
) -> SearchResponse:
"""
Wraps the catalogue's search function.
"""
return self.search_client.search(
query=query, count=count, page=page, result_types=result_types
query=query,
count=count,
page=page,
result_types=result_types,
filters=filters,
)
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,45 @@ query Search(
$count: Int!
$start: Int!
$types: [EntityType!]
$filters: [FacetFilterInput!]
) {
searchAcrossEntities(
input: { types: $types, query: $query, start: $start, count: $count }
input: {
types: $types
query: $query
start: $start
count: $count
filters: $filters
}
) {
start
count
total
facets {
field
displayName
aggregations {
value
count
entity {
... on Domain {
properties {
name
}
}
... on Tag {
properties {
name
}
}
... on GlossaryTerm {
properties {
name
}
}
}
}
}
searchResults {
insights {
text
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@
import logging
from datetime import datetime
from importlib.resources import files
from typing import Any, Sequence
from typing import Any, Literal, Sequence

from data_platform_catalogue.search_types import (
from datahub.configuration.common import GraphError
from datahub.ingestion.graph.client import DataHubGraph

from ...search_types import (
FacetOption,
MultiSelectFilter,
ResultType,
SearchResponse,
SearchResult,
)
from datahub.configuration.common import GraphError
from datahub.ingestion.graph.client import DataHubGraph

logger = logging.getLogger(__name__)

Expand All @@ -33,6 +36,7 @@ def search(
ResultType.DATA_PRODUCT,
ResultType.TABLE,
),
filters: Sequence[MultiSelectFilter] = (),
) -> SearchResponse:
"""
Wraps the catalogue's search function.
Expand All @@ -43,8 +47,15 @@ def search(
start = int(page)

types = self._map_result_types(result_types)
formatted_filters = self._map_filters(filters)

variables = {"count": count, "query": query, "start": start, "types": types}
variables = {
"count": count,
"query": query,
"start": start,
"types": types,
"filters": formatted_filters,
}

try:
response = self.graph.execute_graphql(self.search_query, variables)
Expand All @@ -53,6 +64,7 @@ def search(

page_results = []
response = response["searchAcrossEntities"]
facets = self._parse_facets(response.get("facets", []))

logger.debug(json.dumps(response, indent=2))

Expand All @@ -71,7 +83,7 @@ def search(
raise ValueError(f"Unexpected entity type: {entity_type}")

return SearchResponse(
total_results=response["total"], page_results=page_results
total_results=response["total"], page_results=page_results, facets=facets
)

def _map_result_types(self, result_types: Sequence[ResultType]):
Expand All @@ -85,6 +97,14 @@ def _map_result_types(self, result_types: Sequence[ResultType]):
types.append("DATASET")
return types

def _map_filters(self, filters: Sequence[MultiSelectFilter]):
    """
    Convert filters into the format used for the search query's filters.

    Each filter becomes a dict with a "field" key holding the filter name
    and a "values" key holding the values to include. The order of the
    input filters is preserved; an empty input gives an empty list.
    """
    # The loop variable is deliberately not named `filter`, so the builtin
    # of that name is not shadowed.
    return [
        {"field": item.filter_name, "values": item.included_values}
        for item in filters
    ]

def _parse_owner(self, entity: dict[str, Any]):
"""
Parse ownership information, if it is set.
Expand Down Expand Up @@ -180,3 +200,31 @@ def _parse_data_product(self, entity: dict[str, Any], matches) -> SearchResult:
tags=tags,
last_updated=last_updated,
)

def _parse_facets(
    self, facets: list[dict[str, Any]]
) -> dict[
    Literal["domains", "tags", "customProperties", "glossaryTerms"],
    list[FacetOption],
]:
    """
    Parse the facets and aggregate information from the query results.

    Facets whose field is not one of the supported filter names are
    skipped. Every aggregation of a supported facet becomes a FacetOption
    whose label is the name of the linked entity when one was returned,
    and the raw value otherwise.

    `facets` may be None or empty, in which case an empty dict is returned.
    """
    supported_fields = ("domains", "tags", "customProperties", "glossaryTerms")

    results = {}
    # A `.get("facets", [])` default only covers a missing key, so an
    # explicit null in the response still arrives here as None.
    for facet in facets or ():
        field = facet["field"]
        if field not in supported_fields:
            continue

        options = []
        for aggregate in facet["aggregations"]:
            value = aggregate["value"]
            entity = aggregate.get("entity") or {}
            properties = entity.get("properties") or {}
            # Fall back to the raw value when the name is missing or null,
            # so that the label is always a usable string.
            label = properties.get("name") or value
            options.append(
                FacetOption(value=value, label=label, count=aggregate["count"])
            )

        results[field] = options

    return results
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
import json
from http import HTTPStatus

from data_platform_catalogue.client.base import (
BaseCatalogueClient,
CatalogueError,
ReferencedEntityMissing,
logger,
)
from metadata.generated.schema.api.data.createDatabase import CreateDatabaseRequest
from metadata.generated.schema.api.data.createDatabaseSchema import (
CreateDatabaseSchemaRequest,
Expand Down Expand Up @@ -40,6 +34,7 @@
DataProductMetadata,
TableMetadata,
)
from .base import BaseCatalogueClient, CatalogueError, ReferencedEntityMissing, logger

OMD_DATA_TYPE_MAPPING = {
"boolean": OpenMetadataDataType.BOOLEAN,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,35 @@
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum, auto
from typing import Any
from typing import Any, Literal


class ResultType(Enum):
    """
    The kinds of result that a search can be restricted to.
    """

    # Values are assigned by auto(), so only the member identity is meaningful.
    DATA_PRODUCT = auto()
    TABLE = auto()


@dataclass
class MultiSelectFilter:
    """
    Values to filter the result set by
    """

    # Name of the field to filter on; sent to the catalogue as the filter's
    # "field".
    filter_name: Literal["domains", "tags", "customProperties", "glossaryTerms"]
    # Values to include; sent to the catalogue as the filter's "values".
    # NOTE(review): for most filters these are presumably URNs, as returned in
    # FacetOption.value - confirm against the catalogue being used.
    included_values: list[Any]


@dataclass
class FacetOption:
    """
    A specific value that may be used to filter the search
    """

    # The value to pass back in a filter's included values.
    value: str
    # Display name for the option.
    label: str
    # Number of results in the current result set that have this value.
    count: int


@dataclass
class SearchResult:
id: str
Expand All @@ -25,3 +46,7 @@ class SearchResult:
class SearchResponse:
total_results: int
page_results: list[SearchResult]
facets: dict[
Literal["domains", "tags", "customProperties", "glossaryTerms"],
list[FacetOption],
] = field(default_factory=dict)
2 changes: 1 addition & 1 deletion python-libraries/data-platform-catalogue/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ministryofjustice-data-platform-catalogue"
version = "0.6.0"
version = "0.7.0"
description = "Library to integrate the MoJ data platform with the catalogue component."
authors = ["MoJ Data Platform Team <data-platform-tech@digital.justice.gov.uk>"]
license = "MIT"
Expand Down
Loading

0 comments on commit 2a8a835

Please sign in to comment.