Skip to content

Commit

Permalink
[DP-2983] Add a method for search (#3051)
Browse files Browse the repository at this point in the history
* Add a method for search

This method accepts a query and pagination variables, and returns
the total number of results plus the results to display on the
current page.

For each search result, return:

- the result type, DataSet or DataProduct
- ID, name and description
- tags
- a dictionary of additional metadata fields (its exact contents
  will depend on what tests well with users)
- information about the properties which matched the search query
- the time the metadata was last updated

The raw responses from Datahub are logged at DEBUG level.

For now I've excluded filters, facets, and sorting but these are
supported by the underlying API.

See
- https://datahubproject.io/docs/graphql/inputObjects#facetfilterinput
- https://datahubproject.io/docs/graphql/objects#facetmetadata
- https://datahubproject.io/docs/graphql/inputObjects#searchsortinput

* Parameterise result types in search queries
  • Loading branch information
MatMoore authored Jan 24, 2024
1 parent 332a213 commit c242e9a
Show file tree
Hide file tree
Showing 13 changed files with 737 additions and 6 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,4 @@ __pycache__
.github/path-filter/pytest.yml
.github/path-filter/containers.yml
.github/path-filter/terraform.yml
**/**datahub
.coverage
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from .client import DataHubCatalogueClient # noqa: F401
from .client import CatalogueError, ReferencedEntityMissing # noqa: F401
from .entities import DataProductMetadata # noqa: F401
from .entities import CatalogueMetadata, DataLocation, TableMetadata # noqa: F401
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from .base import BaseCatalogueClient # noqa: F401
from .base import CatalogueError # noqa: F401
from .base import ReferencedEntityMissing # noqa: F401
from .datahub import DataHubCatalogueClient # noqa: F401
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import logging
from abc import ABC, abstractmethod
from typing import Sequence

from ..entities import (
CatalogueMetadata,
DataLocation,
DataProductMetadata,
TableMetadata,
)
from ..search_types import ResultType, SearchResponse

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -53,3 +55,18 @@ def upsert_table(
data_product_metadata: DataProductMetadata | None = None,
) -> str:
pass

def search(
    self,
    query: str = "*",
    count: int = 20,
    page: str | None = None,
    result_types: Sequence[ResultType] = (
        ResultType.DATA_PRODUCT,
        ResultType.TABLE,
    ),
) -> SearchResponse:
    """
    Search the catalogue; this base implementation is a placeholder.

    Unlike the upsert methods this is not declared abstract, so a
    client that does not override it can still be instantiated and
    only fails when ``search`` is actually called.

    Args:
        query: Search text. Defaults to ``"*"``.
        count: Pagination variable; defaults to 20.
        page: Pagination variable identifying the page to return, or
            ``None`` (the default). NOTE(review): exact format is
            defined by the concrete client - confirm there.
        result_types: Kinds of result to include. Defaults to data
            products and tables.

    Returns:
        A ``SearchResponse`` when implemented by a subclass.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .datahub_client import DataHubCatalogueClient # noqa: F401
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Sequence

import datahub.emitter.mce_builder as mce_builder
import datahub.metadata.schema_classes as schema_classes
from data_platform_catalogue.client.base import (
Expand All @@ -6,6 +8,7 @@
ReferencedEntityMissing,
logger,
)
from data_platform_catalogue.search_types import ResultType, SearchResponse
from datahub.emitter.mce_builder import make_data_platform_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
Expand All @@ -22,12 +25,13 @@
SchemaMetadataClass,
)

from ..entities import (
from ...entities import (
CatalogueMetadata,
DataLocation,
DataProductMetadata,
TableMetadata,
)
from .search import SearchClient

DATAHUB_DATA_TYPE_MAPPING = {
"boolean": schema_classes.BooleanTypeClass(),
Expand Down Expand Up @@ -81,6 +85,7 @@ def __init__(self, jwt_token, api_url: str, graph=None):
server=self.gms_endpoint, token=jwt_token
)
self.graph = graph or DataHubGraph(self.server_config)
self.search_client = SearchClient(self.graph)

def upsert_database_service(self, platform: str = "glue", *args, **kwargs) -> str:
"""
Expand Down Expand Up @@ -310,3 +315,20 @@ def upsert_table(
self.graph.emit(metadata_event)

return dataset_urn

def search(
    self,
    query: str = "*",
    count: int = 20,
    page: str | None = None,
    result_types: Sequence[ResultType] = (
        ResultType.DATA_PRODUCT,
        ResultType.TABLE,
    ),
) -> SearchResponse:
    """
    Run a catalogue search by delegating to ``self.search_client``.

    Every argument is forwarded unchanged, by keyword, to
    ``SearchClient.search``; this method adds no logic of its own.

    Args:
        query: Search text. Defaults to ``"*"``.
        count: Pagination variable; defaults to 20.
        page: Pagination variable identifying the page to return, or
            ``None`` (the default).
        result_types: Kinds of result to include. Defaults to data
            products and tables.

    Returns:
        The ``SearchResponse`` produced by the search client.
    """
    # Keyword forwarding keeps this wrapper independent of the
    # parameter order of SearchClient.search.
    forwarded = {
        "query": query,
        "count": count,
        "page": page,
        "result_types": result_types,
    }
    return self.search_client.search(**forwarded)
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Search query sent to the catalogue's GraphQL API.
#
# Variables:
#   $query - search text (required)
#   $count - pagination variable passed through as `count` (required)
#   $start - pagination variable passed through as `start` (required)
#   $types - optional list of entity types passed through as `types`
#
# Selects the pagination fields (start, count, total) and, for each search
# result, its insights, the fields that matched, and the entity itself.
# Entity details are selected for Dataset and DataProduct only.
query Search(
$query: String!
$count: Int!
$start: Int!
$types: [EntityType!]
) {
searchAcrossEntities(
input: { types: $types, query: $query, start: $start, count: $count }
) {
start
count
total
searchResults {
insights {
text
}
# Name/value pairs for the properties that matched the query.
matchedFields {
name
value
}
entity {
# Selected for every entity, whichever fragment below applies.
type
# ---- Fields selected when the entity is a Dataset ----
... on Dataset {
urn
# NOTE(review): `type` is already selected on the entity above, so this
# repeats that selection.
type
platform {
name
}
# Owners may be users or groups; each variant selects its own name field.
ownership {
owners {
owner {
... on CorpUser {
urn
properties {
fullName
email
}
}
... on CorpGroup {
urn
properties {
displayName
email
}
}
}
}
}
name
properties {
name
qualifiedName
description
customProperties {
key
value
}
created
lastModified
}
# Selected separately from properties.description above.
editableProperties {
description
}
tags {
tags {
tag {
urn
properties {
name
description
}
}
}
}
lastIngested
domain {
domain {
urn
id
properties {
name
description
}
}
}
}
# ---- Fields selected when the entity is a DataProduct ----
# Same ownership/domain/tags selections as Dataset; no platform, name,
# editableProperties or lastIngested, and properties adds numAssets.
... on DataProduct {
urn
type
ownership {
owners {
owner {
... on CorpUser {
urn
properties {
fullName
email
}
}
... on CorpGroup {
urn
properties {
displayName
email
}
}
}
}
}
properties {
name
description
customProperties {
key
value
}
numAssets
}
domain {
domain {
urn
id
properties {
name
description
}
}
}
tags {
tags {
tag {
urn
properties {
name
description
}
}
}
}
}
}
}
}
}
Loading

0 comments on commit c242e9a

Please sign in to comment.