Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for Vector Search Indexes #3266

Merged
merged 9 commits into from
Mar 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 40 additions & 40 deletions docs/resources/lakehouse_monitor.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,62 +19,62 @@ resource "databricks_catalog" "sandbox" {
}

resource "databricks_schema" "things" {
catalog_name = databricks_catalog.sandbox.id
name = "things"
comment = "this database is managed by terraform"
properties = {
kind = "various"
}
catalog_name = databricks_catalog.sandbox.id
name = "things"
comment = "this database is managed by terraform"
properties = {
kind = "various"
}

}

resource "databricks_sql_table" "myTestTable" {
catalog_name = "main"
schema_name = databricks_schema.things.name
name = "bar"
table_type = "MANAGED"
data_source_format = "DELTA"

column {
name = "timestamp"
position = 1
type = "int"
}
catalog_name = "main"
schema_name = databricks_schema.things.name
name = "bar"
table_type = "MANAGED"
data_source_format = "DELTA"

column {
name = "timestamp"
position = 1
type = "int"
}
}

resource "databricks_lakehouse_monitor" "testTimeseriesMonitor" {
table_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}.${databricks_sql_table.myTestTable.name}"
assets_dir = "/Shared/provider-test/databricks_lakehouse_monitoring/${databricks_sql_table.myTestTable.name}"
output_schema_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}"
time_series {
granularities = ["1 hour"]
timestamp_col = "timestamp"
}
table_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}.${databricks_sql_table.myTestTable.name}"
assets_dir = "/Shared/provider-test/databricks_lakehouse_monitoring/${databricks_sql_table.myTestTable.name}"
output_schema_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}"
time_series {
granularities = ["1 hour"]
timestamp_col = "timestamp"
}
}
```

### Inference Monitor

```hcl
resource "databricks_lakehouse_monitor" "testMonitorInference" {
table_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}.${databricks_table.myTestTable.name}"
assets_dir = "/Shared/provider-test/databricks_lakehouse_monitoring/${databricks_table.myTestTable.name}"
output_schema_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}"
inference_log {
granularities = ["1 hour"]
timestamp_col = "timestamp"
prediction_col = "prediction"
model_id_col = "model_id"
problem_type = "PROBLEM_TYPE_REGRESSION"
}
table_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}.${databricks_table.myTestTable.name}"
assets_dir = "/Shared/provider-test/databricks_lakehouse_monitoring/${databricks_table.myTestTable.name}"
output_schema_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}"
inference_log {
granularities = ["1 hour"]
timestamp_col = "timestamp"
prediction_col = "prediction"
model_id_col = "model_id"
problem_type = "PROBLEM_TYPE_REGRESSION"
}
}
```
### Snapshot Monitor
```hcl
resource "databricks_lakehouse_monitor" "testMonitorInference" {
table_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}.${databricks_table.myTestTable.name}"
assets_dir = "/Shared/provider-test/databricks_lakehouse_monitoring/${databricks_table.myTestTable.name}"
output_schema_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}"
snapshot {}
table_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}.${databricks_table.myTestTable.name}"
assets_dir = "/Shared/provider-test/databricks_lakehouse_monitoring/${databricks_table.myTestTable.name}"
output_schema_name = "${databricks_catalog.sandbox.name}.${databricks_schema.things.name}"
snapshot {}
}
```

Expand Down Expand Up @@ -129,4 +129,4 @@ The following resources are often used in the same context:

* [databricks_catalog](catalog.md)
* [databricks_schema](schema.md)
* [databricks_sql_table](sql_table.md)
* [databricks_sql_table](sql_table.md)
8 changes: 4 additions & 4 deletions docs/resources/model_serving.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ resource "databricks_model_serving" "this" {
config {
served_entities {
name = "prod_model"
entity_name = "ads-model"
entity_version = "2"
entity_name = "ads-model"
entity_version = "2"
workload_size = "Small"
scale_to_zero_enabled = true
}
served_entities {
name = "candidate_model"
entity_name = "ads-model"
entity_version = "4"
entity_name = "ads-model"
entity_version = "4"
workload_size = "Small"
scale_to_zero_enabled = false
}
Expand Down
14 changes: 7 additions & 7 deletions docs/resources/vector_search_endpoint.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,24 @@ resource "databricks_vector_search_endpoint" "this" {

## Argument Reference

The following arguments are supported:
The following arguments are supported (change of any parameter leads to recreation of the resource):

* `name` - (Required) Name of the Vector Search Endpoint to create. If name is changed, Vector Search Endpoint is recreated.
* `endpoint_type` (Required) type of Vector Search Endpoint. Currently only accepting single value: `STANDARD` (See [documentation](https://docs.databricks.com/api/workspace/vectorsearchendpoints/createendpoint) for the list of currently supported values). If it's changed, Vector Search Endpoint is recreated.
* `name` - (Required) Name of the Vector Search Endpoint to create.
* `endpoint_type` (Required) Type of Vector Search Endpoint. Currently only accepting single value: `STANDARD` (See [documentation](https://docs.databricks.com/api/workspace/vectorsearchendpoints/createendpoint) for the list of currently supported values).

## Attribute Reference

In addition to all arguments above, the following attributes are exported:
In addition to all the arguments above, the following attributes are exported:

* `id` - The same as the name of the endpoint.
* `creator` - Creator of the endpoint.
* `creation_timestamp` - Timestamp of endpoint creation (milliseconds).
* `last_updated_user` - User who last updated the endpoint.
* `last_updated_timestamp` - Timestamp of last update to the endpoint (milliseconds).
* `last_updated_timestamp` - Timestamp of the last update to the endpoint (milliseconds).
* `endpoint_id` - Unique internal identifier of the endpoint (UUID).
* `num_indexes` - Number of indexes on the endpoint.
* `endpoint_status` - Object describing the current status of the endpoint consisting of following fields:
* `state` - Current state of the endpoint. Currently following values are supported: `PROVISIONING`, `ONLINE`, `OFFLINE`.
* `endpoint_status` - Object describing the current status of the endpoint consisting of the following fields:
* `state` - Current state of the endpoint. Currently following values are supported: `PROVISIONING`, `ONLINE`, and `OFFLINE`.
* `message` - Additional status message.

## Import
Expand Down
79 changes: 79 additions & 0 deletions docs/resources/vector_search_index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
---
subcategory: "Vector Search"
---
# databricks_vector_search_index Resource

-> **Note** This resource can only be used on a Unity Catalog-enabled workspace!

This resource allows you to create [Vector Search Index](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html) in Databricks. Vector Search is a serverless similarity search engine that allows you to store a vector representation of your data, including metadata, in a vector database. The Vector Search Index provides the ability to search data in the linked Delta Table.

## Example Usage

```hcl
resource "databricks_vector_search_index" "sync" {
name = "main.default.vector_search_index"
endpoint_name = databricks_vector_search_endpoint.this.name
primary_key = "id"
index_type = "DELTA_SYNC"
delta_sync_index_spec {
source_table = "main.default.source_table"
pipeline_type = "TRIGGERED"
embedding_source_columns {
name = "text"
embedding_model_endpoint_name = databricks_model_serving.this.name
}
}
}
```

## Argument Reference

The following arguments are supported (change of any parameter leads to recreation of the resource):

* `name` - (required) Three-level name of the Vector Search Index to create (`catalog.schema.index_name`).
* `endpoint_name` - (required) The name of the Vector Search Endpoint that will be used for indexing the data.
* `primary_key` - (required) The column name that will be used as a primary key.
* `index_type` - (required) Vector Search index type. Currently supported values are:
* `DELTA_SYNC`: An index that automatically syncs with a source Delta Table, automatically and incrementally updating the index as the underlying data in the Delta Table changes.
* `DIRECT_ACCESS`: An index that supports the direct read and write of vectors and metadata through our REST and SDK APIs. With this model, the user manages index updates.
* `delta_sync_index_spec` - (object) Specification for Delta Sync Index. Required if `index_type` is `DELTA_SYNC`.
* `source_table` (required) The name of the source table.
* `embedding_source_columns` - (required if `embedding_vector_columns` isn't provided) array of objects representing columns that contain the embedding source. Each entry consists of:
* `name` - The name of the column
* `embedding_model_endpoint_name` - The name of the embedding model endpoint
* `embedding_vector_columns` - (required if `embedding_source_columns` isn't provided) array of objects representing columns that contain the embedding vectors. Each entry consists of:
* `name` - The name of the column.
* `embedding_dimension` - Dimension of the embedding vector.
* `pipeline_type` - Pipeline execution mode. Possible values are:
* `TRIGGERED`: If the pipeline uses the triggered execution mode, the system stops processing after successfully refreshing the source table in the pipeline once, ensuring the table is updated based on the data available when the update started.
* `CONTINUOUS`: If the pipeline uses continuous execution, the pipeline processes new data as it arrives in the source table to keep the vector index fresh.
* `direct_access_index_spec` - (object) Specification for Direct Vector Access Index. Required if `index_type` is `DIRECT_ACCESS`.
* `schema_json` - The schema of the index in JSON format. Check the [API documentation](https://docs.databricks.com/api/workspace/vectorsearchindexes/createindex#direct_access_index_spec-schema_json) for a list of supported data types.
* `embedding_source_columns` - (required if `embedding_vector_columns` isn't provided) array of objects representing columns that contain the embedding source. Each entry consists of:
* `name` - The name of the column
* `embedding_model_endpoint_name` - The name of the embedding model endpoint
* `embedding_vector_columns` - (required if `embedding_source_columns` isn't provided) array of objects representing columns that contain the embedding vectors. Each entry consists of:
* `name` - The name of the column.
* `embedding_dimension` - Dimension of the embedding vector.

## Attribute Reference

In addition to all arguments above, the following attributes are exported:

* `id` - The same as the name of the index.
* `creator` - Creator of the index.
* `delta_sync_index_spec`:
* `pipeline_id` - ID of the associated Delta Live Table pipeline.
* `status` - Object describing the current status of the index consisting of the following fields:
* `message` - Message associated with the index status
* `indexed_row_count` - Number of rows indexed
* `ready` - Whether the index is ready for search
* `index_url` - Index API Url to be used to perform operations on the index

## Import

The resource can be imported using the name of the Vector Search Index

```bash
terraform import databricks_vector_search_index.this <index-name>
```
1 change: 1 addition & 0 deletions provider/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ func DatabricksProvider() *schema.Provider {
"databricks_user_instance_profile": aws.ResourceUserInstanceProfile().ToResource(),
"databricks_user_role": aws.ResourceUserRole().ToResource(),
"databricks_vector_search_endpoint": vectorsearch.ResourceVectorSearchEndpoint().ToResource(),
"databricks_vector_search_index": vectorsearch.ResourceVectorSearchIndex().ToResource(),
"databricks_volume": catalog.ResourceVolume().ToResource(),
"databricks_workspace_conf": workspace.ResourceWorkspaceConf().ToResource(),
"databricks_workspace_file": workspace.ResourceWorkspaceFile().ToResource(),
Expand Down
4 changes: 2 additions & 2 deletions vectorsearch/resource_vector_search_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
"github.com/databricks/databricks-sdk-go/service/vectorsearch"
)

const defaultProvisionTimeout = 75 * time.Minute
const defaultEndpointProvisionTimeout = 75 * time.Minute
const deleteCallTimeout = 10 * time.Second

func ResourceVectorSearchEndpoint() common.Resource {
Expand Down Expand Up @@ -86,7 +86,7 @@ func ResourceVectorSearchEndpoint() common.Resource {
Schema: s,
SchemaVersion: 0,
Timeouts: &schema.ResourceTimeout{
Create: schema.DefaultTimeout(defaultProvisionTimeout),
Create: schema.DefaultTimeout(defaultEndpointProvisionTimeout),
},
}
}
6 changes: 3 additions & 3 deletions vectorsearch/resource_vector_search_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ func TestVectorSearchEndpointRead(t *testing.T) {
})
}

func TestResourcePASDelete(t *testing.T) {
func TestVectorSearchEndpointDelete(t *testing.T) {
qa.ResourceFixture{
MockWorkspaceClientFunc: func(a *mocks.MockWorkspaceClient) {
a.GetMockVectorSearchEndpointsAPI().EXPECT().DeleteEndpointByEndpointName(mock.Anything, "abc").Return(nil)
MockWorkspaceClientFunc: func(w *mocks.MockWorkspaceClient) {
w.GetMockVectorSearchEndpointsAPI().EXPECT().DeleteEndpointByEndpointName(mock.Anything, "abc").Return(nil)
},
Resource: ResourceVectorSearchEndpoint(),
Delete: true,
Expand Down
118 changes: 118 additions & 0 deletions vectorsearch/resource_vector_search_index.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package vectorsearch

import (
"context"
"errors"
"fmt"
"log"
"time"

"github.com/databricks/databricks-sdk-go"
"github.com/databricks/terraform-provider-databricks/common"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"

"github.com/databricks/databricks-sdk-go/apierr"
"github.com/databricks/databricks-sdk-go/service/vectorsearch"
)

const defaultIndexProvisionTimeout = 15 * time.Minute

// waitForVectorSearchIndexDeletion polls the Vector Search Indexes API until
// the index with the given name is no longer found, confirming that deletion
// has completed. It retries for up to defaultIndexProvisionTimeout and returns
// nil once the index is gone, or a wrapped error on any unexpected API failure.
func waitForVectorSearchIndexDeletion(w *databricks.WorkspaceClient, ctx context.Context, searchIndexName string) error {
	return retry.RetryContext(ctx, defaultIndexProvisionTimeout, func() *retry.RetryError {
		_, err := w.VectorSearchIndexes.GetIndexByIndexName(ctx, searchIndexName)
		if err == nil {
			// The index is still visible; keep polling until it disappears.
			return retry.RetryableError(fmt.Errorf("vector search index %s is still not deleted", searchIndexName))
		}
		if errors.Is(err, apierr.ErrResourceDoesNotExist) || errors.Is(err, apierr.ErrNotFound) {
			// Not-found means the deletion has finished.
			return nil
		}
		// Any other error is unexpected; stop retrying and surface it with context.
		return retry.NonRetryableError(fmt.Errorf("checking deletion of vector search index %s: %w", searchIndexName, err))
	})
}

// waitForSearchIndexCreation polls the Vector Search Indexes API until the
// index reports itself as ready. It waits slightly less than the full
// provision timeout so that a cleanup delete call still fits inside the
// resource's overall Create timeout.
func waitForSearchIndexCreation(w *databricks.WorkspaceClient, ctx context.Context, searchIndexName string) error {
	waitTime := defaultIndexProvisionTimeout - deleteCallTimeout
	return retry.RetryContext(ctx, waitTime, func() *retry.RetryError {
		idx, getErr := w.VectorSearchIndexes.GetIndexByIndexName(ctx, searchIndexName)
		if getErr != nil {
			return retry.NonRetryableError(getErr)
		}
		// NOTE: a more detailed provisioning status isn't exposed by the API
		// yet, so the boolean readiness flag is the only signal we can poll on.
		if !idx.Status.Ready {
			return retry.RetryableError(fmt.Errorf("vector search index %s is still pending", searchIndexName))
		}
		return nil
	})
}

func ResourceVectorSearchIndex() common.Resource {
s := common.StructToSchema(
vectorsearch.VectorIndex{},
func(s map[string]*schema.Schema) map[string]*schema.Schema {
common.MustSchemaPath(s, "delta_sync_index_spec", "embedding_vector_columns").MinItems = 1
exof := []string{"delta_sync_index_spec", "direct_access_index_spec"}
s["delta_sync_index_spec"].ExactlyOneOf = exof
s["direct_access_index_spec"].ExactlyOneOf = exof

common.CustomizeSchemaPath(s, "endpoint_name").SetRequired()
common.CustomizeSchemaPath(s, "primary_key").SetRequired()
common.CustomizeSchemaPath(s, "status").SetReadOnly()
common.CustomizeSchemaPath(s, "creator").SetReadOnly()
common.CustomizeSchemaPath(s, "name").SetRequired()
common.CustomizeSchemaPath(s, "index_type").SetRequired()
common.CustomizeSchemaPath(s, "delta_sync_index_spec", "pipeline_id").SetReadOnly()
return s
})

return common.Resource{
alexott marked this conversation as resolved.
Show resolved Hide resolved
Create: func(ctx context.Context, d *schema.ResourceData, c *common.DatabricksClient) error {
w, err := c.WorkspaceClient()
if err != nil {
return err
}
var req vectorsearch.CreateVectorIndexRequest
common.DataToStructPointer(d, s, &req)
_, err = w.VectorSearchIndexes.CreateIndex(ctx, req)
if err != nil {
return err
}
err = waitForSearchIndexCreation(w, ctx, req.Name)
if err != nil {
nestedErr := w.VectorSearchIndexes.DeleteIndexByIndexName(ctx, req.Name)
if nestedErr != nil {
log.Printf("[ERROR] Error cleaning up search index: %s", nestedErr.Error())
}
return err
}
d.SetId(req.Name)
return nil
},
Read: func(ctx context.Context, d *schema.ResourceData, c *common.DatabricksClient) error {
w, err := c.WorkspaceClient()
if err != nil {
return err
}
index, err := w.VectorSearchIndexes.GetIndexByIndexName(ctx, d.Id())
if err != nil {
return err
}
return common.StructToData(*index, s, d)
},
Delete: func(ctx context.Context, d *schema.ResourceData, c *common.DatabricksClient) error {
w, err := c.WorkspaceClient()
if err != nil {
return err
}
err = w.VectorSearchIndexes.DeleteIndexByIndexName(ctx, d.Id())
if err != nil {
return err
}
return waitForVectorSearchIndexDeletion(w, ctx, d.Id())
},
StateUpgraders: []schema.StateUpgrader{},
Schema: s,
SchemaVersion: 0,
Timeouts: &schema.ResourceTimeout{
Create: schema.DefaultTimeout(defaultIndexProvisionTimeout),
},
}
}
Loading
Loading