diff --git a/docs/guides/experimental-exporter.md b/docs/guides/experimental-exporter.md index 6f41bf6154..9e0f357c03 100644 --- a/docs/guides/experimental-exporter.md +++ b/docs/guides/experimental-exporter.md @@ -61,6 +61,9 @@ All arguments are optional, and they tune what code is being generated. * `-listing` - Comma-separated list of services to be listed and further passed on for importing. For each service specified, the exporter performs a listing of available resources using the `List` function and emits them for importing together with their dependencies. The `-services` parameter could be used to control which transitive dependencies will be also imported. * `-services` - Comma-separated list of services to import. By default, all services are imported. * `-match` - Match resource names during listing operation. This filter applies to all resources that are getting listed, so if you want to import all dependencies of just one cluster, specify `-match=autoscaling -listing=compute`. By default, it is empty, which matches everything. +* `-matchRegex` - Match resource names against a given regex during listing operation. Applicable to all resources selected for listing. +* `-excludeRegex` - Exclude resource names matching a given regex. Applied during the listing operation and has higher priority than `-match` and `-matchRegex`. Applicable to all resources selected for listing. Could be used to exclude things like `databricks_automl` notebooks, etc. +* `-filterDirectoriesDuringWorkspaceWalking` - Apply the match logic to directory names as well while walking the workspace tree. *Note: use this option with care, because the filter is then applied to all entries. If you want to select only specific users, the expression must also match `/Users` itself, so the regex will look like `^(/Users|/Users/[a-c].*)$`*. * `-mounts` - List DBFS mount points, an extremely slow operation that would not trigger unless explicitly specified. 
* `-generateProviderDeclaration` - the flag that toggles the generation of `databricks.tf` file with the declaration of the Databricks Terraform provider that is necessary for Terraform versions since Terraform 0.13 (disabled by default). * `-prefix` - optional prefix that will be added to the name of all exported resources - that's useful for exporting resources from multiple workspaces for merging into a single one. diff --git a/exporter/command.go b/exporter/command.go index 5e40b9a039..72eb8f25dd 100644 --- a/exporter/command.go +++ b/exporter/command.go @@ -131,6 +131,8 @@ func Run(args ...string) error { flags.BoolVar(&ic.mounts, "mounts", false, "List DBFS mount points.") flags.BoolVar(&ic.generateDeclaration, "generateProviderDeclaration", true, "Generate Databricks provider declaration.") + flags.BoolVar(&ic.filterDirectoriesDuringWorkspaceWalking, "filterDirectoriesDuringWorkspaceWalking", false, + "Apply filtering to directory names during workspace walking") flags.StringVar(&ic.notebooksFormat, "notebooksFormat", "SOURCE", "Format to export notebooks: SOURCE, DBC, JUPYTER. Default: SOURCE") services, listing := ic.allServicesAndListing() @@ -145,6 +147,12 @@ func Run(args ...string) error { flags.StringVar(&ic.match, "match", "", "Match resource names during listing operation. "+ "This filter applies to all resources that are getting listed, so if you want to import "+ "all dependencies of just one cluster, specify -listing=compute") + flags.StringVar(&ic.matchRegexStr, "matchRegex", "", "Match resource names during listing operation against a regex. "+ + "This filter applies to all resources that are getting listed, so if you want to import "+ + "all dependencies of just one cluster, specify -listing=compute") + flags.StringVar(&ic.excludeRegexStr, "excludeRegex", "", "Exclude resource names matching regex during listing operation. 
"+ + "This filter applies to all resources that are getting listed, so if you want to import "+ + "all dependencies of just one cluster, specify -listing=compute") prefix := "" flags.StringVar(&prefix, "prefix", "", "Prefix that will be added to the name of all exported resources") newArgs := args diff --git a/exporter/context.go b/exporter/context.go index ffb230a4e8..bfba5d24f1 100644 --- a/exporter/context.go +++ b/exporter/context.go @@ -78,28 +78,33 @@ type importContext struct { Scope importedResources // command-line resources (immutable, or set by the single thread) - includeUserDomains bool - importAllUsers bool - exportDeletedUsersAssets bool - incremental bool - mounts bool - noFormat bool - nativeImportSupported bool - services map[string]struct{} - listing map[string]struct{} - match string - lastActiveDays int64 - lastActiveMs int64 - generateDeclaration bool - exportSecrets bool - meAdmin bool - meUserName string - prefix string - accountLevel bool - shImports map[string]bool - notebooksFormat string - updatedSinceStr string - updatedSinceMs int64 + includeUserDomains bool + importAllUsers bool + exportDeletedUsersAssets bool + incremental bool + mounts bool + noFormat bool + nativeImportSupported bool + services map[string]struct{} + listing map[string]struct{} + match string + matchRegexStr string + matchRegex *regexp.Regexp + excludeRegexStr string + excludeRegex *regexp.Regexp + filterDirectoriesDuringWorkspaceWalking bool + lastActiveDays int64 + lastActiveMs int64 + generateDeclaration bool + exportSecrets bool + meAdmin bool + meUserName string + prefix string + accountLevel bool + shImports map[string]bool + notebooksFormat string + updatedSinceStr string + updatedSinceMs int64 waitGroup *sync.WaitGroup @@ -297,6 +302,24 @@ func (ic *importContext) Run() error { return fmt.Errorf("no services to import") } + if ic.matchRegexStr != "" { + log.Printf("[DEBUG] Using regex '%s' to filter resources", ic.matchRegexStr) + re, err := 
regexp.Compile(ic.matchRegexStr) + if err != nil { + log.Printf("[ERROR] can't compile regex '%s': %v", ic.matchRegexStr, err) + return err + } + ic.matchRegex = re + } + if ic.excludeRegexStr != "" { + log.Printf("[DEBUG] Using regex '%s' to filter resources", ic.excludeRegexStr) + re, err := regexp.Compile(ic.excludeRegexStr) + if err != nil { + log.Printf("[ERROR] can't compile regex '%s': %v", ic.excludeRegexStr, err) + return err + } + ic.excludeRegex = re + } if ic.incremental { if ic.updatedSinceStr == "" { ic.updatedSinceStr = getLastRunString(statsFileName) diff --git a/exporter/exporter_test.go b/exporter/exporter_test.go index 9c2f64cf15..ad485b9557 100644 --- a/exporter/exporter_test.go +++ b/exporter/exporter_test.go @@ -2349,7 +2349,7 @@ func TestImportingGlobalSqlConfig(t *testing.T) { }) } -func TestImportingNotebooksWorkspaceFiles(t *testing.T) { +func TestImportingNotebooksWorkspaceFilesWithFilter(t *testing.T) { fileStatus := workspace.ObjectStatus{ ObjectID: 123, ObjectType: workspace.File, @@ -2371,7 +2371,135 @@ func TestImportingNotebooksWorkspaceFiles(t *testing.T) { Method: "GET", Resource: "/api/2.0/workspace/list?path=%2F", Response: workspace.ObjectList{ - Objects: []workspace.ObjectStatus{notebookStatus, fileStatus}, + Objects: []workspace.ObjectStatus{notebookStatus, fileStatus, + { + ObjectID: 4567, + ObjectType: workspace.Notebook, + Path: "/UnmatchedNotebook", + Language: "PYTHON", + }, + { + ObjectID: 1234, + ObjectType: workspace.File, + Path: "/UnmatchedFile", + }, + { + ObjectID: 456, + ObjectType: workspace.Directory, + Path: "/databricks_automl", + }, + { + ObjectID: 456, + ObjectType: workspace.Directory, + Path: "/.bundle", + }, + }, + }, + ReuseRequest: true, + }, + { + Method: "GET", + Resource: "/api/2.0/workspace/list?path=%2Fdatabricks_automl", + Response: workspace.ObjectList{}, + }, + { + Method: "GET", + Resource: "/api/2.0/workspace/get-status?path=%2FNotebook", + Response: notebookStatus, + ReuseRequest: true, + }, 
+ { + Method: "GET", + Resource: "/api/2.0/workspace/get-status?path=%2FFile", + Response: fileStatus, + ReuseRequest: true, + }, + { + Method: "GET", + Resource: "/api/2.0/workspace/export?format=AUTO&path=%2FFile", + Response: workspace.ExportPath{ + Content: "dGVzdA==", + }, + ReuseRequest: true, + }, + { + Method: "GET", + Resource: "/api/2.0/workspace/export?format=SOURCE&path=%2FNotebook", + Response: workspace.ExportPath{ + Content: "dGVzdA==", + }, + ReuseRequest: true, + }, + }, + func(ctx context.Context, client *common.DatabricksClient) { + tmpDir := fmt.Sprintf("/tmp/tf-%s", qa.RandomName()) + defer os.RemoveAll(tmpDir) + + ic := newImportContext(client) + ic.Directory = tmpDir + ic.enableListing("notebooks,wsfiles") + ic.excludeRegexStr = "databricks_automl" + ic.matchRegexStr = "^/[FN].*$" + + err := ic.Run() + assert.NoError(t, err) + // check generated code for notebooks + content, err := os.ReadFile(tmpDir + "/notebooks.tf") + assert.NoError(t, err) + contentStr := string(content) + assert.True(t, strings.Contains(contentStr, `resource "databricks_notebook" "notebook_456"`)) + assert.True(t, strings.Contains(contentStr, `path = "/Notebook"`)) + assert.False(t, strings.Contains(contentStr, `/UnmatchedNotebook`)) + // check generated code for workspace files + content, err = os.ReadFile(tmpDir + "/wsfiles.tf") + assert.NoError(t, err) + contentStr = string(content) + assert.True(t, strings.Contains(contentStr, `resource "databricks_workspace_file" "file_123"`)) + assert.True(t, strings.Contains(contentStr, `path = "/File"`)) + assert.False(t, strings.Contains(contentStr, `/UnmatchedFile`)) + }) +} + +func TestImportingNotebooksWorkspaceFilesWithFilterDuringWalking(t *testing.T) { + fileStatus := workspace.ObjectStatus{ + ObjectID: 123, + ObjectType: workspace.File, + Path: "/File", + } + notebookStatus := workspace.ObjectStatus{ + ObjectID: 456, + ObjectType: workspace.Notebook, + Path: "/Notebook", + Language: "PYTHON", + } + qa.HTTPFixturesApply(t, 
+ []qa.HTTPFixture{ + meAdminFixture, + noCurrentMetastoreAttached, + emptyRepos, + emptyIpAccessLIst, + { + Method: "GET", + Resource: "/api/2.0/workspace/list?path=%2F", + Response: workspace.ObjectList{ + Objects: []workspace.ObjectStatus{notebookStatus, fileStatus, + { + ObjectID: 4567, + ObjectType: workspace.Notebook, + Path: "/UnmatchedNotebook", + Language: "PYTHON", + }, + { + ObjectID: 1234, + ObjectType: workspace.File, + Path: "/UnmatchedFile", + }, + { + ObjectID: 456, + ObjectType: workspace.Directory, + Path: "/databricks_automl", + }, + }, }, ReuseRequest: true, }, @@ -2410,10 +2538,27 @@ func TestImportingNotebooksWorkspaceFiles(t *testing.T) { ic := newImportContext(client) ic.Directory = tmpDir - ic.enableListing("notebooks") + ic.enableListing("notebooks,wsfiles") + ic.excludeRegexStr = "databricks_automl" + ic.matchRegexStr = "^/[FN].*$" + ic.filterDirectoriesDuringWorkspaceWalking = true err := ic.Run() assert.NoError(t, err) + // check generated code for notebooks + content, err := os.ReadFile(tmpDir + "/notebooks.tf") + assert.NoError(t, err) + contentStr := string(content) + assert.True(t, strings.Contains(contentStr, `resource "databricks_notebook" "notebook_456"`)) + assert.True(t, strings.Contains(contentStr, `path = "/Notebook"`)) + assert.False(t, strings.Contains(contentStr, `/UnmatchedNotebook`)) + // check generated code for workspace files + content, err = os.ReadFile(tmpDir + "/wsfiles.tf") + assert.NoError(t, err) + contentStr = string(content) + assert.True(t, strings.Contains(contentStr, `resource "databricks_workspace_file" "file_123"`)) + assert.True(t, strings.Contains(contentStr, `path = "/File"`)) + assert.False(t, strings.Contains(contentStr, `/UnmatchedFile`)) }) } diff --git a/exporter/util.go b/exporter/util.go index e9380a9b56..5e4f53dcaa 100644 --- a/exporter/util.go +++ b/exporter/util.go @@ -35,9 +35,15 @@ func (ic *importContext) isServiceInListing(service string) bool { } func (ic *importContext) MatchesName(n 
string) bool { - if ic.match == "" { + if ic.match == "" && ic.matchRegex == nil && ic.excludeRegex == nil { return true } + if ic.excludeRegex != nil && ic.excludeRegex.MatchString(n) { + return false + } + if ic.matchRegex != nil { + return ic.matchRegex.MatchString(n) + } return strings.Contains(strings.ToLower(n), strings.ToLower(ic.match)) } diff --git a/exporter/util_test.go b/exporter/util_test.go index 588c831db7..912baa78b4 100644 --- a/exporter/util_test.go +++ b/exporter/util_test.go @@ -316,16 +316,16 @@ func TestGetEnvAsInt(t *testing.T) { } func TestExcludeAuxiliaryDirectories(t *testing.T) { - assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "", ObjectType: workspace.Directory})) - assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{ObjectType: workspace.File})) - assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/user@domain.com/abc", + assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "", ObjectType: workspace.Directory})) + assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{ObjectType: workspace.File})) + assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/user@domain.com/abc", ObjectType: workspace.Directory})) // should be ignored - assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/user@domain.com/.ide", + assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/user@domain.com/.ide", ObjectType: workspace.Directory})) - assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Shared/.bundle", + assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Shared/.bundle", ObjectType: workspace.Directory})) - assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/user@domain.com/abc/__pycache__", + assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/user@domain.com/abc/__pycache__", ObjectType: workspace.Directory})) } 
diff --git a/exporter/util_workspace.go b/exporter/util_workspace.go index 5a5621f806..8dcbefbaf0 100644 --- a/exporter/util_workspace.go +++ b/exporter/util_workspace.go @@ -93,17 +93,18 @@ func (ic *importContext) getAllDirectories() []workspace.ObjectStatus { var directoriesToIgnore = []string{".ide", ".bundle", "__pycache__"} // TODO: add ignoring directories of deleted users? This could potentially decrease the number of processed objects... -func excludeAuxiliaryDirectories(v workspace.ObjectStatus) bool { +func isAuxiliaryDirectory(v workspace.ObjectStatus) bool { if v.ObjectType != workspace.Directory { - return true + return false } // TODO: rewrite to use suffix check, etc., instead of split and slice contains? parts := strings.Split(v.Path, "/") result := len(parts) > 1 && slices.Contains[[]string, string](directoriesToIgnore, parts[len(parts)-1]) + log.Printf("[DEBUG] directory %s: %v", v.Path, result) if result { log.Printf("[DEBUG] Ignoring directory %s", v.Path) } - return !result + return result } func (ic *importContext) getAllWorkspaceObjects(visitor func([]workspace.ObjectStatus)) []workspace.ObjectStatus { @@ -113,7 +114,15 @@ func (ic *importContext) getAllWorkspaceObjects(visitor func([]workspace.ObjectS t1 := time.Now() log.Print("[INFO] Starting to list all workspace objects") notebooksAPI := workspace.NewNotebooksAPI(ic.Context, ic.Client) - ic.allWorkspaceObjects, _ = ListParallel(notebooksAPI, "/", excludeAuxiliaryDirectories, visitor) + shouldIncludeDirectory := func(v workspace.ObjectStatus) bool { + decision := !isAuxiliaryDirectory(v) + if decision && ic.filterDirectoriesDuringWorkspaceWalking { + decision = ic.MatchesName(v.Path) + } + // log.Printf("[DEBUG] decision of shouldIncludeDirectory for %s: %v", v.Path, decision) + return decision + } + ic.allWorkspaceObjects, _ = ListParallel(notebooksAPI, "/", shouldIncludeDirectory, visitor) log.Printf("[INFO] Finished listing of all workspace objects. %d objects in total. 
%v seconds", len(ic.allWorkspaceObjects), time.Since(t1).Seconds()) }