From ccfaccf36ddb6ca5850838767d3e5e043d87cd34 Mon Sep 17 00:00:00 2001
From: Sergei Zaychenko
Date: Thu, 21 Nov 2024 23:59:28 +0200
Subject: [PATCH] Introduced DatasetRegistry abstraction, encapsulating listing and resolution of datasets (#941)

* Introduced `DatasetRegistry` abstraction, encapsulating listing and resolution of datasets.

Key changes:
- Registry is backed by database-stored dataset entries, which are maintained automatically
- Scope of `DatasetRepository` is now limited to supporting `DatasetRegistry` and the in-memory dataset dependency graph
- New concept of `ResolvedDataset`: a wrapper around `Arc`, aware of dataset identity
- `DatasetRegistryRepoBridge` utility connects both abstractions in a simple way for testing needs
- Query and Dataset Search functions now consider only the datasets accessible to the current user
- Core services now explicitly separate planning (transactional) and execution (non-transactional) processing phases
- A similar decomposition was introduced in the task system execution logic
- Revised implementation of core commands and services: `pull`, `push`, `reset`, `verify`, `compact`, setting watermark
- More parallelism in the `pull` command, allowing ingest/sync/transform operations at the same depth level to be mixed
- Optimized `pull` flow when a single non-recursive dataset is sent for processing
- Batched form of dataset authorization checks
- Ensured correct transactionality for dataset lookup and authorization checks across the code base
- Multi/single tenancy is now passed as an enum configuration instead of a boolean
- Renamed the outbox "durability" term to "delivery mechanism" to clarify the design intent
- Greatly reduced complexity and code duplication of many use case and service tests with the `oop` macro for inheritance of harnesses

* Lock corrections

* v0.208.0 + minor updates
---
CHANGELOG.md | 16 + Cargo.lock | 230 +-- Cargo.toml | 138 +- LICENSE.txt | 4 +- resources/openapi-mt.json | 2 +- resources/openapi.json | 2 +- .../auth-oso/src/oso_dataset_authorizer.rs | 81 +- .../tests/test_oso_dataset_authorizer.rs | 9 +- .../src/mutations/dataset_metadata_mut.rs | 10 +- .../graphql/src/mutations/dataset_mut.rs | 12 +- .../graphql/src/mutations/datasets_mut.rs | 8 +- .../mutations/flows_mut/flows_mut_utils.rs | 34 +- .../src/queries/accounts/account_flow_runs.rs | 6 +- .../graphql/src/queries/datasets/dataset.rs | 24 +- .../src/queries/datasets/dataset_data.rs | 12 +- .../src/queries/datasets/dataset_metadata.rs | 57 +- .../graphql/src/queries/datasets/datasets.rs | 23 +- .../src/queries/datasets/metadata_chain.rs | 29 +- src/adapter/graphql/src/queries/flows/flow.rs | 11 +- .../graphql/src/queries/flows/flow_outcome.rs | 9 +- .../graphql/src/queries/flows/flow_trigger.rs | 8 +- src/adapter/graphql/src/queries/search.rs | 26 +- .../graphql/src/scalars/flow_configuration.rs | 6 +- src/adapter/graphql/src/utils.rs | 8 +- .../tests/tests/test_error_handling.rs | 15 +- .../tests/test_gql_account_flow_configs.rs | 27 +- .../graphql/tests/tests/test_gql_data.rs | 28 +- .../tests/tests/test_gql_dataset_env_vars.rs | 17 +- .../tests/test_gql_dataset_flow_configs.rs | 48 +- .../tests/tests/test_gql_dataset_flow_runs.rs | 64 +- .../graphql/tests/tests/test_gql_datasets.rs | 51 +- .../graphql/tests/tests/test_gql_metadata.rs | 8 +- .../tests/tests/test_gql_metadata_chain.rs | 16 +- .../graphql/tests/tests/test_gql_search.rs | 12 +- src/adapter/http/src/data/ingest_handler.rs | 30 +- src/adapter/http/src/data/metadata_handler.rs | 19 +- src/adapter/http/src/data/query_handler.rs | 
9 - src/adapter/http/src/data/query_types.rs | 9 - src/adapter/http/src/data/router.rs | 9 - src/adapter/http/src/data/tail_handler.rs | 9 - src/adapter/http/src/data/verify_handler.rs | 9 - src/adapter/http/src/data/verify_types.rs | 9 - .../http/src/e2e/system_time_handler.rs | 9 - .../http/src/general/account_handler.rs | 9 - .../http/src/general/dataset_info_handler.rs | 17 +- .../http/src/general/node_info_handler.rs | 15 +- src/adapter/http/src/general/router.rs | 9 - .../http/src/http_server_dataset_router.rs | 14 +- .../middleware/dataset_authorization_layer.rs | 97 +- .../src/middleware/dataset_resolver_layer.rs | 48 +- src/adapter/http/src/middleware/mod.rs | 2 - .../run_in_database_transaction_layer.rs | 91 -- .../http/src/simple_protocol/handlers.rs | 50 +- .../axum_server_push_protocol.rs | 20 +- src/adapter/http/src/smart_protocol/phases.rs | 2 + .../smart_protocol/ws_tungstenite_client.rs | 39 +- .../http/tests/harness/client_side_harness.rs | 81 +- .../http/tests/harness/common_harness.rs | 11 +- .../http/tests/harness/server_side_harness.rs | 7 +- .../harness/server_side_local_fs_harness.rs | 47 +- .../tests/harness/server_side_s3_harness.rs | 34 +- .../http/tests/harness/test_api_server.rs | 16 +- src/adapter/http/tests/tests/mod.rs | 42 +- .../http/tests/tests/test_account_info.rs | 10 +- .../http/tests/tests/test_data_ingest.rs | 2 +- .../http/tests/tests/test_data_query.rs | 2 +- .../tests/test_dataset_authorization_layer.rs | 5 +- .../http/tests/tests/test_dataset_info.rs | 9 +- .../http/tests/tests/test_node_info.rs | 9 +- .../tests/test_platform_login_validate.rs | 3 +- .../tests/test_protocol_dataset_helpers.rs | 7 +- src/adapter/http/tests/tests/test_routing.rs | 19 +- .../http/tests/tests/test_upload_local.rs | 4 +- .../http/tests/tests/test_upload_s3.rs | 3 +- ...xisting_evolved_dataset_reread_succeeds.rs | 4 +- ...rio_aborted_read_of_new_reread_succeeds.rs | 2 +- ...cenario_existing_advanced_dataset_fails.rs | 6 +- .../scenario_existing_diverged_dataset.rs | 12 +- .../scenario_existing_evolved_dataset.rs | 4 +- .../scenario_existing_up_to_date_dataset.rs | 2 +- .../scenarios/scenario_new_dataset.rs | 2 +- .../tests_pull/test_smart_pull_local_fs.rs | 2 + .../tests/tests_pull/test_smart_pull_s3.rs | 2 + .../tests_pull/test_smart_pull_special.rs | 9 +- ...o_aborted_write_of_new_rewrite_succeeds.rs | 2 +- ...orted_write_of_updated_rewrite_succeeds.rs | 8 +- ...isting_dataset_fails_as_server_advanced.rs | 4 +- .../scenario_existing_diverged_dataset.rs | 12 +- .../scenario_existing_evolved_dataset.rs | 8 +- .../scenario_existing_ref_collision.rs | 2 +- .../scenario_existing_up_to_date_dataset.rs | 2 +- .../scenarios/scenario_new_dataset.rs | 2 +- .../scenario_new_dataset_via_repo_ref.rs | 2 +- .../tests_push/test_smart_push_local_fs.rs | 2 + .../tests/tests_push/test_smart_push_s3.rs | 2 + .../tests_push/test_smart_push_special.rs | 91 +- src/adapter/odata/src/context.rs | 47 +- src/adapter/odata/src/handler.rs | 10 +- .../odata/tests/tests/test_api_server.rs | 10 +- .../odata/tests/tests/test_handlers.rs | 13 +- src/app/cli/src/app.rs | 90 +- src/app/cli/src/cli_commands.rs | 69 +- src/app/cli/src/commands/add_command.rs | 17 +- src/app/cli/src/commands/alias_add_command.rs | 15 +- .../cli/src/commands/alias_delete_command.rs | 41 +- .../cli/src/commands/alias_list_command.rs | 32 +- src/app/cli/src/commands/compact_command.rs | 45 +- src/app/cli/src/commands/complete_command.rs | 21 +- src/app/cli/src/commands/delete_command.rs | 17 +- 
src/app/cli/src/commands/ingest_command.rs | 18 +- src/app/cli/src/commands/init_command.rs | 17 +- .../src/commands/inspect_lineage_command.rs | 13 +- .../cli/src/commands/inspect_query_command.rs | 14 +- src/app/cli/src/commands/list_command.rs | 48 +- src/app/cli/src/commands/log_command.rs | 18 +- src/app/cli/src/commands/pull_command.rs | 155 +- src/app/cli/src/commands/push_command.rs | 67 +- src/app/cli/src/commands/reset_command.rs | 20 +- .../cli/src/commands/set_watermark_command.rs | 50 +- .../commands/system_api_server_run_command.rs | 11 +- .../src/commands/system_diagnose_command.rs | 59 +- .../cli/src/commands/system_e2e_command.rs | 17 +- .../src/commands/system_ipfs_add_command.rs | 16 +- src/app/cli/src/commands/ui_command.rs | 9 +- src/app/cli/src/commands/verify_command.rs | 163 ++- src/app/cli/src/database.rs | 5 +- src/app/cli/src/error.rs | 1 - src/app/cli/src/explore/api_server.rs | 20 +- src/app/cli/src/explore/web_ui_server.rs | 24 +- .../src/services/accounts/account_service.rs | 27 +- .../services/workspace/workspace_layout.rs | 8 +- src/app/cli/tests/tests/test_di_graph.rs | 29 +- .../src/entities/current_account_subject.rs | 16 + ...ebac_dataset_lifecycle_message_consumer.rs | 4 +- .../src/auth/dataset_action_authorizer.rs | 39 + src/domain/core/src/entities/dataset.rs | 3 + src/domain/core/src/entities/engine.rs | 3 +- .../core/src/entities/metadata_chain.rs | 1 + src/domain/core/src/entities/mod.rs | 4 + .../core/src/entities/resolved_dataset.rs | 74 + .../src/entities/resolved_datasets_map.rs | 64 + src/domain/core/src/lib.rs | 1 + src/domain/core/src/repos/dataset_registry.rs | 56 - .../core/src/repos/dataset_repository.rs | 66 +- src/domain/core/src/repos/mod.rs | 2 - .../core/src/services/compaction_service.rs | 19 +- .../core/src/services/dataset_registry.rs | 99 ++ .../services/ingest/polling_ingest_service.rs | 52 +- .../services/ingest/push_ingest_service.rs | 31 +- src/domain/core/src/services/mod.rs | 16 +- .../core/src/services/pull_request_planner.rs | 427 ++++++ src/domain/core/src/services/pull_service.rs | 335 ----- ...ush_service.rs => push_request_planner.rs} | 55 +- .../src/services/remote_aliases_registry.rs | 36 +- src/domain/core/src/services/reset_service.rs | 2 +- src/domain/core/src/services/sync_service.rs | 133 +- src/domain/core/src/services/transform/mod.rs | 20 + .../transform_elaboration_service.rs | 84 ++ .../transform/transform_execution_service.rs | 110 ++ .../services/transform/transform_listener.rs | 46 + .../transform/transform_request_planner.rs | 229 +++ .../src/services/transform/transform_types.rs | 110 ++ .../core/src/services/transform_service.rs | 209 --- .../core/src/services/verification_service.rs | 19 +- .../core/src/services/watermark_service.rs | 95 ++ .../src/testing/mock_dataset_repository.rs | 28 +- .../src/use_cases/compact_dataset_use_case.rs | 42 + src/domain/core/src/use_cases/mod.rs | 12 + .../src/use_cases/pull_dataset_use_case.rs | 41 + .../src/use_cases/push_dataset_use_case.rs | 29 + .../src/use_cases/reset_dataset_use_case.rs | 26 + .../src/use_cases/set_watermark_use_case.rs | 26 + .../src/use_cases/verify_dataset_use_case.rs | 38 + src/domain/core/src/utils/mod.rs | 3 + src/domain/core/src/utils/tenancy_config.rs | 24 + src/domain/datasets/domain/Cargo.toml | 1 + .../src/repos/dataset_entry_repository.rs | 56 +- .../src/services/dataset_entry_service.rs | 48 + .../datasets/domain/src/services/mod.rs | 2 + src/domain/datasets/services/Cargo.toml | 1 + .../services/src/dataset_entry_indexer.rs | 6 
+- .../services/src/dataset_entry_service.rs | 148 -- .../src/dataset_entry_service_impl.rs | 556 ++++++++ src/domain/datasets/services/src/lib.rs | 4 +- .../tests/tests/test_dataset_entry_service.rs | 37 +- .../domain/src/entities/flow/flow_outcome.rs | 4 +- .../services/src/flow/flow_executor_impl.rs | 24 +- .../flow_configuration_service_impl.rs | 4 +- .../test_flow_configuration_service_impl.rs | 8 +- .../tests/tests/test_flow_executor_impl.rs | 156 +- .../tests/utils/flow_config_test_listener.rs | 4 +- .../tests/tests/utils/flow_harness_shared.rs | 10 +- .../tests/utils/flow_system_test_listener.rs | 4 +- .../services/tests/tests/utils/task_driver.rs | 2 +- src/domain/task-system/domain/Cargo.toml | 1 + .../domain/src/entities/logical_plan.rs | 32 +- .../task-system/domain/src/services/mod.rs | 6 +- .../src/services/task_definition_planner.rs | 68 + ..._logical_plan_runner.rs => task_runner.rs} | 8 +- src/domain/task-system/services/Cargo.toml | 5 + .../task-system/services/src/dependencies.rs | 3 +- src/domain/task-system/services/src/lib.rs | 6 +- .../src/task_definition_planner_impl.rs | 172 +++ .../services/src/task_executor_impl.rs | 28 +- .../src/task_logical_plan_runner_impl.rs | 214 --- .../services/src/task_runner_impl.rs | 236 ++++ .../tests/tests/test_task_aggregate.rs | 18 +- .../tests/tests/test_task_executor_impl.rs | 109 +- .../tests/tests/test_task_scheduler_impl.rs | 62 +- .../src/commands/test_compact_command.rs | 2 +- src/infra/core/Cargo.toml | 3 + src/infra/core/src/compaction_service_impl.rs | 102 +- .../core/src/dataset_changes_service_impl.rs | 37 +- .../src/dataset_ownership_service_inmem.rs | 12 +- .../core/src/dataset_registry_repo_bridge.rs | 81 ++ .../src/dependency_graph_repository_inmem.rs | 2 +- .../src/dependency_graph_service_inmem.rs | 4 +- .../src/engine/engine_datafusion_inproc.rs | 2 + .../core/src/engine/engine_io_strategy.rs | 43 +- src/infra/core/src/engine/engine_odf.rs | 31 +- .../src/engine/engine_provisioner_local.rs | 8 +- .../src/ingest/polling_ingest_service_impl.rs | 77 +- .../src/ingest/push_ingest_service_impl.rs | 49 +- src/infra/core/src/lib.rs | 19 +- src/infra/core/src/provenance_service_impl.rs | 87 +- .../core/src/pull_request_planner_impl.rs | 759 ++++++++++ src/infra/core/src/pull_service_impl.rs | 791 ----------- .../core/src/push_request_planner_impl.rs | 79 ++ src/infra/core/src/push_service_impl.rs | 177 --- src/infra/core/src/query/mod.rs | 161 +-- src/infra/core/src/query_service_impl.rs | 141 +- .../core/src/remote_alias_resolver_impl.rs | 23 +- .../core/src/remote_aliases_registry_impl.rs | 54 +- .../core/src/repos/dataset_factory_impl.rs | 8 +- src/infra/core/src/repos/dataset_impl.rs | 8 + .../src/repos/dataset_repository_helpers.rs | 4 +- .../src/repos/dataset_repository_local_fs.rs | 63 +- .../core/src/repos/dataset_repository_s3.rs | 54 +- src/infra/core/src/reset_service_impl.rs | 34 +- src/infra/core/src/sync_request_builder.rs | 193 +++ src/infra/core/src/sync_service_impl.rs | 376 ++--- .../core/src/testing/dataset_test_helper.rs | 39 +- .../dummy_smart_transfer_protocol_client.rs | 11 +- .../core/src/testing/metadata_factory.rs | 17 + .../testing/mock_dataset_action_authorizer.rs | 46 + .../testing/mock_dataset_changes_service.rs | 1 + .../mock_dependency_graph_repository.rs | 1 + .../mock_odf_server_access_token_resolver.rs | 1 + .../testing/mock_polling_source_service.rs | 46 +- .../core/src/testing/mock_sync_service.rs | 64 + .../mock_transform_elaboration_service.rs | 60 + 
.../mock_transform_execution_service.rs | 51 + .../testing/mock_transform_request_planner.rs | 74 + .../src/testing/mock_transform_service.rs | 106 -- src/infra/core/src/testing/mod.rs | 10 +- src/infra/core/src/transform/mod.rs | 18 + .../transform_elaboration_service_impl.rs | 230 +++ .../transform_execution_service_impl.rs | 354 +++++ .../core/src/transform/transform_helpers.rs | 305 ++++ .../transform_request_planner_impl.rs | 327 +++++ src/infra/core/src/transform_service_impl.rs | 959 ------------- ...nd_dataset_metadata_batch_use_case_impl.rs | 6 + .../commit_dataset_event_use_case_impl.rs | 18 +- .../compact_dataset_use_case_impl.rs | 111 ++ ...ate_dataset_from_snapshot_use_case_impl.rs | 1 + .../use_cases/create_dataset_use_case_impl.rs | 1 + .../use_cases/delete_dataset_use_case_impl.rs | 30 +- src/infra/core/src/use_cases/mod.rs | 12 + .../use_cases/pull_dataset_use_case_impl.rs | 599 ++++++++ .../use_cases/push_dataset_use_case_impl.rs | 225 +++ .../use_cases/rename_dataset_use_case_impl.rs | 20 +- .../use_cases/reset_dataset_use_case_impl.rs | 70 + .../use_cases/set_watermark_use_case_impl.rs | 76 + .../use_cases/verify_dataset_use_case_impl.rs | 133 ++ .../core/src/utils/datasets_filtering.rs | 44 +- .../src/utils/simple_transfer_protocol.rs | 69 +- .../core/src/utils/smart_transfer_protocol.rs | 8 +- .../core/src/verification_service_impl.rs | 165 +-- src/infra/core/src/watermark_service_impl.rs | 152 ++ .../parallel_simple_transfer_protocol.rs | 76 +- .../core/tests/tests/engine/test_engine_io.rs | 86 +- .../tests/engine/test_engine_transform.rs | 124 +- .../tests/tests/ingest/test_polling_ingest.rs | 148 +- .../tests/tests/ingest/test_push_ingest.rs | 78 +- .../core/tests/tests/ingest/test_writer.rs | 7 +- src/infra/core/tests/tests/mod.rs | 4 +- .../repos/test_dataset_repository_local_fs.rs | 43 +- .../tests/repos/test_dataset_repository_s3.rs | 47 +- .../repos/test_dataset_repository_shared.rs | 60 +- .../tests/tests/test_compact_service_impl.rs | 269 ++-- .../test_dataset_changes_service_impl.rs | 229 +-- .../test_dataset_ownership_service_inmem.rs | 98 +- .../tests/tests/test_datasets_filtering.rs | 183 +-- .../tests/test_dependency_graph_inmem.rs | 85 +- .../tests/test_pull_request_planner_impl.rs | 849 +++++++++++ .../tests/tests/test_pull_service_impl.rs | 1252 ----------------- .../tests/test_push_request_planner_impl.rs | 354 +++++ .../tests/tests/test_query_service_impl.rs | 62 +- .../tests/tests/test_reset_service_impl.rs | 68 +- .../tests/tests/test_search_service_impl.rs | 22 +- src/infra/core/tests/tests/test_setup.rs | 1 - .../tests/tests/test_sync_service_impl.rs | 434 +++--- .../tests/test_transform_service_impl.rs | 303 ++-- .../tests/test_verification_service_impl.rs | 159 ++- .../tests/test_watermark_service_impl.rs | 192 +++ .../tests/use_cases/base_use_case_harness.rs | 85 ++ src/infra/core/tests/tests/use_cases/mod.rs | 11 + .../use_cases/outbox_expectation_helpers.rs | 93 ++ ..._append_dataset_metadata_batch_use_case.rs | 161 +-- .../test_commit_dataset_event_use_case.rs | 116 +- .../test_compact_dataset_use_case.rs | 189 +++ ...t_create_dataset_from_snapshot_use_case.rs | 110 +- .../use_cases/test_create_dataset_use_case.rs | 71 +- .../use_cases/test_delete_dataset_use_case.rs | 173 +-- .../use_cases/test_pull_dataset_use_case.rs | 803 +++++++++++ .../use_cases/test_push_dataset_use_case.rs | 401 ++++++ .../use_cases/test_rename_dataset_use_case.rs | 103 +- .../use_cases/test_reset_dataset_use_case.rs | 100 ++ 
.../use_cases/test_set_watermark_use_case.rs | 108 ++ .../use_cases/test_verify_dataset_use_case.rs | 220 +++ .../core/tests/utils/base_repo_harness.rs | 149 ++ src/infra/core/tests/utils/ftp_server.rs | 2 +- .../tests/utils/mock_engine_provisioner.rs | 1 + src/infra/core/tests/utils/mod.rs | 6 + .../core/tests/utils/transform_test_helper.rs | 120 ++ src/infra/datasets/inmem/Cargo.toml | 2 + .../repos/inmem_dateset_entry_repository.rs | 171 ++- .../test_inmem_dataset_entry_repository.rs | 16 + ...c17cd7452d48887938a2a28cbd9a1408472e2.json | 41 + ...5eca95214025291d1bd310c3900040a3c9c8.json} | 6 +- ...568af9d34f9614bb06303f05fc42601a07523.json | 22 + ...3113608f5ea1bdcf8319aa8e00a97e55269ed.json | 42 + ...c66874757b9f56f23ed86f8494c6ed4b0b7a.json} | 4 +- src/infra/datasets/postgres/Cargo.toml | 2 + .../postgres_dataset_entry_repository.rs | 166 ++- .../test_postgres_dataset_entry_repository.rs | 16 + src/infra/datasets/repo-tests/Cargo.toml | 1 + .../dataset_entry_repository_test_suite.rs | 267 +++- ...c17cd7452d48887938a2a28cbd9a1408472e2.json | 38 + ...568af9d34f9614bb06303f05fc42601a07523.json | 20 + ...113608f5ea1bdcf8319aa8e00a97e55269ed.json} | 6 +- src/infra/datasets/sqlite/Cargo.toml | 2 + .../repos/sqlite_dateset_entry_repository.rs | 198 ++- .../test_sqlite_dataset_entry_repository.rs | 16 + .../ingest-datafusion/benches/cdc_project.rs | 4 +- src/infra/ingest-datafusion/benches/ledger.rs | 9 +- .../ingest-datafusion/benches/snapshot.rs | 9 +- .../src/task_system_repository_test_suite.rs | 46 +- src/utils/database-common/src/entities.rs | 2 +- .../src/kamu_cli_puppet_ext.rs | 2 +- .../src/consumers/message_consumer.rs | 8 +- .../src/consumers/message_consumers_utils.rs | 14 +- .../src/consumers/message_dispatcher.rs | 2 +- .../src/executors/outbox_executor.rs | 10 +- .../implementation/outbox_dispatching_impl.rs | 33 +- src/utils/messaging-outbox/tests/mod.rs | 4 +- .../tests/test_dispatching_outbox_impl.rs | 18 +- .../tests/tests/test_immediate_outbox_impl.rs | 8 +- .../tests/tests/test_outbox_executor.rs | 8 +- 359 files changed, 15921 insertions(+), 9568 deletions(-) delete mode 100644 src/adapter/http/src/middleware/run_in_database_transaction_layer.rs create mode 100644 src/domain/core/src/entities/resolved_dataset.rs create mode 100644 src/domain/core/src/entities/resolved_datasets_map.rs delete mode 100644 src/domain/core/src/repos/dataset_registry.rs create mode 100644 src/domain/core/src/services/dataset_registry.rs create mode 100644 src/domain/core/src/services/pull_request_planner.rs delete mode 100644 src/domain/core/src/services/pull_service.rs rename src/domain/core/src/services/{push_service.rs => push_request_planner.rs} (67%) create mode 100644 src/domain/core/src/services/transform/mod.rs create mode 100644 src/domain/core/src/services/transform/transform_elaboration_service.rs create mode 100644 src/domain/core/src/services/transform/transform_execution_service.rs create mode 100644 src/domain/core/src/services/transform/transform_listener.rs create mode 100644 src/domain/core/src/services/transform/transform_request_planner.rs create mode 100644 src/domain/core/src/services/transform/transform_types.rs delete mode 100644 src/domain/core/src/services/transform_service.rs create mode 100644 src/domain/core/src/services/watermark_service.rs create mode 100644 src/domain/core/src/use_cases/compact_dataset_use_case.rs create mode 100644 src/domain/core/src/use_cases/pull_dataset_use_case.rs create mode 100644 src/domain/core/src/use_cases/push_dataset_use_case.rs 
create mode 100644 src/domain/core/src/use_cases/reset_dataset_use_case.rs create mode 100644 src/domain/core/src/use_cases/set_watermark_use_case.rs create mode 100644 src/domain/core/src/use_cases/verify_dataset_use_case.rs create mode 100644 src/domain/core/src/utils/tenancy_config.rs create mode 100644 src/domain/datasets/domain/src/services/dataset_entry_service.rs delete mode 100644 src/domain/datasets/services/src/dataset_entry_service.rs create mode 100644 src/domain/datasets/services/src/dataset_entry_service_impl.rs create mode 100644 src/domain/task-system/domain/src/services/task_definition_planner.rs rename src/domain/task-system/domain/src/services/{task_logical_plan_runner.rs => task_runner.rs} (77%) create mode 100644 src/domain/task-system/services/src/task_definition_planner_impl.rs delete mode 100644 src/domain/task-system/services/src/task_logical_plan_runner_impl.rs create mode 100644 src/domain/task-system/services/src/task_runner_impl.rs create mode 100644 src/infra/core/src/dataset_registry_repo_bridge.rs create mode 100644 src/infra/core/src/pull_request_planner_impl.rs delete mode 100644 src/infra/core/src/pull_service_impl.rs create mode 100644 src/infra/core/src/push_request_planner_impl.rs delete mode 100644 src/infra/core/src/push_service_impl.rs create mode 100644 src/infra/core/src/sync_request_builder.rs create mode 100644 src/infra/core/src/testing/mock_sync_service.rs create mode 100644 src/infra/core/src/testing/mock_transform_elaboration_service.rs create mode 100644 src/infra/core/src/testing/mock_transform_execution_service.rs create mode 100644 src/infra/core/src/testing/mock_transform_request_planner.rs delete mode 100644 src/infra/core/src/testing/mock_transform_service.rs create mode 100644 src/infra/core/src/transform/mod.rs create mode 100644 src/infra/core/src/transform/transform_elaboration_service_impl.rs create mode 100644 src/infra/core/src/transform/transform_execution_service_impl.rs create mode 100644 src/infra/core/src/transform/transform_helpers.rs create mode 100644 src/infra/core/src/transform/transform_request_planner_impl.rs delete mode 100644 src/infra/core/src/transform_service_impl.rs create mode 100644 src/infra/core/src/use_cases/compact_dataset_use_case_impl.rs create mode 100644 src/infra/core/src/use_cases/pull_dataset_use_case_impl.rs create mode 100644 src/infra/core/src/use_cases/push_dataset_use_case_impl.rs create mode 100644 src/infra/core/src/use_cases/reset_dataset_use_case_impl.rs create mode 100644 src/infra/core/src/use_cases/set_watermark_use_case_impl.rs create mode 100644 src/infra/core/src/use_cases/verify_dataset_use_case_impl.rs create mode 100644 src/infra/core/src/watermark_service_impl.rs create mode 100644 src/infra/core/tests/tests/test_pull_request_planner_impl.rs delete mode 100644 src/infra/core/tests/tests/test_pull_service_impl.rs create mode 100644 src/infra/core/tests/tests/test_push_request_planner_impl.rs create mode 100644 src/infra/core/tests/tests/test_watermark_service_impl.rs create mode 100644 src/infra/core/tests/tests/use_cases/base_use_case_harness.rs create mode 100644 src/infra/core/tests/tests/use_cases/outbox_expectation_helpers.rs create mode 100644 src/infra/core/tests/tests/use_cases/test_compact_dataset_use_case.rs create mode 100644 src/infra/core/tests/tests/use_cases/test_pull_dataset_use_case.rs create mode 100644 src/infra/core/tests/tests/use_cases/test_push_dataset_use_case.rs create mode 100644 src/infra/core/tests/tests/use_cases/test_reset_dataset_use_case.rs create 
mode 100644 src/infra/core/tests/tests/use_cases/test_set_watermark_use_case.rs create mode 100644 src/infra/core/tests/tests/use_cases/test_verify_dataset_use_case.rs create mode 100644 src/infra/core/tests/utils/base_repo_harness.rs create mode 100644 src/infra/core/tests/utils/transform_test_helper.rs create mode 100644 src/infra/datasets/postgres/.sqlx/query-13fe35a7997b790566736b78e16c17cd7452d48887938a2a28cbd9a1408472e2.json rename src/infra/datasets/postgres/.sqlx/{query-a0155b21b942423c8590308e767ef41a2d958ad8296b835f7fb9d8b87ec5e4f9.json => query-2bcdb350c9c397529fafa84a0b575eca95214025291d1bd310c3900040a3c9c8.json} (83%) create mode 100644 src/infra/datasets/postgres/.sqlx/query-a35cae0015f9dd08f3095ee317c568af9d34f9614bb06303f05fc42601a07523.json create mode 100644 src/infra/datasets/postgres/.sqlx/query-b7aa1b2d72f9ec6955f8370e0dc3113608f5ea1bdcf8319aa8e00a97e55269ed.json rename src/infra/datasets/postgres/.sqlx/{query-7954a6acf1cdb627dfe2890b042679ef9e3886268865cce559cf2268c66ea800.json => query-fcb34f3fa8f59b1f8190694fc38dc66874757b9f56f23ed86f8494c6ed4b0b7a.json} (85%) create mode 100644 src/infra/datasets/sqlite/.sqlx/query-13fe35a7997b790566736b78e16c17cd7452d48887938a2a28cbd9a1408472e2.json create mode 100644 src/infra/datasets/sqlite/.sqlx/query-a35cae0015f9dd08f3095ee317c568af9d34f9614bb06303f05fc42601a07523.json rename src/infra/datasets/sqlite/.sqlx/{query-a0155b21b942423c8590308e767ef41a2d958ad8296b835f7fb9d8b87ec5e4f9.json => query-b7aa1b2d72f9ec6955f8370e0dc3113608f5ea1bdcf8319aa8e00a97e55269ed.json} (55%) diff --git a/CHANGELOG.md b/CHANGELOG.md index d4c09aa949..65f3167e12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,22 @@ Recommendation: for ease of reading, use the following order: ## [0.207.3] - 2024-11-21 ### Changed - Add version for `OutboxMessage` structure to prevent startup failures after breaking changes +- Introduced `DatasetRegistry` abstraction, encapsulating listing and resolution of datasets: + - Registry is backed by database-stored dataset entries, which are automatically maintained + - Scope for `DatasetRepository` is now limited to support `DatasetRegistry` and in-memory dataset dependency graph + - New concept of `ResolvedDataset`: a wrapper arround `Arc`, aware of dataset identity + - `DatasetRegistryRepoBridge` utility connects both abstractions in a simple way for testing needs + - Query and Dataset Search functions now consider only the datasets accessible for current user + - Core services now explicitly separate planning (transactional) and execution (non-transactional) processing phases + - Similar decomposition introduced in task system execution logic + - Revised implementation of core commands and services: `pull`, `push`, `reset`, `verify`, `compact`, setting watermark + - More parallelism from `pull` command, allowing to mix ingest/sync/transform operations of the same depth level + - Optimized `pull` flow, when a single non-recursive dataset is sent for processing + - Batched form for dataset authorization checks + - Ensuring correct transactionality for dataset lookup and authorization checks all over the code base + - Passing multi/single tenancy as an enum configuration instead of boolean + - Renamed outbox "durability" term to "delivery mechanism" to clarify the design intent + - Greatly reduced complexity and code duplication of many use case and service tests with `oop` macro for inheritance of harnesses ## [0.207.2] - 2024-11-15 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index 15608e2d1f..43312f6749 100644 
--- a/Cargo.lock +++ b/Cargo.lock @@ -1314,7 +1314,7 @@ dependencies = [ [[package]] name = "async-utils" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", ] @@ -1432,9 +1432,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.61.0" +version = "1.62.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e531658a0397d22365dfe26c3e1c0c8448bf6a3a2d8a098ded802f2b1261615" +checksum = "83d3a2854c7490b4c63d2b0e8c3976d628c80afa3045d078a715b2edb2ee4e0a" dependencies = [ "aws-credential-types", "aws-runtime", @@ -2428,9 +2428,9 @@ dependencies = [ [[package]] name = "const-hex" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0121754e84117e65f9d90648ee6aa4882a6e63110307ab73967a4c5e7e69e586" +checksum = "487981fa1af147182687064d0a2c336586d337a606595ced9ffb0c685c250c73" dependencies = [ "cfg-if", "cpufeatures", @@ -2473,7 +2473,7 @@ checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "container-runtime" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "cfg-if", @@ -2897,7 +2897,7 @@ checksum = "e8566979429cf69b49a5c740c60791108e86440e8be149bbea4fe54d2c32d6e2" [[package]] name = "database-common" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "aws-config", @@ -2921,7 +2921,7 @@ dependencies = [ [[package]] name = "database-common-macros" -version = "0.207.3" +version = "0.208.0" dependencies = [ "quote", "syn 2.0.89", @@ -3765,7 +3765,7 @@ dependencies = [ [[package]] name = "enum-variants" -version = "0.207.3" +version = "0.208.0" [[package]] name = "env_filter" @@ -3834,7 +3834,7 @@ dependencies = [ [[package]] name = "event-sourcing" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -3850,7 +3850,7 @@ dependencies = [ [[package]] name = "event-sourcing-macros" -version = "0.207.3" +version = "0.208.0" dependencies = [ "quote", "syn 2.0.89", @@ -4535,7 +4535,7 @@ dependencies = [ [[package]] name = "http-common" -version = "0.207.3" +version = "0.208.0" dependencies = [ "axum", "http 1.1.0", @@ -4654,7 +4654,7 @@ dependencies = [ "hyper 1.5.1", "hyper-util", "rustls 0.23.17", - "rustls-native-certs 0.8.0", + "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -4941,7 +4941,7 @@ checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" [[package]] name = "init-on-startup" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "database-common", @@ -4984,7 +4984,7 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "internal-error" -version = "0.207.3" +version = "0.208.0" dependencies = [ "thiserror 1.0.69", ] @@ -5132,7 +5132,7 @@ dependencies = [ [[package]] name = "kamu" -version = "0.207.3" +version = "0.208.0" dependencies = [ "alloy", "async-recursion", @@ -5161,6 +5161,7 @@ dependencies = [ "filetime", "flatbuffers", "flate2", + "fs_extra", "futures", "glob", "hex", @@ -5170,6 +5171,7 @@ dependencies = [ "init-on-startup", "internal-error", "itertools 0.13.0", + "kamu", "kamu-accounts", "kamu-accounts-inmem", "kamu-accounts-services", @@ -5185,6 +5187,7 @@ dependencies = [ "mockall", "nanoid", "object_store", + "oop", "opendatafabric", "parking_lot", "petgraph", @@ -5222,7 +5225,7 @@ dependencies = [ [[package]] name = "kamu-accounts" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", 
"base32", @@ -5248,7 +5251,7 @@ dependencies = [ [[package]] name = "kamu-accounts-inmem" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -5269,7 +5272,7 @@ dependencies = [ [[package]] name = "kamu-accounts-mysql" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -5290,7 +5293,7 @@ dependencies = [ [[package]] name = "kamu-accounts-postgres" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -5311,7 +5314,7 @@ dependencies = [ [[package]] name = "kamu-accounts-repo-tests" -version = "0.207.3" +version = "0.208.0" dependencies = [ "argon2", "chrono", @@ -5327,7 +5330,7 @@ dependencies = [ [[package]] name = "kamu-accounts-services" -version = "0.207.3" +version = "0.208.0" dependencies = [ "argon2", "async-trait", @@ -5354,7 +5357,7 @@ dependencies = [ [[package]] name = "kamu-accounts-sqlite" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -5375,7 +5378,7 @@ dependencies = [ [[package]] name = "kamu-adapter-auth-oso" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "dill", @@ -5397,7 +5400,7 @@ dependencies = [ [[package]] name = "kamu-adapter-flight-sql" -version = "0.207.3" +version = "0.208.0" dependencies = [ "arrow-flight", "async-trait", @@ -5420,7 +5423,7 @@ dependencies = [ [[package]] name = "kamu-adapter-graphql" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-graphql", "async-trait", @@ -5472,7 +5475,7 @@ dependencies = [ [[package]] name = "kamu-adapter-http" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "aws-sdk-s3", @@ -5541,7 +5544,7 @@ dependencies = [ [[package]] name = "kamu-adapter-oauth" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -5560,7 +5563,7 @@ dependencies = [ [[package]] name = "kamu-adapter-odata" -version = "0.207.3" +version = "0.208.0" dependencies = [ "axum", "chrono", @@ -5598,7 +5601,7 @@ dependencies = [ [[package]] name = "kamu-auth-rebac" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "internal-error", @@ -5610,7 +5613,7 @@ dependencies = [ [[package]] name = "kamu-auth-rebac-inmem" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "database-common-macros", @@ -5624,7 +5627,7 @@ dependencies = [ [[package]] name = "kamu-auth-rebac-postgres" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "database-common", @@ -5641,7 +5644,7 @@ dependencies = [ [[package]] name = "kamu-auth-rebac-repo-tests" -version = "0.207.3" +version = "0.208.0" dependencies = [ "dill", "kamu-auth-rebac", @@ -5650,7 +5653,7 @@ dependencies = [ [[package]] name = "kamu-auth-rebac-services" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "dill", @@ -5669,7 +5672,7 @@ dependencies = [ [[package]] name = "kamu-auth-rebac-sqlite" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "database-common", @@ -5686,7 +5689,7 @@ dependencies = [ [[package]] name = "kamu-cli" -version = "0.207.3" +version = "0.208.0" dependencies = [ "arrow-flight", "async-graphql", @@ -5810,7 +5813,7 @@ dependencies = [ [[package]] name = "kamu-cli-e2e-common" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -5840,7 +5843,7 @@ dependencies = [ [[package]] name = "kamu-cli-e2e-common-macros" -version = "0.207.3" +version = "0.208.0" dependencies = [ "quote", "syn 2.0.89", @@ -5848,7 +5851,7 
@@ dependencies = [ [[package]] name = "kamu-cli-e2e-inmem" -version = "0.207.3" +version = "0.208.0" dependencies = [ "indoc 2.0.5", "kamu-cli-e2e-common", @@ -5861,7 +5864,7 @@ dependencies = [ [[package]] name = "kamu-cli-e2e-mysql" -version = "0.207.3" +version = "0.208.0" dependencies = [ "indoc 2.0.5", "kamu-cli-e2e-common", @@ -5875,7 +5878,7 @@ dependencies = [ [[package]] name = "kamu-cli-e2e-postgres" -version = "0.207.3" +version = "0.208.0" dependencies = [ "indoc 2.0.5", "kamu-cli-e2e-common", @@ -5889,7 +5892,7 @@ dependencies = [ [[package]] name = "kamu-cli-e2e-repo-tests" -version = "0.207.3" +version = "0.208.0" dependencies = [ "chrono", "http-common", @@ -5914,7 +5917,7 @@ dependencies = [ [[package]] name = "kamu-cli-e2e-sqlite" -version = "0.207.3" +version = "0.208.0" dependencies = [ "indoc 2.0.5", "kamu-cli-e2e-common", @@ -5928,7 +5931,7 @@ dependencies = [ [[package]] name = "kamu-cli-puppet" -version = "0.207.3" +version = "0.208.0" dependencies = [ "assert_cmd", "async-trait", @@ -5947,7 +5950,7 @@ dependencies = [ [[package]] name = "kamu-core" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -5978,7 +5981,7 @@ dependencies = [ [[package]] name = "kamu-data-utils" -version = "0.207.3" +version = "0.208.0" dependencies = [ "arrow", "arrow-digest", @@ -6003,7 +6006,7 @@ dependencies = [ [[package]] name = "kamu-datafusion-cli" -version = "0.207.3" +version = "0.208.0" dependencies = [ "arrow", "async-trait", @@ -6027,7 +6030,7 @@ dependencies = [ [[package]] name = "kamu-datasets" -version = "0.207.3" +version = "0.208.0" dependencies = [ "aes-gcm", "async-trait", @@ -6042,18 +6045,21 @@ dependencies = [ "serde_with", "sqlx", "thiserror 1.0.69", + "tokio-stream", "uuid", ] [[package]] name = "kamu-datasets-inmem" -version = "0.207.3" +version = "0.208.0" dependencies = [ + "async-stream", "async-trait", "chrono", "database-common", "database-common-macros", "dill", + "futures", "internal-error", "kamu-accounts-inmem", "kamu-datasets", @@ -6070,13 +6076,15 @@ dependencies = [ [[package]] name = "kamu-datasets-postgres" -version = "0.207.3" +version = "0.208.0" dependencies = [ + "async-stream", "async-trait", "chrono", "database-common", "database-common-macros", "dill", + "futures", "internal-error", "kamu-accounts-postgres", "kamu-datasets", @@ -6093,11 +6101,12 @@ dependencies = [ [[package]] name = "kamu-datasets-repo-tests" -version = "0.207.3" +version = "0.208.0" dependencies = [ "chrono", "database-common", "dill", + "futures", "kamu-accounts", "kamu-datasets", "opendatafabric", @@ -6107,8 +6116,9 @@ dependencies = [ [[package]] name = "kamu-datasets-services" -version = "0.207.3" +version = "0.208.0" dependencies = [ + "async-stream", "async-trait", "chrono", "database-common", @@ -6138,13 +6148,15 @@ dependencies = [ [[package]] name = "kamu-datasets-sqlite" -version = "0.207.3" +version = "0.208.0" dependencies = [ + "async-stream", "async-trait", "chrono", "database-common", "database-common-macros", "dill", + "futures", "internal-error", "kamu-accounts-sqlite", "kamu-datasets", @@ -6161,7 +6173,7 @@ dependencies = [ [[package]] name = "kamu-flow-system" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -6190,7 +6202,7 @@ dependencies = [ [[package]] name = "kamu-flow-system-inmem" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -6220,7 +6232,7 @@ dependencies = [ [[package]] name = "kamu-flow-system-postgres" -version = 
"0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -6245,7 +6257,7 @@ dependencies = [ [[package]] name = "kamu-flow-system-repo-tests" -version = "0.207.3" +version = "0.208.0" dependencies = [ "chrono", "database-common", @@ -6258,7 +6270,7 @@ dependencies = [ [[package]] name = "kamu-flow-system-services" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -6302,7 +6314,7 @@ dependencies = [ [[package]] name = "kamu-flow-system-sqlite" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -6327,7 +6339,7 @@ dependencies = [ [[package]] name = "kamu-ingest-datafusion" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -6363,7 +6375,7 @@ dependencies = [ [[package]] name = "kamu-messaging-outbox-inmem" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -6382,7 +6394,7 @@ dependencies = [ [[package]] name = "kamu-messaging-outbox-postgres" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -6405,7 +6417,7 @@ dependencies = [ [[package]] name = "kamu-messaging-outbox-repo-tests" -version = "0.207.3" +version = "0.208.0" dependencies = [ "chrono", "database-common", @@ -6419,7 +6431,7 @@ dependencies = [ [[package]] name = "kamu-messaging-outbox-sqlite" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -6442,7 +6454,7 @@ dependencies = [ [[package]] name = "kamu-repo-tools" -version = "0.207.3" +version = "0.208.0" dependencies = [ "chrono", "clap", @@ -6457,13 +6469,14 @@ dependencies = [ [[package]] name = "kamu-task-system" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", "database-common", "enum-variants", "event-sourcing", + "internal-error", "kamu-core", "messaging-outbox", "opendatafabric", @@ -6475,7 +6488,7 @@ dependencies = [ [[package]] name = "kamu-task-system-inmem" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -6494,7 +6507,7 @@ dependencies = [ [[package]] name = "kamu-task-system-postgres" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -6517,7 +6530,7 @@ dependencies = [ [[package]] name = "kamu-task-system-repo-tests" -version = "0.207.3" +version = "0.208.0" dependencies = [ "chrono", "database-common", @@ -6529,7 +6542,7 @@ dependencies = [ [[package]] name = "kamu-task-system-services" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -6540,8 +6553,12 @@ dependencies = [ "futures", "init-on-startup", "internal-error", + "kamu", + "kamu-accounts", "kamu-core", "kamu-datasets", + "kamu-datasets-inmem", + "kamu-datasets-services", "kamu-task-system", "kamu-task-system-inmem", "messaging-outbox", @@ -6549,6 +6566,7 @@ dependencies = [ "observability", "opendatafabric", "serde_json", + "tempfile", "test-log", "time-source", "tokio", @@ -6557,7 +6575,7 @@ dependencies = [ [[package]] name = "kamu-task-system-sqlite" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-stream", "async-trait", @@ -6966,7 +6984,7 @@ dependencies = [ [[package]] name = "messaging-outbox" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -7107,7 +7125,7 @@ dependencies = [ [[package]] name = "multiformats" -version = "0.207.3" +version = "0.208.0" dependencies = [ "base64 0.22.1", "bs58", @@ -7432,7 +7450,7 @@ dependencies = 
[ [[package]] name = "observability" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "axum", @@ -7476,6 +7494,26 @@ dependencies = [ "parking_lot_core", ] +[[package]] +name = "oop" +version = "0.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a3014baf68cf315fbfc3c3d69efd1f79fea3c3ac44d9cd9e10b7cf8fd8e4920" +dependencies = [ + "oop-macro", +] + +[[package]] +name = "oop-macro" +version = "0.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e493043dcf88be852a9716f7b065640867a72cab403f360003534e8cbf11bf84" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.89", +] + [[package]] name = "oorandom" version = "11.1.4" @@ -7490,7 +7528,7 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" [[package]] name = "opendatafabric" -version = "0.207.3" +version = "0.208.0" dependencies = [ "arrow", "base64 0.22.1", @@ -8247,9 +8285,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.91" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "307e3004becf10f5a6e0d59d20f3cd28231b0e0827a96cd3e0ce6d14bc1e4bb3" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -8455,7 +8493,7 @@ dependencies = [ [[package]] name = "random-names" -version = "0.207.3" +version = "0.208.0" dependencies = [ "rand", ] @@ -8584,7 +8622,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls 0.23.17", - "rustls-native-certs 0.8.0", + "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", "serde", @@ -8881,7 +8919,7 @@ dependencies = [ "openssl-probe", "rustls-pemfile 1.0.4", "schannel", - "security-framework", + "security-framework 2.11.1", ] [[package]] @@ -8894,20 +8932,19 @@ dependencies = [ "rustls-pemfile 2.2.0", "rustls-pki-types", "schannel", - "security-framework", + "security-framework 2.11.1", ] [[package]] name = "rustls-native-certs" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" +checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" dependencies = [ "openssl-probe", - "rustls-pemfile 2.2.0", "rustls-pki-types", "schannel", - "security-framework", + "security-framework 3.0.1", ] [[package]] @@ -9099,6 +9136,19 @@ dependencies = [ "security-framework-sys", ] +[[package]] +name = "security-framework" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1415a607e92bec364ea2cf9264646dcce0f91e6d65281bd6f2819cca3bf39c8" +dependencies = [ + "bitflags 2.6.0", + "core-foundation 0.10.0", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + [[package]] name = "security-framework-sys" version = "2.12.1" @@ -10059,7 +10109,7 @@ dependencies = [ [[package]] name = "time-source" -version = "0.207.3" +version = "0.208.0" dependencies = [ "async-trait", "chrono", @@ -10205,7 +10255,7 @@ dependencies = [ "futures-util", "log", "rustls 0.23.17", - "rustls-native-certs 0.8.0", + "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -10467,7 +10517,7 @@ dependencies = [ [[package]] name = "tracing-perfetto" -version = "0.207.3" +version = "0.208.0" dependencies = [ "conv", "serde", @@ -11029,9 +11079,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.26.6" +version = "0.26.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "841c67bff177718f1d4dfefde8d8f0e78f9b6589319ba88312f567fc5841a958" +checksum = "5d642ff16b7e79272ae451b7322067cdc17cadf68c23264be9d94a32319efe7e" dependencies = [ "rustls-pki-types", ] diff --git a/Cargo.toml b/Cargo.toml index d365c9ab8a..50cd69cc45 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -92,95 +92,95 @@ resolver = "2" [workspace.dependencies] # Apps -kamu-cli = { version = "0.207.3", path = "src/app/cli", default-features = false } +kamu-cli = { version = "0.208.0", path = "src/app/cli", default-features = false } # Utils -async-utils = { version = "0.207.3", path = "src/utils/async-utils", default-features = false } -container-runtime = { version = "0.207.3", path = "src/utils/container-runtime", default-features = false } -database-common = { version = "0.207.3", path = "src/utils/database-common", default-features = false } -database-common-macros = { version = "0.207.3", path = "src/utils/database-common-macros", default-features = false } -enum-variants = { version = "0.207.3", path = "src/utils/enum-variants", default-features = false } -event-sourcing = { version = "0.207.3", path = "src/utils/event-sourcing", default-features = false } -event-sourcing-macros = { version = "0.207.3", path = "src/utils/event-sourcing-macros", default-features = false } -http-common = { version = "0.207.3", path = "src/utils/http-common", default-features = false } -init-on-startup = { version = "0.207.3", path = "src/utils/init-on-startup", default-features = false } -internal-error = { version = "0.207.3", path = "src/utils/internal-error", default-features = false } -kamu-cli-puppet = { version = "0.207.3", path = "src/utils/kamu-cli-puppet", default-features = false } -kamu-data-utils = { version = "0.207.3", path = "src/utils/data-utils", default-features = false } -kamu-datafusion-cli = { version = "0.207.3", path = "src/utils/datafusion-cli", default-features = false } -messaging-outbox = { version = "0.207.3", path = "src/utils/messaging-outbox", default-features = false } -multiformats = { version = "0.207.3", path = "src/utils/multiformats", default-features = false } -observability = { version = "0.207.3", path = "src/utils/observability", default-features = false } -random-names = { version = "0.207.3", path = "src/utils/random-names", default-features = false } -time-source = { version = "0.207.3", path = "src/utils/time-source", default-features = false } -tracing-perfetto = { version = "0.207.3", path = "src/utils/tracing-perfetto", default-features = false } +async-utils = { version = "0.208.0", path = "src/utils/async-utils", default-features = false } +container-runtime = { version = "0.208.0", path = "src/utils/container-runtime", default-features = false } +database-common = { version = "0.208.0", path = "src/utils/database-common", default-features = false } +database-common-macros = { version = "0.208.0", path = "src/utils/database-common-macros", default-features = false } +enum-variants = { version = "0.208.0", path = "src/utils/enum-variants", default-features = false } +event-sourcing = { version = "0.208.0", path = "src/utils/event-sourcing", default-features = false } +event-sourcing-macros = { version = "0.208.0", path = "src/utils/event-sourcing-macros", default-features = false } +http-common = { version = "0.208.0", path = "src/utils/http-common", default-features = false } +init-on-startup = { version = "0.208.0", path = "src/utils/init-on-startup", default-features = false } +internal-error = { 
version = "0.208.0", path = "src/utils/internal-error", default-features = false } +kamu-cli-puppet = { version = "0.208.0", path = "src/utils/kamu-cli-puppet", default-features = false } +kamu-data-utils = { version = "0.208.0", path = "src/utils/data-utils", default-features = false } +kamu-datafusion-cli = { version = "0.208.0", path = "src/utils/datafusion-cli", default-features = false } +messaging-outbox = { version = "0.208.0", path = "src/utils/messaging-outbox", default-features = false } +multiformats = { version = "0.208.0", path = "src/utils/multiformats", default-features = false } +observability = { version = "0.208.0", path = "src/utils/observability", default-features = false } +random-names = { version = "0.208.0", path = "src/utils/random-names", default-features = false } +time-source = { version = "0.208.0", path = "src/utils/time-source", default-features = false } +tracing-perfetto = { version = "0.208.0", path = "src/utils/tracing-perfetto", default-features = false } # Domain -kamu-accounts = { version = "0.207.3", path = "src/domain/accounts/domain", default-features = false } -kamu-auth-rebac = { version = "0.207.3", path = "src/domain/auth-rebac/domain", default-features = false } -kamu-core = { version = "0.207.3", path = "src/domain/core", default-features = false } -kamu-datasets = { version = "0.207.3", path = "src/domain/datasets/domain", default-features = false } -kamu-flow-system = { version = "0.207.3", path = "src/domain/flow-system/domain", default-features = false } -kamu-task-system = { version = "0.207.3", path = "src/domain/task-system/domain", default-features = false } -opendatafabric = { version = "0.207.3", path = "src/domain/opendatafabric", default-features = false } +kamu-accounts = { version = "0.208.0", path = "src/domain/accounts/domain", default-features = false } +kamu-auth-rebac = { version = "0.208.0", path = "src/domain/auth-rebac/domain", default-features = false } +kamu-core = { version = "0.208.0", path = "src/domain/core", default-features = false } +kamu-datasets = { version = "0.208.0", path = "src/domain/datasets/domain", default-features = false } +kamu-flow-system = { version = "0.208.0", path = "src/domain/flow-system/domain", default-features = false } +kamu-task-system = { version = "0.208.0", path = "src/domain/task-system/domain", default-features = false } +opendatafabric = { version = "0.208.0", path = "src/domain/opendatafabric", default-features = false } # Domain service layer -kamu-accounts-services = { version = "0.207.3", path = "src/domain/accounts/services", default-features = false } -kamu-auth-rebac-services = { version = "0.207.3", path = "src/domain/auth-rebac/services", default-features = false } -kamu-datasets-services = { version = "0.207.3", path = "src/domain/datasets/services", default-features = false } -kamu-flow-system-services = { version = "0.207.3", path = "src/domain/flow-system/services", default-features = false } -kamu-task-system-services = { version = "0.207.3", path = "src/domain/task-system/services", default-features = false } +kamu-accounts-services = { version = "0.208.0", path = "src/domain/accounts/services", default-features = false } +kamu-auth-rebac-services = { version = "0.208.0", path = "src/domain/auth-rebac/services", default-features = false } +kamu-datasets-services = { version = "0.208.0", path = "src/domain/datasets/services", default-features = false } +kamu-flow-system-services = { version = "0.208.0", path = "src/domain/flow-system/services", default-features = false 
} +kamu-task-system-services = { version = "0.208.0", path = "src/domain/task-system/services", default-features = false } # Infra -kamu = { version = "0.207.3", path = "src/infra/core", default-features = false } -kamu-ingest-datafusion = { version = "0.207.3", path = "src/infra/ingest-datafusion", default-features = false } +kamu = { version = "0.208.0", path = "src/infra/core", default-features = false } +kamu-ingest-datafusion = { version = "0.208.0", path = "src/infra/ingest-datafusion", default-features = false } ## Flow System -kamu-flow-system-repo-tests = { version = "0.207.3", path = "src/infra/flow-system/repo-tests", default-features = false } -kamu-flow-system-inmem = { version = "0.207.3", path = "src/infra/flow-system/inmem", default-features = false } -kamu-flow-system-postgres = { version = "0.207.3", path = "src/infra/flow-system/postgres", default-features = false } -kamu-flow-system-sqlite = { version = "0.207.3", path = "src/infra/flow-system/sqlite", default-features = false } +kamu-flow-system-repo-tests = { version = "0.208.0", path = "src/infra/flow-system/repo-tests", default-features = false } +kamu-flow-system-inmem = { version = "0.208.0", path = "src/infra/flow-system/inmem", default-features = false } +kamu-flow-system-postgres = { version = "0.208.0", path = "src/infra/flow-system/postgres", default-features = false } +kamu-flow-system-sqlite = { version = "0.208.0", path = "src/infra/flow-system/sqlite", default-features = false } ## Accounts -kamu-accounts-inmem = { version = "0.207.3", path = "src/infra/accounts/inmem", default-features = false } -kamu-accounts-mysql = { version = "0.207.3", path = "src/infra/accounts/mysql", default-features = false } -kamu-accounts-postgres = { version = "0.207.3", path = "src/infra/accounts/postgres", default-features = false } -kamu-accounts-sqlite = { version = "0.207.3", path = "src/infra/accounts/sqlite", default-features = false } -kamu-accounts-repo-tests = { version = "0.207.3", path = "src/infra/accounts/repo-tests", default-features = false } +kamu-accounts-inmem = { version = "0.208.0", path = "src/infra/accounts/inmem", default-features = false } +kamu-accounts-mysql = { version = "0.208.0", path = "src/infra/accounts/mysql", default-features = false } +kamu-accounts-postgres = { version = "0.208.0", path = "src/infra/accounts/postgres", default-features = false } +kamu-accounts-sqlite = { version = "0.208.0", path = "src/infra/accounts/sqlite", default-features = false } +kamu-accounts-repo-tests = { version = "0.208.0", path = "src/infra/accounts/repo-tests", default-features = false } ## Datasets -kamu-datasets-inmem = { version = "0.207.3", path = "src/infra/datasets/inmem", default-features = false } -kamu-datasets-postgres = { version = "0.207.3", path = "src/infra/datasets/postgres", default-features = false } -kamu-datasets-sqlite = { version = "0.207.3", path = "src/infra/datasets/sqlite", default-features = false } -kamu-datasets-repo-tests = { version = "0.207.3", path = "src/infra/datasets/repo-tests", default-features = false } +kamu-datasets-inmem = { version = "0.208.0", path = "src/infra/datasets/inmem", default-features = false } +kamu-datasets-postgres = { version = "0.208.0", path = "src/infra/datasets/postgres", default-features = false } +kamu-datasets-sqlite = { version = "0.208.0", path = "src/infra/datasets/sqlite", default-features = false } +kamu-datasets-repo-tests = { version = "0.208.0", path = "src/infra/datasets/repo-tests", default-features = false } ## Task System 
-kamu-task-system-inmem = { version = "0.207.3", path = "src/infra/task-system/inmem", default-features = false } -kamu-task-system-postgres = { version = "0.207.3", path = "src/infra/task-system/postgres", default-features = false } -kamu-task-system-sqlite = { version = "0.207.3", path = "src/infra/task-system/sqlite", default-features = false } -kamu-task-system-repo-tests = { version = "0.207.3", path = "src/infra/task-system/repo-tests", default-features = false } +kamu-task-system-inmem = { version = "0.208.0", path = "src/infra/task-system/inmem", default-features = false } +kamu-task-system-postgres = { version = "0.208.0", path = "src/infra/task-system/postgres", default-features = false } +kamu-task-system-sqlite = { version = "0.208.0", path = "src/infra/task-system/sqlite", default-features = false } +kamu-task-system-repo-tests = { version = "0.208.0", path = "src/infra/task-system/repo-tests", default-features = false } ## ReBAC -kamu-auth-rebac-inmem = { version = "0.207.3", path = "src/infra/auth-rebac/inmem", default-features = false } -kamu-auth-rebac-repo-tests = { version = "0.207.3", path = "src/infra/auth-rebac/repo-tests", default-features = false } -kamu-auth-rebac-postgres = { version = "0.207.3", path = "src/infra/auth-rebac/postgres", default-features = false } -kamu-auth-rebac-sqlite = { version = "0.207.3", path = "src/infra/auth-rebac/sqlite", default-features = false } +kamu-auth-rebac-inmem = { version = "0.208.0", path = "src/infra/auth-rebac/inmem", default-features = false } +kamu-auth-rebac-repo-tests = { version = "0.208.0", path = "src/infra/auth-rebac/repo-tests", default-features = false } +kamu-auth-rebac-postgres = { version = "0.208.0", path = "src/infra/auth-rebac/postgres", default-features = false } +kamu-auth-rebac-sqlite = { version = "0.208.0", path = "src/infra/auth-rebac/sqlite", default-features = false } ## Outbox -kamu-messaging-outbox-inmem = { version = "0.207.3", path = "src/infra/messaging-outbox/inmem", default-features = false } -kamu-messaging-outbox-postgres = { version = "0.207.3", path = "src/infra/messaging-outbox/postgres", default-features = false } -kamu-messaging-outbox-sqlite = { version = "0.207.3", path = "src/infra/messaging-outbox/sqlite", default-features = false } -kamu-messaging-outbox-repo-tests = { version = "0.207.3", path = "src/infra/messaging-outbox/repo-tests", default-features = false } +kamu-messaging-outbox-inmem = { version = "0.208.0", path = "src/infra/messaging-outbox/inmem", default-features = false } +kamu-messaging-outbox-postgres = { version = "0.208.0", path = "src/infra/messaging-outbox/postgres", default-features = false } +kamu-messaging-outbox-sqlite = { version = "0.208.0", path = "src/infra/messaging-outbox/sqlite", default-features = false } +kamu-messaging-outbox-repo-tests = { version = "0.208.0", path = "src/infra/messaging-outbox/repo-tests", default-features = false } # Adapters -kamu-adapter-auth-oso = { version = "0.207.3", path = "src/adapter/auth-oso", default-features = false } -kamu-adapter-flight-sql = { version = "0.207.3", path = "src/adapter/flight-sql", default-features = false } -kamu-adapter-graphql = { version = "0.207.3", path = "src/adapter/graphql", default-features = false } -kamu-adapter-http = { version = "0.207.3", path = "src/adapter/http", default-features = false } -kamu-adapter-odata = { version = "0.207.3", path = "src/adapter/odata", default-features = false } -kamu-adapter-oauth = { version = "0.207.3", path = "src/adapter/oauth", default-features = false 
} +kamu-adapter-auth-oso = { version = "0.208.0", path = "src/adapter/auth-oso", default-features = false } +kamu-adapter-flight-sql = { version = "0.208.0", path = "src/adapter/flight-sql", default-features = false } +kamu-adapter-graphql = { version = "0.208.0", path = "src/adapter/graphql", default-features = false } +kamu-adapter-http = { version = "0.208.0", path = "src/adapter/http", default-features = false } +kamu-adapter-odata = { version = "0.208.0", path = "src/adapter/odata", default-features = false } +kamu-adapter-oauth = { version = "0.208.0", path = "src/adapter/oauth", default-features = false } # E2E -kamu-cli-e2e-common = { version = "0.207.3", path = "src/e2e/app/cli/common", default-features = false } -kamu-cli-e2e-common-macros = { version = "0.207.3", path = "src/e2e/app/cli/common-macros", default-features = false } -kamu-cli-e2e-repo-tests = { version = "0.207.3", path = "src/e2e/app/cli/repo-tests", default-features = false } +kamu-cli-e2e-common = { version = "0.208.0", path = "src/e2e/app/cli/common", default-features = false } +kamu-cli-e2e-common-macros = { version = "0.208.0", path = "src/e2e/app/cli/common-macros", default-features = false } +kamu-cli-e2e-repo-tests = { version = "0.208.0", path = "src/e2e/app/cli/repo-tests", default-features = false } [workspace.package] -version = "0.207.3" +version = "0.208.0" edition = "2021" homepage = "https://github.com/kamu-data/kamu-cli" repository = "https://github.com/kamu-data/kamu-cli" diff --git a/LICENSE.txt b/LICENSE.txt index d8d2f00f2a..86b33d41c8 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -11,7 +11,7 @@ Business Source License 1.1 Licensor: Kamu Data, Inc. -Licensed Work: Kamu CLI Version 0.207.3 +Licensed Work: Kamu CLI Version 0.208.0 The Licensed Work is © 2023 Kamu Data, Inc. Additional Use Grant: You may use the Licensed Work for any purpose, @@ -24,7 +24,7 @@ Additional Use Grant: You may use the Licensed Work for any purpose, Licensed Work where data or transformations are controlled by such third parties. 
-Change Date: 2028-11-11 +Change Date: 2028-11-21 Change License: Apache License, Version 2.0 diff --git a/resources/openapi-mt.json b/resources/openapi-mt.json index c4f5c92247..b8779888cc 100644 --- a/resources/openapi-mt.json +++ b/resources/openapi-mt.json @@ -862,7 +862,7 @@ "name": "" }, "title": "kamu-cli", - "version": "0.207.3" + "version": "0.208.0" }, "openapi": "3.1.0", "paths": { diff --git a/resources/openapi.json b/resources/openapi.json index 7c3cc148e7..bb4be73b16 100644 --- a/resources/openapi.json +++ b/resources/openapi.json @@ -862,7 +862,7 @@ "name": "" }, "title": "kamu-cli", - "version": "0.207.3" + "version": "0.208.0" }, "openapi": "3.1.0", "paths": { diff --git a/src/adapter/auth-oso/src/oso_dataset_authorizer.rs b/src/adapter/auth-oso/src/oso_dataset_authorizer.rs index 5614a37d89..dc09d71c86 100644 --- a/src/adapter/auth-oso/src/oso_dataset_authorizer.rs +++ b/src/adapter/auth-oso/src/oso_dataset_authorizer.rs @@ -12,8 +12,8 @@ use std::str::FromStr; use std::sync::Arc; use dill::*; -use internal_error::ErrorIntoInternal; -use kamu_accounts::{CurrentAccountSubject, DEFAULT_ACCOUNT_NAME_STR}; +use internal_error::{ErrorIntoInternal, InternalError, ResultIntoInternal}; +use kamu_accounts::CurrentAccountSubject; use kamu_core::auth::*; use kamu_core::AccessError; use opendatafabric::DatasetHandle; @@ -57,10 +57,14 @@ impl OsoDatasetAuthorizer { fn dataset_resource(&self, dataset_handle: &DatasetHandle) -> DatasetResource { let dataset_alias = &dataset_handle.alias; - let creator = dataset_alias - .account_name - .as_ref() - .map_or(DEFAULT_ACCOUNT_NAME_STR, |a| a.as_str()); + let creator = dataset_alias.account_name.as_ref().map_or_else( + || { + self.current_account_subject + .account_name_or_default() + .as_str() + }, + |a| a.as_str(), + ); // TODO: for now let's treat all datasets as public // TODO: explicit read/write permissions @@ -122,6 +126,71 @@ impl DatasetActionAuthorizer for OsoDatasetAuthorizer { allowed_actions } + + #[tracing::instrument(level = "debug", skip_all, fields(dataset_handles=?dataset_handles, action=%action))] + async fn filter_datasets_allowing( + &self, + dataset_handles: Vec, + action: DatasetAction, + ) -> Result, InternalError> { + let mut matched_dataset_handles = Vec::new(); + for hdl in dataset_handles { + let is_allowed = self + .oso + .is_allowed( + self.actor(), + action.to_string(), + self.dataset_resource(&hdl), + ) + .int_err()?; + if is_allowed { + matched_dataset_handles.push(hdl); + } + } + + Ok(matched_dataset_handles) + } + + #[tracing::instrument(level = "debug", skip_all, fields(dataset_handles=?dataset_handles, action=%action))] + async fn classify_datasets_by_allowance( + &self, + dataset_handles: Vec, + action: DatasetAction, + ) -> Result { + let mut matched_dataset_handles = Vec::with_capacity(dataset_handles.len()); + let mut unmatched_results = Vec::new(); + + for hdl in dataset_handles { + let is_allowed = self + .oso + .is_allowed( + self.actor(), + action.to_string(), + self.dataset_resource(&hdl), + ) + .int_err()?; + if is_allowed { + matched_dataset_handles.push(hdl); + } else { + let dataset_ref = hdl.as_local_ref(); + unmatched_results.push(( + hdl, + DatasetActionUnauthorizedError::Access(AccessError::Forbidden( + DatasetActionNotEnoughPermissionsError { + action, + dataset_ref, + } + .into(), + )), + )); + } + } + + Ok(ClassifyByAllowanceResponse { + authorized_handles: matched_dataset_handles, + unauthorized_handles_with_errors: unmatched_results, + }) + } } 
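Note: a minimal caller-side sketch of the batched authorization checks added above. The trait method signatures are inferred from the `OsoDatasetAuthorizer` impl in this hunk, and `authorizer` / `handles` are hypothetical placeholders, so treat this as an illustration rather than the exact trait definition.

```rust
// Caller-side sketch only; signatures are inferred from the impl above,
// and `authorizer` / `handles` are hypothetical placeholders.
use internal_error::InternalError;
use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer};
use opendatafabric::DatasetHandle;

async fn split_by_write_access(
    authorizer: &dyn DatasetActionAuthorizer,
    handles: Vec<DatasetHandle>,
) -> Result<(Vec<DatasetHandle>, usize), InternalError> {
    // One batched call instead of a per-dataset authorization loop
    let classification = authorizer
        .classify_datasets_by_allowance(handles, DatasetAction::Write)
        .await?;

    // Authorized handles proceed; unauthorized ones carry a ready-made
    // DatasetActionUnauthorizedError that callers can surface or log.
    let skipped = classification.unauthorized_handles_with_errors.len();
    Ok((classification.authorized_handles, skipped))
}
```

For read-only listings (as in the search query changes later in this patch), `filter_datasets_allowing(handles, DatasetAction::Read)` returns just the permitted handles without the per-dataset errors.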
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/auth-oso/tests/tests/test_oso_dataset_authorizer.rs b/src/adapter/auth-oso/tests/tests/test_oso_dataset_authorizer.rs index f063c840d2..82d156f243 100644 --- a/src/adapter/auth-oso/tests/tests/test_oso_dataset_authorizer.rs +++ b/src/adapter/auth-oso/tests/tests/test_oso_dataset_authorizer.rs @@ -17,7 +17,7 @@ use kamu::{CreateDatasetUseCaseImpl, DatasetRepositoryLocalFs, DatasetRepository use kamu_accounts::CurrentAccountSubject; use kamu_adapter_auth_oso::{KamuAuthOso, OsoDatasetAuthorizer}; use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer, DatasetActionUnauthorizedError}; -use kamu_core::{AccessError, CreateDatasetUseCase, DatasetRepository}; +use kamu_core::{AccessError, CreateDatasetUseCase, DatasetRepository, TenancyConfig}; use messaging_outbox::DummyOutboxImpl; use opendatafabric::{AccountID, AccountName, DatasetAlias, DatasetHandle, DatasetKind}; use tempfile::TempDir; @@ -116,11 +116,8 @@ impl DatasetAuthorizerHarness { )) .add::() .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(true), - ) + .add_value(TenancyConfig::MultiTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() .add::() diff --git a/src/adapter/graphql/src/mutations/dataset_metadata_mut.rs b/src/adapter/graphql/src/mutations/dataset_metadata_mut.rs index b671e9d797..df40c836ce 100644 --- a/src/adapter/graphql/src/mutations/dataset_metadata_mut.rs +++ b/src/adapter/graphql/src/mutations/dataset_metadata_mut.rs @@ -33,10 +33,10 @@ impl DatasetMetadataMut { } #[graphql(skip)] - fn get_dataset(&self, ctx: &Context<'_>) -> std::sync::Arc { + fn get_dataset(&self, ctx: &Context<'_>) -> domain::ResolvedDataset { // TODO: cut off this dependency - extract a higher level use case - let dataset_repo = from_catalog::(ctx).unwrap(); - dataset_repo.get_dataset_by_handle(&self.dataset_handle) + let dataset_registry = from_catalog::(ctx).unwrap(); + dataset_registry.get_dataset_by_handle(&self.dataset_handle) } /// Access to the mutable metadata chain of the dataset @@ -51,9 +51,9 @@ impl DatasetMetadataMut { ctx: &Context<'_>, content: Option, ) -> Result { - let dataset = self.get_dataset(ctx); + let resolved_dataset = self.get_dataset(ctx); - let old_attachments = dataset + let old_attachments = resolved_dataset .as_metadata_chain() .accept_one(SearchSetAttachmentsVisitor::new()) .await diff --git a/src/adapter/graphql/src/mutations/dataset_mut.rs b/src/adapter/graphql/src/mutations/dataset_mut.rs index 63dd173b96..387d8a06df 100644 --- a/src/adapter/graphql/src/mutations/dataset_mut.rs +++ b/src/adapter/graphql/src/mutations/dataset_mut.rs @@ -9,7 +9,7 @@ use chrono::{DateTime, Utc}; use domain::{DeleteDatasetError, RenameDatasetError}; -use kamu_core::{self as domain}; +use kamu_core::{self as domain, SetWatermarkUseCase}; use opendatafabric as odf; use super::{DatasetEnvVarsMut, DatasetFlowsMut, DatasetMetadataMut}; @@ -124,17 +124,17 @@ impl DatasetMut { ctx: &Context<'_>, watermark: DateTime, ) -> Result { - let pull_svc = from_catalog::(ctx).unwrap(); - match pull_svc - .set_watermark(&self.dataset_handle.as_local_ref(), watermark) + let set_watermark_use_case = from_catalog::(ctx).unwrap(); + match set_watermark_use_case + .execute(&self.dataset_handle, watermark) .await { - Ok(domain::PullResult::UpToDate(_)) => { + 
Ok(domain::SetWatermarkResult::UpToDate) => { Ok(SetWatermarkResult::UpToDate(SetWatermarkUpToDate { _dummy: String::new(), })) } - Ok(domain::PullResult::Updated { new_head, .. }) => { + Ok(domain::SetWatermarkResult::Updated { new_head, .. }) => { Ok(SetWatermarkResult::Updated(SetWatermarkUpdated { new_head: new_head.into(), })) diff --git a/src/adapter/graphql/src/mutations/datasets_mut.rs b/src/adapter/graphql/src/mutations/datasets_mut.rs index 9357a72d08..80c0470a26 100644 --- a/src/adapter/graphql/src/mutations/datasets_mut.rs +++ b/src/adapter/graphql/src/mutations/datasets_mut.rs @@ -7,7 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use kamu_core::{self as domain, CreateDatasetUseCaseOptions, DatasetRepositoryExt}; +use kamu_core::{self as domain, CreateDatasetUseCaseOptions, DatasetRegistryExt}; use opendatafabric as odf; use crate::mutations::DatasetMut; @@ -25,9 +25,9 @@ pub struct DatasetsMut; impl DatasetsMut { /// Returns a mutable dataset by its ID async fn by_id(&self, ctx: &Context<'_>, dataset_id: DatasetID) -> Result> { - let dataset_repo = from_catalog::(ctx).unwrap(); - let hdl = dataset_repo - .try_resolve_dataset_ref(&dataset_id.as_local_ref()) + let dataset_registry = from_catalog::(ctx).unwrap(); + let hdl = dataset_registry + .try_resolve_dataset_handle_by_ref(&dataset_id.as_local_ref()) .await?; Ok(hdl.map(DatasetMut::new)) } diff --git a/src/adapter/graphql/src/mutations/flows_mut/flows_mut_utils.rs b/src/adapter/graphql/src/mutations/flows_mut/flows_mut_utils.rs index b2fad3c54b..140b998f6e 100644 --- a/src/adapter/graphql/src/mutations/flows_mut/flows_mut_utils.rs +++ b/src/adapter/graphql/src/mutations/flows_mut/flows_mut_utils.rs @@ -7,7 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use kamu_core::{GetSummaryOpts, MetadataChainExt}; +use kamu_core::{DatasetRegistry, GetSummaryOpts, MetadataChainExt}; use {kamu_flow_system as fs, opendatafabric as odf}; use super::FlowNotFound; @@ -76,9 +76,9 @@ pub(crate) async fn ensure_expected_dataset_kind( let dataset_flow_type: kamu_flow_system::DatasetFlowType = dataset_flow_type.into(); match dataset_flow_type.dataset_kind_restriction() { Some(expected_kind) => { - let dataset = utils::get_dataset(ctx, dataset_handle); + let resolved_dataset = utils::get_dataset(ctx, dataset_handle); - let dataset_kind = dataset + let dataset_kind = resolved_dataset .get_summary(GetSummaryOpts::default()) .await .int_err()? 
@@ -106,12 +106,16 @@ pub(crate) async fn ensure_flow_preconditions( dataset_flow_type: DatasetFlowType, flow_run_configuration: Option<&FlowRunConfiguration>, ) -> Result> { + let dataset_registry = from_catalog::(ctx).unwrap(); + let target = dataset_registry.get_dataset_by_handle(dataset_handle); + match dataset_flow_type { DatasetFlowType::Ingest => { let polling_ingest_svc = from_catalog::(ctx).unwrap(); + let source_res = polling_ingest_svc - .get_active_polling_source(&dataset_handle.as_local_ref()) + .get_active_polling_source(target) .await .int_err()?; if source_res.is_none() { @@ -121,13 +125,12 @@ pub(crate) async fn ensure_flow_preconditions( } } DatasetFlowType::ExecuteTransform => { - let transform_svc = from_catalog::(ctx).unwrap(); - - let source_res = transform_svc - .get_active_transform(&dataset_handle.as_local_ref()) - .await - .int_err()?; + let transform_request_planner = + from_catalog::(ctx).unwrap(); + let source_res = transform_request_planner + .get_active_transform(target) + .await?; if source_res.is_none() { return Ok(Some(FlowPreconditionsNotMet { preconditions: "No SetTransform event defined".to_string(), @@ -140,12 +143,8 @@ pub(crate) async fn ensure_flow_preconditions( && let FlowRunConfiguration::Reset(reset_configuration) = flow_configuration { if let Some(new_head_hash) = &reset_configuration.new_head_hash() { - let dataset_repo = - from_catalog::(ctx).unwrap(); - - let dataset = dataset_repo.get_dataset_by_handle(dataset_handle); - let current_head_hash_maybe = dataset - .as_metadata_chain() + let metadata_chain = target.as_metadata_chain(); + let current_head_hash_maybe = metadata_chain .try_get_ref(&kamu_core::BlockRef::Head) .await .int_err()?; @@ -154,8 +153,7 @@ pub(crate) async fn ensure_flow_preconditions( preconditions: "Dataset does not contain any blocks".to_string(), })); } - if !dataset - .as_metadata_chain() + if !metadata_chain .contains_block(new_head_hash) .await .int_err()? 
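Note: the GraphQL changes below repeat one resolution pattern: look up a handle through `DatasetRegistry`, then obtain a `ResolvedDataset` for metadata access. A condensed sketch of that pattern follows, assuming the registry methods used in these hunks; `registry` and `dataset_ref` are placeholders and the error plumbing via `int_err` is an approximation.

```rust
// Sketch of the lookup pattern used throughout the GraphQL adapters below;
// method names come from these hunks, error handling is approximated.
use internal_error::{InternalError, ResultIntoInternal};
use kamu_core::{DatasetRegistry, ResolvedDataset};
use opendatafabric::{DatasetRef, Multihash};

async fn head_of(
    registry: &dyn DatasetRegistry,
    dataset_ref: &DatasetRef,
) -> Result<Multihash, InternalError> {
    // 1. Resolve the reference to a handle via the DB-backed registry
    let hdl = registry
        .resolve_dataset_handle_by_ref(dataset_ref)
        .await
        .int_err()?;

    // 2. Wrap it into a ResolvedDataset (dataset identity + data access)
    let resolved: ResolvedDataset = registry.get_dataset_by_handle(&hdl);

    // 3. Read the metadata chain exactly as with the previous Dataset handle
    resolved
        .as_metadata_chain()
        .resolve_ref(&kamu_core::BlockRef::Head)
        .await
        .int_err()
}
```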
diff --git a/src/adapter/graphql/src/queries/accounts/account_flow_runs.rs b/src/adapter/graphql/src/queries/accounts/account_flow_runs.rs index 87d32cfe0c..46b6f20a42 100644 --- a/src/adapter/graphql/src/queries/accounts/account_flow_runs.rs +++ b/src/adapter/graphql/src/queries/accounts/account_flow_runs.rs @@ -13,7 +13,7 @@ use database_common::PaginationOpts; use futures::TryStreamExt; use kamu::utils::datasets_filtering::filter_datasets_by_local_pattern; use kamu_accounts::Account as AccountEntity; -use kamu_core::DatasetRepository; +use kamu_core::DatasetRegistry; use kamu_flow_system as fs; use opendatafabric::DatasetRefPattern; @@ -111,7 +111,7 @@ impl AccountFlowRuns { .try_collect() .await?; - let dataset_repo = from_catalog::(ctx).unwrap(); + let dataset_registry = from_catalog::(ctx).unwrap(); let account = Account::new( self.account.id.clone().into(), @@ -119,7 +119,7 @@ impl AccountFlowRuns { ); let matched_datasets: Vec<_> = - filter_datasets_by_local_pattern(dataset_repo.as_ref(), datasets_with_flows) + filter_datasets_by_local_pattern(dataset_registry.as_ref(), datasets_with_flows) .map_ok(|dataset_handle| Dataset::new(account.clone(), dataset_handle)) .try_collect() .await diff --git a/src/adapter/graphql/src/queries/datasets/dataset.rs b/src/adapter/graphql/src/queries/datasets/dataset.rs index 8ecbace6dc..27129c282a 100644 --- a/src/adapter/graphql/src/queries/datasets/dataset.rs +++ b/src/adapter/graphql/src/queries/datasets/dataset.rs @@ -33,12 +33,12 @@ impl Dataset { #[graphql(skip)] pub async fn from_ref(ctx: &Context<'_>, dataset_ref: &odf::DatasetRef) -> Result { - let dataset_repo = from_catalog::(ctx).unwrap(); + let dataset_registry = from_catalog::(ctx).unwrap(); // TODO: Should we resolve reference at this point or allow unresolved and fail // later? 
- let hdl = dataset_repo - .resolve_dataset_ref(dataset_ref) + let hdl = dataset_registry + .resolve_dataset_handle_by_ref(dataset_ref) .await .int_err()?; let account = Account::from_dataset_alias(ctx, &hdl.alias) @@ -48,9 +48,9 @@ impl Dataset { } #[graphql(skip)] - fn get_dataset(&self, ctx: &Context<'_>) -> std::sync::Arc { - let dataset_repo = from_catalog::(ctx).unwrap(); - dataset_repo.get_dataset_by_handle(&self.dataset_handle) + fn get_dataset(&self, ctx: &Context<'_>) -> domain::ResolvedDataset { + let dataset_registry = from_catalog::(ctx).unwrap(); + dataset_registry.get_dataset_by_handle(&self.dataset_handle) } /// Unique identifier of the dataset @@ -77,8 +77,8 @@ impl Dataset { /// Returns the kind of dataset (Root or Derivative) async fn kind(&self, ctx: &Context<'_>) -> Result { - let dataset = self.get_dataset(ctx); - let summary = dataset + let resolved_dataset = self.get_dataset(ctx); + let summary = resolved_dataset .get_summary(domain::GetSummaryOpts::default()) .await .int_err()?; @@ -111,9 +111,9 @@ impl Dataset { // TODO: PERF: Avoid traversing the entire chain /// Creation time of the first metadata block in the chain async fn created_at(&self, ctx: &Context<'_>) -> Result> { - let dataset = self.get_dataset(ctx); + let resolved_dataset = self.get_dataset(ctx); - Ok(dataset + Ok(resolved_dataset .as_metadata_chain() .accept_one(SearchSeedVisitor::new()) .await @@ -125,9 +125,9 @@ impl Dataset { /// Creation time of the most recent metadata block in the chain async fn last_updated_at(&self, ctx: &Context<'_>) -> Result> { - let dataset = self.get_dataset(ctx); + let resolved_dataset = self.get_dataset(ctx); - Ok(dataset + Ok(resolved_dataset .as_metadata_chain() .get_block_by_ref(&domain::BlockRef::Head) .await? diff --git a/src/adapter/graphql/src/queries/datasets/dataset_data.rs b/src/adapter/graphql/src/queries/datasets/dataset_data.rs index b29f73b76d..0f49175ab0 100644 --- a/src/adapter/graphql/src/queries/datasets/dataset_data.rs +++ b/src/adapter/graphql/src/queries/datasets/dataset_data.rs @@ -30,9 +30,9 @@ impl DatasetData { /// Total number of records in this dataset #[tracing::instrument(level = "info", skip_all)] async fn num_records_total(&self, ctx: &Context<'_>) -> Result { - let dataset_repo = from_catalog::(ctx).unwrap(); - let dataset = dataset_repo.get_dataset_by_handle(&self.dataset_handle); - let summary = dataset + let dataset_registry = from_catalog::(ctx).unwrap(); + let resolved_dataset = dataset_registry.get_dataset_by_handle(&self.dataset_handle); + let summary = resolved_dataset .get_summary(GetSummaryOpts::default()) .await .int_err()?; @@ -43,9 +43,9 @@ impl DatasetData { /// caching #[tracing::instrument(level = "info", skip_all)] async fn estimated_size(&self, ctx: &Context<'_>) -> Result { - let dataset_repo = from_catalog::(ctx).unwrap(); - let dataset = dataset_repo.get_dataset_by_handle(&self.dataset_handle); - let summary = dataset + let dataset_registry = from_catalog::(ctx).unwrap(); + let resolved_dataset = dataset_registry.get_dataset_by_handle(&self.dataset_handle); + let summary = resolved_dataset .get_summary(GetSummaryOpts::default()) .await .int_err()?; diff --git a/src/adapter/graphql/src/queries/datasets/dataset_metadata.rs b/src/adapter/graphql/src/queries/datasets/dataset_metadata.rs index 9993e8f30a..be4cb48657 100644 --- a/src/adapter/graphql/src/queries/datasets/dataset_metadata.rs +++ b/src/adapter/graphql/src/queries/datasets/dataset_metadata.rs @@ -10,6 +10,7 @@ use chrono::prelude::*; use kamu_core::{ self as 
domain, + DatasetRegistry, MetadataChainExt, SearchSetAttachmentsVisitor, SearchSetInfoVisitor, @@ -33,9 +34,9 @@ impl DatasetMetadata { } #[graphql(skip)] - fn get_dataset(&self, ctx: &Context<'_>) -> std::sync::Arc { - let dataset_repo = from_catalog::(ctx).unwrap(); - dataset_repo.get_dataset_by_handle(&self.dataset_handle) + fn get_dataset(&self, ctx: &Context<'_>) -> domain::ResolvedDataset { + let dataset_registry = from_catalog::(ctx).unwrap(); + dataset_registry.get_dataset_by_handle(&self.dataset_handle) } /// Access to the temporal metadata chain of the dataset @@ -45,9 +46,9 @@ impl DatasetMetadata { /// Last recorded watermark async fn current_watermark(&self, ctx: &Context<'_>) -> Result>> { - let dataset = self.get_dataset(ctx); + let resolved_dataset = self.get_dataset(ctx); - Ok(dataset + Ok(resolved_dataset .as_metadata_chain() .last_data_block() .await @@ -94,11 +95,11 @@ impl DatasetMetadata { .collect() .await; - let dataset_repo = from_catalog::(ctx).unwrap(); + let dataset_registry = from_catalog::(ctx).unwrap(); let mut upstream = Vec::with_capacity(upstream_dataset_ids.len()); for upstream_dataset_id in upstream_dataset_ids { - let hdl = dataset_repo - .resolve_dataset_ref(&upstream_dataset_id.as_local_ref()) + let hdl = dataset_registry + .resolve_dataset_handle_by_ref(&upstream_dataset_id.as_local_ref()) .await .int_err()?; let maybe_account = Account::from_dataset_alias(ctx, &hdl.alias).await?; @@ -129,11 +130,11 @@ impl DatasetMetadata { .collect() .await; - let dataset_repo = from_catalog::(ctx).unwrap(); + let dataset_registry = from_catalog::(ctx).unwrap(); let mut downstream = Vec::with_capacity(downstream_dataset_ids.len()); for downstream_dataset_id in downstream_dataset_ids { - let hdl = dataset_repo - .resolve_dataset_ref(&downstream_dataset_id.as_local_ref()) + let hdl = dataset_registry + .resolve_dataset_handle_by_ref(&downstream_dataset_id.as_local_ref()) .await .int_err()?; let maybe_account = Account::from_dataset_alias(ctx, &hdl.alias).await?; @@ -152,10 +153,11 @@ impl DatasetMetadata { /// Current polling source used by the root dataset async fn current_polling_source(&self, ctx: &Context<'_>) -> Result> { + let dataset_registry = from_catalog::(ctx).unwrap(); let polling_ingest_svc = from_catalog::(ctx).unwrap(); let source = polling_ingest_svc - .get_active_polling_source(&self.dataset_handle.as_local_ref()) + .get_active_polling_source(dataset_registry.get_dataset_by_handle(&self.dataset_handle)) .await .int_err()?; @@ -165,9 +167,10 @@ impl DatasetMetadata { /// Current push sources used by the root dataset async fn current_push_sources(&self, ctx: &Context<'_>) -> Result> { let push_ingest_svc = from_catalog::(ctx).unwrap(); + let dataset_registry = from_catalog::(ctx).unwrap(); let mut push_sources: Vec = push_ingest_svc - .get_active_push_sources(&self.dataset_handle.as_local_ref()) + .get_active_push_sources(dataset_registry.get_dataset_by_handle(&self.dataset_handle)) .await .int_err()? 
.into_iter() @@ -181,21 +184,23 @@ impl DatasetMetadata { /// Current transformation used by the derivative dataset async fn current_transform(&self, ctx: &Context<'_>) -> Result> { - let transform_svc = from_catalog::(ctx).unwrap(); + let transform_request_planner = + from_catalog::(ctx).unwrap(); - let source = transform_svc - .get_active_transform(&self.dataset_handle.as_local_ref()) - .await - .int_err()?; + let dataset_registry = from_catalog::(ctx).unwrap(); + + let source = transform_request_planner + .get_active_transform(dataset_registry.get_dataset_by_handle(&self.dataset_handle)) + .await?; Ok(source.map(|(_hash, block)| block.event.into())) } /// Current descriptive information about the dataset async fn current_info(&self, ctx: &Context<'_>) -> Result { - let dataset = self.get_dataset(ctx); + let resolved_dataset = self.get_dataset(ctx); - Ok(dataset + Ok(resolved_dataset .as_metadata_chain() .accept_one(SearchSetInfoVisitor::new()) .await @@ -213,9 +218,9 @@ impl DatasetMetadata { /// Current readme file as discovered from attachments associated with the /// dataset async fn current_readme(&self, ctx: &Context<'_>) -> Result> { - let dataset = self.get_dataset(ctx); + let resolved_dataset = self.get_dataset(ctx); - Ok(dataset + Ok(resolved_dataset .as_metadata_chain() .accept_one(SearchSetAttachmentsVisitor::new()) .await @@ -234,9 +239,9 @@ impl DatasetMetadata { /// Current license associated with the dataset async fn current_license(&self, ctx: &Context<'_>) -> Result> { - let dataset = self.get_dataset(ctx); + let resolved_dataset = self.get_dataset(ctx); - Ok(dataset + Ok(resolved_dataset .as_metadata_chain() .accept_one(SearchSetLicenseVisitor::new()) .await @@ -247,9 +252,9 @@ impl DatasetMetadata { /// Current vocabulary associated with the dataset async fn current_vocab(&self, ctx: &Context<'_>) -> Result> { - let dataset = self.get_dataset(ctx); + let resolved_dataset = self.get_dataset(ctx); - Ok(dataset + Ok(resolved_dataset .as_metadata_chain() .accept_one(SearchSetVocabVisitor::new()) .await diff --git a/src/adapter/graphql/src/queries/datasets/datasets.rs b/src/adapter/graphql/src/queries/datasets/datasets.rs index 3305b0fe3f..727dc5d4cb 100644 --- a/src/adapter/graphql/src/queries/datasets/datasets.rs +++ b/src/adapter/graphql/src/queries/datasets/datasets.rs @@ -8,7 +8,10 @@ // by the Apache License, Version 2.0. 
use futures::TryStreamExt; -use kamu_core::{self as domain, DatasetRepositoryExt}; +use kamu_core::{ + DatasetRegistryExt, + {self as domain}, +}; use opendatafabric as odf; use crate::prelude::*; @@ -24,9 +27,9 @@ impl Datasets { /// Returns dataset by its ID async fn by_id(&self, ctx: &Context<'_>, dataset_id: DatasetID) -> Result> { - let dataset_repo = from_catalog::(ctx).unwrap(); - let hdl = dataset_repo - .try_resolve_dataset_ref(&dataset_id.as_local_ref()) + let dataset_registry = from_catalog::(ctx).unwrap(); + let hdl = dataset_registry + .try_resolve_dataset_handle_by_ref(&dataset_id.as_local_ref()) .await?; Ok(match hdl { Some(h) => { @@ -49,9 +52,9 @@ impl Datasets { ) -> Result> { let dataset_alias = odf::DatasetAlias::new(Some(account_name.into()), dataset_name.into()); - let dataset_repo = from_catalog::(ctx).unwrap(); - let hdl = dataset_repo - .try_resolve_dataset_ref(&dataset_alias.into_local_ref()) + let dataset_registry = from_catalog::(ctx).unwrap(); + let hdl = dataset_registry + .try_resolve_dataset_handle_by_ref(&dataset_alias.into_local_ref()) .await?; Ok(match hdl { @@ -74,15 +77,15 @@ impl Datasets { page: Option, per_page: Option, ) -> Result { - let dataset_repo = from_catalog::(ctx).unwrap(); + let dataset_registry = from_catalog::(ctx).unwrap(); let page = page.unwrap_or(0); let per_page = per_page.unwrap_or(Self::DEFAULT_PER_PAGE); let account_name = account_ref.account_name_internal(); - let mut all_datasets: Vec<_> = dataset_repo - .get_datasets_by_owner(&account_name.clone().into()) + let mut all_datasets: Vec<_> = dataset_registry + .all_dataset_handles_by_owner(&account_name.clone().into()) .try_collect() .await?; let total_count = all_datasets.len(); diff --git a/src/adapter/graphql/src/queries/datasets/metadata_chain.rs b/src/adapter/graphql/src/queries/datasets/metadata_chain.rs index 29d777fb64..61c0636eea 100644 --- a/src/adapter/graphql/src/queries/datasets/metadata_chain.rs +++ b/src/adapter/graphql/src/queries/datasets/metadata_chain.rs @@ -42,18 +42,18 @@ impl MetadataChain { } #[graphql(skip)] - fn get_dataset(&self, ctx: &Context<'_>) -> std::sync::Arc { - let dataset_repo = from_catalog::(ctx).unwrap(); - dataset_repo.get_dataset_by_handle(&self.dataset_handle) + fn get_dataset(&self, ctx: &Context<'_>) -> domain::ResolvedDataset { + let dataset_registry = from_catalog::(ctx).unwrap(); + dataset_registry.get_dataset_by_handle(&self.dataset_handle) } /// Returns all named metadata block references #[tracing::instrument(level = "info", skip_all)] async fn refs(&self, ctx: &Context<'_>) -> Result> { - let dataset = self.get_dataset(ctx); + let resolved_dataset = self.get_dataset(ctx); Ok(vec![BlockRef { name: "head".to_owned(), - block_hash: dataset + block_hash: resolved_dataset .as_metadata_chain() .resolve_ref(&domain::BlockRef::Head) .await @@ -69,8 +69,11 @@ impl MetadataChain { ctx: &Context<'_>, hash: Multihash, ) -> Result> { - let dataset = self.get_dataset(ctx); - let block = dataset.as_metadata_chain().try_get_block(&hash).await?; + let resolved_dataset = self.get_dataset(ctx); + let block = resolved_dataset + .as_metadata_chain() + .try_get_block(&hash) + .await?; let account = Account::from_dataset_alias(ctx, &self.dataset_handle.alias) .await? .expect("Account must exist"); @@ -88,8 +91,12 @@ impl MetadataChain { ) -> Result> { use odf::serde::MetadataBlockSerializer; - let dataset = self.get_dataset(ctx); - match dataset.as_metadata_chain().try_get_block(&hash).await? 
{ + let resolved_dataset = self.get_dataset(ctx); + match resolved_dataset + .as_metadata_chain() + .try_get_block(&hash) + .await? + { None => Ok(None), Some(block) => match format { MetadataManifestFormat::Yaml => { @@ -115,8 +122,8 @@ impl MetadataChain { let page = page.unwrap_or(0); let per_page = per_page.unwrap_or(Self::DEFAULT_BLOCKS_PER_PAGE); - let dataset = self.get_dataset(ctx); - let chain = dataset.as_metadata_chain(); + let resolved_dataset = self.get_dataset(ctx); + let chain = resolved_dataset.as_metadata_chain(); let head = chain.resolve_ref(&domain::BlockRef::Head).await.int_err()?; let total_count = diff --git a/src/adapter/graphql/src/queries/flows/flow.rs b/src/adapter/graphql/src/queries/flows/flow.rs index 3af8f80247..ec657784a0 100644 --- a/src/adapter/graphql/src/queries/flows/flow.rs +++ b/src/adapter/graphql/src/queries/flows/flow.rs @@ -8,7 +8,7 @@ // by the Apache License, Version 2.0. use chrono::{DateTime, Utc}; -use kamu_core::{DatasetChangesService, PollingIngestService}; +use kamu_core::{DatasetChangesService, DatasetRegistry, DatasetRegistryExt, PollingIngestService}; use kamu_flow_system::FlowResultDatasetUpdate; use {kamu_flow_system as fs, opendatafabric as odf}; @@ -58,10 +58,15 @@ impl Flow { ) -> Result { Ok(match dataset_key.flow_type { fs::DatasetFlowType::Ingest => { - let polling_ingest_svc = from_catalog::(ctx).unwrap(); + let dataset_registry = from_catalog::(ctx).unwrap(); + let resolved_dataset = dataset_registry + .get_dataset_by_ref(&dataset_key.dataset_id.as_local_ref()) + .await + .int_err()?; + let polling_ingest_svc = from_catalog::(ctx).unwrap(); let maybe_polling_source = polling_ingest_svc - .get_active_polling_source(&dataset_key.dataset_id.as_local_ref()) + .get_active_polling_source(resolved_dataset) .await .int_err()?; diff --git a/src/adapter/graphql/src/queries/flows/flow_outcome.rs b/src/adapter/graphql/src/queries/flows/flow_outcome.rs index a6dac34d96..04c1853696 100644 --- a/src/adapter/graphql/src/queries/flows/flow_outcome.rs +++ b/src/adapter/graphql/src/queries/flows/flow_outcome.rs @@ -7,7 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use kamu_core::DatasetRepository; +use kamu_core::DatasetRegistry; use kamu_flow_system::FlowError; use crate::prelude::*; @@ -71,10 +71,9 @@ impl FlowOutcome { }), }), FlowError::InputDatasetCompacted(err) => { - let dataset_repository = - from_catalog::(ctx).unwrap(); - let hdl = dataset_repository - .resolve_dataset_ref(&err.dataset_id.as_local_ref()) + let dataset_registry = from_catalog::(ctx).unwrap(); + let hdl = dataset_registry + .resolve_dataset_handle_by_ref(&err.dataset_id.as_local_ref()) .await .int_err()?; diff --git a/src/adapter/graphql/src/queries/flows/flow_trigger.rs b/src/adapter/graphql/src/queries/flows/flow_trigger.rs index becd31237c..d9c1d75ad4 100644 --- a/src/adapter/graphql/src/queries/flows/flow_trigger.rs +++ b/src/adapter/graphql/src/queries/flows/flow_trigger.rs @@ -7,7 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
-use kamu_core::DatasetRepository; +use kamu_core::DatasetRegistry; use kamu_flow_system as fs; use crate::prelude::*; @@ -33,9 +33,9 @@ impl FlowTrigger { fs::FlowTrigger::AutoPolling(auto_polling) => Self::AutoPolling(auto_polling.into()), fs::FlowTrigger::Push(push) => Self::Push(push.into()), fs::FlowTrigger::InputDatasetFlow(input) => { - let dataset_repository = from_catalog::(ctx).unwrap(); - let hdl = dataset_repository - .resolve_dataset_ref(&input.dataset_id.as_local_ref()) + let dataset_registry = from_catalog::(ctx).unwrap(); + let hdl = dataset_registry + .resolve_dataset_handle_by_ref(&input.dataset_id.as_local_ref()) .await .int_err()?; let account = Account::from_dataset_alias(ctx, &hdl.alias) diff --git a/src/adapter/graphql/src/queries/search.rs b/src/adapter/graphql/src/queries/search.rs index 88931a4276..363752b3ff 100644 --- a/src/adapter/graphql/src/queries/search.rs +++ b/src/adapter/graphql/src/queries/search.rs @@ -8,6 +8,7 @@ // by the Apache License, Version 2.0. use futures::TryStreamExt; +use kamu_core::auth::DatasetAction; use kamu_core::{self as domain, TryStreamExtExt}; use crate::prelude::*; @@ -31,22 +32,33 @@ impl Search { page: Option, per_page: Option, ) -> Result { - let dataset_repo = from_catalog::(ctx).unwrap(); + let dataset_registry = from_catalog::(ctx).unwrap(); + let dataset_action_authorizer = + from_catalog::(ctx).unwrap(); let page = page.unwrap_or(0); let per_page = per_page.unwrap_or(Self::DEFAULT_RESULTS_PER_PAGE); - let mut datasets: Vec<_> = dataset_repo - .get_all_datasets() + let filtered_dataset_handles: Vec<_> = dataset_registry + .all_dataset_handles() .filter_ok(|hdl| hdl.alias.dataset_name.contains(&query)) .try_collect() - .await?; + .await + .int_err()?; - datasets.sort_by(|a, b| a.alias.cmp(&b.alias)); - let total_count = datasets.len(); + let readable_dataset_handles = dataset_action_authorizer + .filter_datasets_allowing(filtered_dataset_handles, DatasetAction::Read) + .await + .int_err()?; + + let total_count = readable_dataset_handles.len(); let mut nodes: Vec = Vec::new(); - for hdl in datasets.into_iter().skip(page * per_page).take(per_page) { + for hdl in readable_dataset_handles + .into_iter() + .skip(page * per_page) + .take(per_page) + { let maybe_account = Account::from_dataset_alias(ctx, &hdl.alias).await?; if let Some(account) = maybe_account { nodes.push(SearchResult::Dataset(Dataset::new(account, hdl))); diff --git a/src/adapter/graphql/src/scalars/flow_configuration.rs b/src/adapter/graphql/src/scalars/flow_configuration.rs index 0242a5f977..d7c284a4ef 100644 --- a/src/adapter/graphql/src/scalars/flow_configuration.rs +++ b/src/adapter/graphql/src/scalars/flow_configuration.rs @@ -522,12 +522,12 @@ impl FlowRunConfiguration { } } DatasetFlowType::Reset => { - let dataset_repo = from_catalog::(ctx).unwrap(); - let dataset = dataset_repo.get_dataset_by_handle(dataset_handle); + let dataset_registry = from_catalog::(ctx).unwrap(); + let resolved_dataset = dataset_registry.get_dataset_by_handle(dataset_handle); // Assume unwrap safe such as we have checked this existance during // validation step - let current_head_hash = dataset + let current_head_hash = resolved_dataset .as_metadata_chain() .try_get_ref(&kamu_core::BlockRef::Head) .await diff --git a/src/adapter/graphql/src/utils.rs b/src/adapter/graphql/src/utils.rs index 6b389abfa5..b332fde895 100644 --- a/src/adapter/graphql/src/utils.rs +++ b/src/adapter/graphql/src/utils.rs @@ -13,7 +13,7 @@ use async_graphql::{Context, ErrorExtensions}; use 
internal_error::*; use kamu_accounts::{CurrentAccountSubject, GetAccessTokenError, LoggedAccount}; use kamu_core::auth::DatasetActionUnauthorizedError; -use kamu_core::{Dataset, DatasetRepository}; +use kamu_core::{DatasetRegistry, ResolvedDataset}; use kamu_datasets::DatasetEnvVarsConfig; use kamu_task_system as ts; use opendatafabric::{AccountName as OdfAccountName, DatasetHandle}; @@ -33,9 +33,9 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -pub(crate) fn get_dataset(ctx: &Context<'_>, dataset_handle: &DatasetHandle) -> Arc { - let dataset_repo = from_catalog::(ctx).unwrap(); - dataset_repo.get_dataset_by_handle(dataset_handle) +pub(crate) fn get_dataset(ctx: &Context<'_>, dataset_handle: &DatasetHandle) -> ResolvedDataset { + let dataset_registry = from_catalog::(ctx).unwrap(); + dataset_registry.get_dataset_by_handle(dataset_handle) } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/graphql/tests/tests/test_error_handling.rs b/src/adapter/graphql/tests/tests/test_error_handling.rs index 8dc6019d38..6ef57e2323 100644 --- a/src/adapter/graphql/tests/tests/test_error_handling.rs +++ b/src/adapter/graphql/tests/tests/test_error_handling.rs @@ -9,9 +9,9 @@ use dill::Component; use indoc::indoc; -use kamu::DatasetRepositoryLocalFs; +use kamu::{DatasetRegistryRepoBridge, DatasetRepositoryLocalFs}; use kamu_accounts::CurrentAccountSubject; -use kamu_core::DatasetRepository; +use kamu_core::{DatasetRepository, TenancyConfig}; use time_source::SystemTimeSourceDefault; #[test_log::test(tokio::test)] @@ -61,17 +61,12 @@ async fn test_internal_error() { let cat = dill::CatalogBuilder::new() .add::() .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(tempdir.path().join("datasets")) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(tempdir.path().join("datasets"))) .bind::() + .add::() .build(); - // Note: Not creating a repo to cause an error - let _ = cat.get_one::().unwrap(); - let schema = kamu_adapter_graphql::schema_quiet(); let res = schema.execute(async_graphql::Request::new(indoc!( r#" diff --git a/src/adapter/graphql/tests/tests/test_gql_account_flow_configs.rs b/src/adapter/graphql/tests/tests/test_gql_account_flow_configs.rs index 5325b50b26..0a85f1683b 100644 --- a/src/adapter/graphql/tests/tests/test_gql_account_flow_configs.rs +++ b/src/adapter/graphql/tests/tests/test_gql_account_flow_configs.rs @@ -20,12 +20,13 @@ use kamu::testing::{ MockDatasetChangesService, MockDependencyGraphRepository, MockPollingIngestService, - MockTransformService, + MockTransformRequestPlanner, }; use kamu::{ CreateDatasetFromSnapshotUseCaseImpl, DatasetOwnershipServiceInMemory, DatasetOwnershipServiceInMemoryStateInitializer, + DatasetRegistryRepoBridge, DatasetRepositoryLocalFs, DatasetRepositoryWriter, DependencyGraphServiceInMemory, @@ -54,7 +55,7 @@ async fn test_list_account_flows() { let mock_dataset_action_authorizer = MockDatasetActionAuthorizer::allowing(); let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), 
mock_dataset_action_authorizer: Some(mock_dataset_action_authorizer), ..Default::default() @@ -136,7 +137,7 @@ async fn test_list_account_flows() { async fn test_list_datasets_with_flow() { let mock_dataset_action_authorizer = MockDatasetActionAuthorizer::allowing(); let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), mock_dataset_action_authorizer: Some(mock_dataset_action_authorizer), ..Default::default() @@ -248,7 +249,7 @@ async fn test_pause_resume_account_flows() { let mock_dataset_action_authorizer = MockDatasetActionAuthorizer::allowing(); let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), mock_dataset_action_authorizer: Some(mock_dataset_action_authorizer), ..Default::default() @@ -454,7 +455,7 @@ async fn test_account_configs_all_paused() { let mock_dataset_action_authorizer = MockDatasetActionAuthorizer::allowing(); let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), mock_dataset_action_authorizer: Some(mock_dataset_action_authorizer), ..Default::default() @@ -631,7 +632,7 @@ struct FlowConfigHarness { struct FlowRunsHarnessOverrides { dependency_graph_mock: Option, dataset_changes_mock: Option, - transform_service_mock: Option, + transform_planner_mock: Option, polling_service_mock: Option, mock_dataset_action_authorizer: Option, } @@ -644,7 +645,7 @@ impl FlowConfigHarness { let dataset_changes_mock = overrides.dataset_changes_mock.unwrap_or_default(); let dependency_graph_mock = overrides.dependency_graph_mock.unwrap_or_default(); - let transform_service_mock = overrides.transform_service_mock.unwrap_or_default(); + let transform_planner_mock = overrides.transform_planner_mock.unwrap_or_default(); let polling_service_mock = overrides.polling_service_mock.unwrap_or_default(); let mock_dataset_action_authorizer = overrides.mock_dataset_action_authorizer.unwrap_or_default(); @@ -657,13 +658,11 @@ impl FlowConfigHarness { .with_consumer_filter(messaging_outbox::ConsumerFilter::AllConsumers), ) .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(true), - ) + .add_value(TenancyConfig::MultiTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .add_value(dataset_changes_mock) .bind::() @@ -685,8 +684,8 @@ impl FlowConfigHarness { )) .add::() .add::() - .add_value(transform_service_mock) - .bind::() + .add_value(transform_planner_mock) + .bind::() .add_value(polling_service_mock) .bind::() .add::() diff --git a/src/adapter/graphql/tests/tests/test_gql_data.rs b/src/adapter/graphql/tests/tests/test_gql_data.rs index d569e70cf7..62acb8a3b6 100644 --- a/src/adapter/graphql/tests/tests/test_gql_data.rs +++ b/src/adapter/graphql/tests/tests/test_gql_data.rs @@ -35,7 +35,7 @@ 
use time_source::SystemTimeSourceDefault; async fn create_catalog_with_local_workspace( tempdir: &Path, - is_multitenant: bool, + tenancy_config: TenancyConfig, ) -> dill::Catalog { let datasets_dir = tempdir.join("datasets"); std::fs::create_dir(&datasets_dir).unwrap(); @@ -59,13 +59,11 @@ async fn create_catalog_with_local_workspace( b.add::() .add_value(current_account_subject) .add_value(predefined_accounts_config) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(is_multitenant), - ) + .add_value(tenancy_config) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .add::() .add::() @@ -162,7 +160,8 @@ async fn create_test_dataset( #[test_log::test(tokio::test)] async fn test_dataset_tail_schema() { let tempdir = tempfile::tempdir().unwrap(); - let catalog = create_catalog_with_local_workspace(tempdir.path(), true).await; + let catalog = + create_catalog_with_local_workspace(tempdir.path(), TenancyConfig::MultiTenant).await; create_test_dataset(&catalog, tempdir.path(), None).await; let schema = kamu_adapter_graphql::schema_quiet(); @@ -222,7 +221,8 @@ async fn test_dataset_tail_schema() { #[test_log::test(tokio::test)] async fn test_dataset_tail_some() { let tempdir = tempfile::tempdir().unwrap(); - let catalog = create_catalog_with_local_workspace(tempdir.path(), true).await; + let catalog = + create_catalog_with_local_workspace(tempdir.path(), TenancyConfig::MultiTenant).await; create_test_dataset(&catalog, tempdir.path(), None).await; let schema = kamu_adapter_graphql::schema_quiet(); @@ -265,7 +265,8 @@ async fn test_dataset_tail_some() { #[test_log::test(tokio::test)] async fn test_dataset_tail_empty() { let tempdir = tempfile::tempdir().unwrap(); - let catalog = create_catalog_with_local_workspace(tempdir.path(), true).await; + let catalog = + create_catalog_with_local_workspace(tempdir.path(), TenancyConfig::MultiTenant).await; create_test_dataset(&catalog, tempdir.path(), None).await; let schema = kamu_adapter_graphql::schema_quiet(); @@ -310,7 +311,8 @@ async fn test_dataset_tail_empty() { #[test_log::test(tokio::test)] async fn test_data_query_some() { let tempdir = tempfile::tempdir().unwrap(); - let catalog = create_catalog_with_local_workspace(tempdir.path(), true).await; + let catalog = + create_catalog_with_local_workspace(tempdir.path(), TenancyConfig::MultiTenant).await; create_test_dataset(&catalog, tempdir.path(), None).await; let schema = kamu_adapter_graphql::schema_quiet(); @@ -377,7 +379,8 @@ async fn test_data_query_some() { #[test_log::test(tokio::test)] async fn test_data_query_error_sql_unparsable() { let tempdir = tempfile::tempdir().unwrap(); - let catalog = create_catalog_with_local_workspace(tempdir.path(), true).await; + let catalog = + create_catalog_with_local_workspace(tempdir.path(), TenancyConfig::MultiTenant).await; let schema = kamu_adapter_graphql::schema_quiet(); let res = schema @@ -423,7 +426,8 @@ async fn test_data_query_error_sql_unparsable() { #[test_log::test(tokio::test)] async fn test_data_query_error_sql_missing_function() { let tempdir = tempfile::tempdir().unwrap(); - let catalog = create_catalog_with_local_workspace(tempdir.path(), true).await; + let catalog = + create_catalog_with_local_workspace(tempdir.path(), TenancyConfig::MultiTenant).await; let schema = kamu_adapter_graphql::schema_quiet(); let res = schema diff --git a/src/adapter/graphql/tests/tests/test_gql_dataset_env_vars.rs 
b/src/adapter/graphql/tests/tests/test_gql_dataset_env_vars.rs index 530c74121d..9f989b55ab 100644 --- a/src/adapter/graphql/tests/tests/test_gql_dataset_env_vars.rs +++ b/src/adapter/graphql/tests/tests/test_gql_dataset_env_vars.rs @@ -14,11 +14,18 @@ use indoc::indoc; use kamu::testing::MetadataFactory; use kamu::{ CreateDatasetFromSnapshotUseCaseImpl, + DatasetRegistryRepoBridge, DatasetRepositoryLocalFs, DatasetRepositoryWriter, DependencyGraphServiceInMemory, }; -use kamu_core::{auth, CreateDatasetFromSnapshotUseCase, CreateDatasetResult, DatasetRepository}; +use kamu_core::{ + auth, + CreateDatasetFromSnapshotUseCase, + CreateDatasetResult, + DatasetRepository, + TenancyConfig, +}; use kamu_datasets::DatasetEnvVarsConfig; use kamu_datasets_inmem::InMemoryDatasetEnvVarRepository; use kamu_datasets_services::DatasetEnvVarServiceImpl; @@ -345,13 +352,11 @@ impl DatasetEnvVarsHarness { b.add::() .add_value(DatasetEnvVarsConfig::sample()) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .add::() .add::() diff --git a/src/adapter/graphql/tests/tests/test_gql_dataset_flow_configs.rs b/src/adapter/graphql/tests/tests/test_gql_dataset_flow_configs.rs index 738d4dfcd5..d34cc8b933 100644 --- a/src/adapter/graphql/tests/tests/test_gql_dataset_flow_configs.rs +++ b/src/adapter/graphql/tests/tests/test_gql_dataset_flow_configs.rs @@ -11,9 +11,10 @@ use async_graphql::value; use database_common::{DatabaseTransactionRunner, NoOpDatabasePlugin}; use dill::Component; use indoc::indoc; -use kamu::testing::{MetadataFactory, MockPollingIngestService, MockTransformService}; +use kamu::testing::{MetadataFactory, MockPollingIngestService, MockTransformRequestPlanner}; use kamu::{ CreateDatasetFromSnapshotUseCaseImpl, + DatasetRegistryRepoBridge, DatasetRepositoryLocalFs, DatasetRepositoryWriter, DependencyGraphServiceInMemory, @@ -24,7 +25,8 @@ use kamu_core::{ CreateDatasetResult, DatasetRepository, PollingIngestService, - TransformService, + TenancyConfig, + TransformRequestPlanner, }; use kamu_flow_system_inmem::InMemoryFlowConfigurationEventStore; use kamu_flow_system_services::FlowConfigurationServiceImpl; @@ -39,7 +41,7 @@ use crate::utils::{authentication_catalogs, expect_anonymous_access_error}; #[test_log::test(tokio::test)] async fn test_crud_time_delta_root_dataset() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -210,7 +212,7 @@ async fn test_crud_time_delta_root_dataset() { #[test_log::test(tokio::test)] async fn test_time_delta_validation() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -286,7 +288,7 @@ async fn test_time_delta_validation() { #[test_log::test(tokio::test)] async fn test_crud_cron_root_dataset() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: 
Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -495,7 +497,7 @@ async fn test_crud_cron_root_dataset() { #[test_log::test(tokio::test)] async fn test_crud_transform_derived_dataset() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -614,7 +616,7 @@ async fn test_crud_transform_derived_dataset() { #[test_log::test(tokio::test)] async fn test_crud_compaction_root_dataset() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -732,7 +734,7 @@ async fn test_crud_compaction_root_dataset() { #[test_log::test(tokio::test)] async fn test_transform_config_validation() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -801,7 +803,7 @@ async fn test_transform_config_validation() { #[test_log::test(tokio::test)] async fn test_compaction_config_validation() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -926,7 +928,7 @@ async fn test_pause_resume_dataset_flows() { // Setup initial flow configs for datasets let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -1151,7 +1153,7 @@ async fn test_pause_resume_dataset_flows() { #[test_log::test(tokio::test)] async fn test_conditions_not_met_for_flows() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::without_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::without_set_transform()), polling_service_mock: Some(MockPollingIngestService::without_active_polling_source()), }) .await; @@ -1240,7 +1242,7 @@ async fn test_conditions_not_met_for_flows() { #[test_log::test(tokio::test)] async fn test_incorrect_dataset_kinds_for_flow_type() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -1398,7 +1400,7 @@ async 
fn test_incorrect_dataset_kinds_for_flow_type() { #[test_log::test(tokio::test)] async fn test_set_metadataonly_compaction_config_form_derivative() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -1454,7 +1456,7 @@ async fn test_set_metadataonly_compaction_config_form_derivative() { #[test_log::test(tokio::test)] async fn test_set_config_for_hard_compaction_fails() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::without_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::without_set_transform()), polling_service_mock: Some(MockPollingIngestService::without_active_polling_source()), }) .await; @@ -1542,7 +1544,7 @@ async fn test_set_config_for_hard_compaction_fails() { #[test_log::test(tokio::test)] async fn test_anonymous_setters_fail() { let harness = FlowConfigHarness::with_overrides(FlowRunsHarnessOverrides { - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -1599,7 +1601,7 @@ async fn test_anonymous_setters_fail() { #[derive(Default)] struct FlowRunsHarnessOverrides { - transform_service_mock: Option, + transform_planner_mock: Option, polling_service_mock: Option, } @@ -1616,26 +1618,24 @@ impl FlowConfigHarness { let datasets_dir = tempdir.path().join("datasets"); std::fs::create_dir(&datasets_dir).unwrap(); - let transform_service_mock = overrides.transform_service_mock.unwrap_or_default(); + let transform_planner_mock = overrides.transform_planner_mock.unwrap_or_default(); let polling_service_mock = overrides.polling_service_mock.unwrap_or_default(); let catalog_base = { let mut b = dill::CatalogBuilder::new(); b.add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .add::() .add_value(polling_service_mock) .bind::() - .add_value(transform_service_mock) - .bind::() + .add_value(transform_planner_mock) + .bind::() .add::() .add::() .add::() diff --git a/src/adapter/graphql/tests/tests/test_gql_dataset_flow_runs.rs b/src/adapter/graphql/tests/tests/test_gql_dataset_flow_runs.rs index 426700e539..3412623c7e 100644 --- a/src/adapter/graphql/tests/tests/test_gql_dataset_flow_runs.rs +++ b/src/adapter/graphql/tests/tests/test_gql_dataset_flow_runs.rs @@ -20,11 +20,12 @@ use kamu::testing::{ MockDatasetChangesService, MockDependencyGraphRepository, MockPollingIngestService, - MockTransformService, + MockTransformRequestPlanner, }; use kamu::{ CreateDatasetFromSnapshotUseCaseImpl, DatasetOwnershipServiceInMemory, + DatasetRegistryRepoBridge, DatasetRepositoryLocalFs, DatasetRepositoryWriter, DependencyGraphServiceInMemory, @@ -50,7 +51,8 @@ use kamu_core::{ DependencyGraphRepository, PollingIngestService, PullResult, - TransformService, + TenancyConfig, + TransformRequestPlanner, MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, }; use kamu_flow_system::{ @@ -88,7 +90,7 @@ async fn 
test_trigger_ingest_root_dataset() { updated_watermark: None, }, )), - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -437,7 +439,7 @@ async fn test_trigger_reset_root_dataset_flow() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: Some(MockDependencyGraphRepository::no_dependencies()), dataset_changes_mock: Some(MockDatasetChangesService::default()), - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -601,7 +603,7 @@ async fn test_trigger_reset_root_dataset_flow_with_invalid_head() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: Some(MockDependencyGraphRepository::no_dependencies()), dataset_changes_mock: Some(MockDatasetChangesService::default()), - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -703,7 +705,7 @@ async fn test_trigger_execute_transform_derived_dataset() { updated_watermark: None, }, )), - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -924,7 +926,7 @@ async fn test_trigger_compaction_root_dataset() { updated_watermark: None, }, )), - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -1277,7 +1279,7 @@ async fn test_list_flows_with_filters_and_pagination() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -1682,7 +1684,7 @@ async fn test_list_flow_initiators() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -1775,7 +1777,7 @@ async fn test_conditions_not_met_for_flows() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::without_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::without_set_transform()), polling_service_mock: Some(MockPollingIngestService::without_active_polling_source()), }) .await; @@ -1858,7 +1860,7 @@ async fn 
test_incorrect_dataset_kinds_for_flow_type() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -1975,7 +1977,7 @@ async fn test_cancel_ingest_root_dataset() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -2047,7 +2049,7 @@ async fn test_cancel_running_transform_derived_dataset() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -2124,7 +2126,7 @@ async fn test_cancel_hard_compaction_root_dataset() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -2196,7 +2198,7 @@ async fn test_cancel_wrong_flow_id_fails() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -2240,7 +2242,7 @@ async fn test_cancel_foreign_flow_fails() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -2301,7 +2303,7 @@ async fn test_cancel_waiting_flow() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -2371,7 +2373,7 @@ async fn test_cancel_already_aborted_flow() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: 
Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -2456,7 +2458,7 @@ async fn test_cancel_already_succeeded_flow() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: Some(MockDependencyGraphRepository::no_dependencies()), dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -2536,7 +2538,7 @@ async fn test_history_of_completed_flow() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: Some(MockDependencyGraphRepository::no_dependencies()), dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -2678,7 +2680,7 @@ async fn test_execute_transfrom_flow_error_after_compaction() { updated_watermark: None, }, )), - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -2978,7 +2980,7 @@ async fn test_anonymous_operation_fails() { let harness = FlowRunsHarness::with_overrides(FlowRunsHarnessOverrides { dependency_graph_mock: None, dataset_changes_mock: None, - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -3024,7 +3026,7 @@ async fn test_config_snapshot_returned_correctly() { updated_watermark: None, }, )), - transform_service_mock: Some(MockTransformService::with_set_transform()), + transform_planner_mock: Some(MockTransformRequestPlanner::with_set_transform()), polling_service_mock: Some(MockPollingIngestService::with_active_polling_source()), }) .await; @@ -3155,7 +3157,7 @@ struct FlowRunsHarness { struct FlowRunsHarnessOverrides { dependency_graph_mock: Option, dataset_changes_mock: Option, - transform_service_mock: Option, + transform_planner_mock: Option, polling_service_mock: Option, } @@ -3167,7 +3169,7 @@ impl FlowRunsHarness { let dataset_changes_mock = overrides.dataset_changes_mock.unwrap_or_default(); let dependency_graph_mock = overrides.dependency_graph_mock.unwrap_or_default(); - let transform_service_mock = overrides.transform_service_mock.unwrap_or_default(); + let transform_planner_mock = overrides.transform_planner_mock.unwrap_or_default(); let polling_service_mock = overrides.polling_service_mock.unwrap_or_default(); let catalog_base = { @@ -3178,13 +3180,11 @@ impl FlowRunsHarness { .with_consumer_filter(messaging_outbox::ConsumerFilter::AllConsumers), ) .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .add_value(dataset_changes_mock) .bind::() @@ -3201,8 +3201,8 @@ impl FlowRunsHarness { )) .add::() .add::() - .add_value(transform_service_mock) - .bind::() + 
.add_value(transform_planner_mock) + .bind::() .add_value(polling_service_mock) .bind::() .add::() diff --git a/src/adapter/graphql/tests/tests/test_gql_datasets.rs b/src/adapter/graphql/tests/tests/test_gql_datasets.rs index 316021c4eb..e2fa28010e 100644 --- a/src/adapter/graphql/tests/tests/test_gql_datasets.rs +++ b/src/adapter/graphql/tests/tests/test_gql_datasets.rs @@ -29,7 +29,7 @@ use crate::utils::{authentication_catalogs, expect_anonymous_access_error}; #[test_log::test(tokio::test)] async fn dataset_by_id_does_not_exist() { - let harness = GraphQLDatasetsHarness::new(false).await; + let harness = GraphQLDatasetsHarness::new(TenancyConfig::SingleTenant).await; let res = harness.execute_anonymous_query(indoc!( r#" { @@ -57,7 +57,7 @@ async fn dataset_by_id_does_not_exist() { #[test_log::test(tokio::test)] async fn dataset_by_id() { - let harness = GraphQLDatasetsHarness::new(false).await; + let harness = GraphQLDatasetsHarness::new(TenancyConfig::SingleTenant).await; let foo_result = harness .create_root_dataset(None, DatasetName::new_unchecked("foo")) @@ -107,8 +107,11 @@ async fn dataset_by_account_and_name_case_insensitive() { .with(eq(account_name.clone())) .returning(|_| Ok(Some(Account::dummy()))); - let harness = - GraphQLDatasetsHarness::new_custom_authentication(mock_authentication_service, true).await; + let harness = GraphQLDatasetsHarness::new_custom_authentication( + mock_authentication_service, + TenancyConfig::MultiTenant, + ) + .await; harness .create_root_dataset( @@ -160,8 +163,11 @@ async fn dataset_by_account_id() { .with(eq(DEFAULT_ACCOUNT_ID.clone())) .returning(|_| Ok(Some(DEFAULT_ACCOUNT_NAME.clone()))); - let harness = - GraphQLDatasetsHarness::new_custom_authentication(mock_authentication_service, false).await; + let harness = GraphQLDatasetsHarness::new_custom_authentication( + mock_authentication_service, + TenancyConfig::SingleTenant, + ) + .await; harness .create_root_dataset(None, DatasetName::new_unchecked("Foo")) .await; @@ -210,7 +216,7 @@ async fn dataset_by_account_id() { #[test_log::test(tokio::test)] async fn dataset_create_empty() { - let harness = GraphQLDatasetsHarness::new(false).await; + let harness = GraphQLDatasetsHarness::new(TenancyConfig::SingleTenant).await; let request_code = indoc::indoc!( r#" @@ -252,7 +258,7 @@ async fn dataset_create_empty() { #[test_log::test(tokio::test)] async fn dataset_create_from_snapshot() { - let harness = GraphQLDatasetsHarness::new(true).await; + let harness = GraphQLDatasetsHarness::new(TenancyConfig::MultiTenant).await; let snapshot = MetadataFactory::dataset_snapshot() .name("foo") @@ -308,7 +314,7 @@ async fn dataset_create_from_snapshot() { #[test_log::test(tokio::test)] async fn dataset_create_from_snapshot_malformed() { - let harness = GraphQLDatasetsHarness::new(false).await; + let harness = GraphQLDatasetsHarness::new(TenancyConfig::SingleTenant).await; let res = harness .execute_authorized_query(indoc!( @@ -342,7 +348,7 @@ async fn dataset_create_from_snapshot_malformed() { #[test_log::test(tokio::test)] async fn dataset_rename_success() { - let harness = GraphQLDatasetsHarness::new(false).await; + let harness = GraphQLDatasetsHarness::new(TenancyConfig::SingleTenant).await; let foo_result = harness .create_root_dataset(None, DatasetName::new_unchecked("foo")) @@ -394,7 +400,7 @@ async fn dataset_rename_success() { #[test_log::test(tokio::test)] async fn dataset_rename_no_changes() { - let harness = GraphQLDatasetsHarness::new(false).await; + let harness = 
GraphQLDatasetsHarness::new(TenancyConfig::SingleTenant).await; let foo_result = harness .create_root_dataset(None, DatasetName::new_unchecked("foo")) @@ -444,7 +450,7 @@ async fn dataset_rename_no_changes() { #[test_log::test(tokio::test)] async fn dataset_rename_name_collision() { - let harness = GraphQLDatasetsHarness::new(false).await; + let harness = GraphQLDatasetsHarness::new(TenancyConfig::SingleTenant).await; let foo_result = harness .create_root_dataset(None, DatasetName::new_unchecked("foo")) @@ -497,7 +503,7 @@ async fn dataset_rename_name_collision() { #[test_log::test(tokio::test)] async fn dataset_delete_success() { - let harness = GraphQLDatasetsHarness::new(false).await; + let harness = GraphQLDatasetsHarness::new(TenancyConfig::SingleTenant).await; harness.init_dependencies_graph().await; let foo_result = harness @@ -547,7 +553,7 @@ async fn dataset_delete_success() { #[test_log::test(tokio::test)] async fn dataset_delete_dangling_ref() { - let harness = GraphQLDatasetsHarness::new(false).await; + let harness = GraphQLDatasetsHarness::new(TenancyConfig::SingleTenant).await; harness.init_dependencies_graph().await; let foo_result = harness @@ -605,7 +611,7 @@ async fn dataset_delete_dangling_ref() { #[test_log::test(tokio::test)] async fn dataset_view_permissions() { - let harness = GraphQLDatasetsHarness::new(false).await; + let harness = GraphQLDatasetsHarness::new(TenancyConfig::SingleTenant).await; let foo_result = harness .create_root_dataset(None, DatasetName::new_unchecked("foo")) @@ -660,14 +666,13 @@ struct GraphQLDatasetsHarness { } impl GraphQLDatasetsHarness { - pub async fn new(is_multi_tenant: bool) -> Self { - Self::new_custom_authentication(MockAuthenticationService::built_in(), is_multi_tenant) - .await + pub async fn new(tenancy_config: TenancyConfig) -> Self { + Self::new_custom_authentication(MockAuthenticationService::built_in(), tenancy_config).await } pub async fn new_custom_authentication( mock_authentication_service: MockAuthenticationService, - is_multi_tenant: bool, + tenancy_config: TenancyConfig, ) -> Self { let tempdir = tempfile::tempdir().unwrap(); let datasets_dir = tempdir.path().join("datasets"); @@ -686,13 +691,11 @@ impl GraphQLDatasetsHarness { .add::() .add::() .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(is_multi_tenant), - ) + .add_value(tenancy_config) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add_value(mock_authentication_service) .bind::() .add::(); diff --git a/src/adapter/graphql/tests/tests/test_gql_metadata.rs b/src/adapter/graphql/tests/tests/test_gql_metadata.rs index c7917a1fd2..d8755d6163 100644 --- a/src/adapter/graphql/tests/tests/test_gql_metadata.rs +++ b/src/adapter/graphql/tests/tests/test_gql_metadata.rs @@ -33,13 +33,11 @@ async fn test_current_push_sources() { b.add_value(RunInfoDir::new(tempdir.path().join("run"))) .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .add::() .add::() diff --git a/src/adapter/graphql/tests/tests/test_gql_metadata_chain.rs b/src/adapter/graphql/tests/tests/test_gql_metadata_chain.rs index 8c9cf28590..805ae8d247 100644 --- a/src/adapter/graphql/tests/tests/test_gql_metadata_chain.rs +++ 
b/src/adapter/graphql/tests/tests/test_gql_metadata_chain.rs @@ -27,7 +27,7 @@ use crate::utils::{authentication_catalogs, expect_anonymous_access_error}; #[test_log::test(tokio::test)] async fn test_metadata_chain_events() { - let harness = GraphQLMetadataChainHarness::new(false).await; + let harness = GraphQLMetadataChainHarness::new(TenancyConfig::SingleTenant).await; let create_dataset = harness .catalog_authorized @@ -174,7 +174,7 @@ async fn test_metadata_chain_events() { #[test_log::test(tokio::test)] async fn metadata_chain_append_event() { - let harness = GraphQLMetadataChainHarness::new(false).await; + let harness = GraphQLMetadataChainHarness::new(TenancyConfig::SingleTenant).await; let create_dataset = harness .catalog_authorized @@ -259,7 +259,7 @@ async fn metadata_chain_append_event() { #[test_log::test(tokio::test)] async fn metadata_update_readme_new() { - let harness = GraphQLMetadataChainHarness::new(false).await; + let harness = GraphQLMetadataChainHarness::new(TenancyConfig::SingleTenant).await; let create_dataset = harness .catalog_authorized @@ -518,7 +518,7 @@ struct GraphQLMetadataChainHarness { } impl GraphQLMetadataChainHarness { - async fn new(is_multi_tenant: bool) -> Self { + async fn new(tenancy_config: TenancyConfig) -> Self { let tempdir = tempfile::tempdir().unwrap(); let datasets_dir = tempdir.path().join("datasets"); std::fs::create_dir(&datasets_dir).unwrap(); @@ -532,13 +532,11 @@ impl GraphQLMetadataChainHarness { .add::() .add::() .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(is_multi_tenant), - ) + .add_value(tenancy_config) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::(); database_common::NoOpDatabasePlugin::init_database_components(&mut b); diff --git a/src/adapter/graphql/tests/tests/test_gql_search.rs b/src/adapter/graphql/tests/tests/test_gql_search.rs index 28a045c102..7d6698bfeb 100644 --- a/src/adapter/graphql/tests/tests/test_gql_search.rs +++ b/src/adapter/graphql/tests/tests/test_gql_search.rs @@ -17,6 +17,8 @@ use messaging_outbox::DummyOutboxImpl; use opendatafabric::*; use time_source::SystemTimeSourceDefault; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[tokio::test] async fn test_search_query() { let tempdir = tempfile::tempdir().unwrap(); @@ -29,13 +31,11 @@ async fn test_search_query() { .add::() .add_value(CurrentAccountSubject::new_test()) .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .build(); @@ -204,3 +204,5 @@ async fn test_search_query() { }) ); } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/http/src/data/ingest_handler.rs b/src/adapter/http/src/data/ingest_handler.rs index 66ad27f04d..ade5042b3c 100644 --- a/src/adapter/http/src/data/ingest_handler.rs +++ b/src/adapter/http/src/data/ingest_handler.rs @@ -7,15 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. 
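The GraphQL test-harness hunks above all converge on one wiring pattern: tenancy is injected as a `TenancyConfig` value instead of a repository builder flag, and `DatasetRegistryRepoBridge` exposes the local-FS repository through the new `DatasetRegistry` trait. Below is a minimal standalone sketch of that catalog setup; the generic arguments to `bind` and the import paths, which this rendering dropped, are filled in as assumptions based on the imports visible elsewhere in the patch.

use dill::CatalogBuilder;
use kamu::{DatasetRegistryRepoBridge, DatasetRepositoryLocalFs, DatasetRepositoryWriter};
use kamu_core::{DatasetRepository, TenancyConfig};

fn build_test_catalog(datasets_dir: std::path::PathBuf) -> dill::Catalog {
    CatalogBuilder::new()
        // Tenancy mode is now a plain value in the catalog, not a repository builder flag.
        .add_value(TenancyConfig::SingleTenant)
        .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir))
        .bind::<dyn DatasetRepository, DatasetRepositoryLocalFs>()
        .bind::<dyn DatasetRepositoryWriter, DatasetRepositoryLocalFs>()
        // Bridges the repository into the DatasetRegistry abstraction used by services.
        .add::<DatasetRegistryRepoBridge>()
        .build()
}

With this arrangement the repository keeps its storage duties, while listing and resolution in services and tests go through `DatasetRegistry`, which is exactly what the `DatasetRegistryRepoBridge` registration provides.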
-// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - +use auth::{DatasetActionAuthorizer, DatasetActionUnauthorizedError}; use axum::extract::{Extension, Query}; use chrono::{DateTime, Utc}; use database_common_macros::transactional_handler; @@ -127,10 +119,28 @@ pub async fn dataset_ingest_handler( Some(time_source.now()) }); + // Resolve dataset + let dataset_registry = catalog.get_one::().unwrap(); + let resolved_dataset = dataset_registry + .get_dataset_by_ref(&dataset_ref) + .await + .map_err(ApiError::not_found)?; + + // Authorization check + let authorizer = catalog.get_one::().unwrap(); + authorizer + .check_action_allowed(resolved_dataset.get_handle(), auth::DatasetAction::Write) + .await + .map_err(|e| match e { + DatasetActionUnauthorizedError::Access(_) => ApiError::new_forbidden(), + DatasetActionUnauthorizedError::Internal(e) => e.api_err(), + })?; + + // Run ingestion let ingest_svc = catalog.get_one::().unwrap(); match ingest_svc .ingest_from_file_stream( - &dataset_ref, + resolved_dataset, params.source_name.as_deref(), arguments.data_stream, PushIngestOpts { diff --git a/src/adapter/http/src/data/metadata_handler.rs b/src/adapter/http/src/data/metadata_handler.rs index 2d54de3a98..c1fdf0cb15 100644 --- a/src/adapter/http/src/data/metadata_handler.rs +++ b/src/adapter/http/src/data/metadata_handler.rs @@ -7,15 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - use axum::extract::{Extension, Query}; use axum::response::Json; use comma_separated::CommaSeparatedSet; @@ -149,9 +140,9 @@ pub async fn dataset_metadata_handler( ) -> Result, ApiError> { use kamu_core::{metadata_chain_visitors as vis, MetadataChainExt as _}; - let dataset_repo = catalog.get_one::().unwrap(); - let dataset = dataset_repo - .find_dataset_by_ref(&dataset_ref) + let dataset_registry = catalog.get_one::().unwrap(); + let resolved_dataset = dataset_registry + .get_dataset_by_ref(&dataset_ref) .await .api_err()?; @@ -189,7 +180,7 @@ pub async fn dataset_metadata_handler( &mut vocab_visitor, ]; - dataset + resolved_dataset .as_metadata_chain() .accept(&mut visitors) .await @@ -223,7 +214,7 @@ pub async fn dataset_metadata_handler( let refs = if !params.include.contains(&Include::Refs) { None } else { - dataset + resolved_dataset .as_metadata_chain() .try_get_ref(&BlockRef::Head) .await diff --git a/src/adapter/http/src/data/query_handler.rs b/src/adapter/http/src/data/query_handler.rs index ee7cb165d4..0ce6942130 100644 --- a/src/adapter/http/src/data/query_handler.rs +++ b/src/adapter/http/src/data/query_handler.rs @@ -7,15 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. 
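The ingest and metadata handler hunks above share one flow: resolve through `DatasetRegistry`, authorize against the dataset handle, then operate on the `ResolvedDataset`. A condensed sketch of that flow follows, with the generic arguments that this rendering dropped restored; the module paths are assumptions taken from the imports shown in the patch, so treat it as an illustration rather than the exact handler code.

use dill::Catalog;
use http_common::{ApiError, IntoApiError};
use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer, DatasetActionUnauthorizedError};
use kamu_core::{DatasetRegistry, DatasetRegistryExt, ResolvedDataset};
use opendatafabric::DatasetRef;

async fn resolve_for_write(
    catalog: &Catalog,
    dataset_ref: &DatasetRef,
) -> Result<ResolvedDataset, ApiError> {
    // Resolution goes through the registry rather than DatasetRepository.
    let registry = catalog.get_one::<dyn DatasetRegistry>().unwrap();
    let resolved_dataset = registry
        .get_dataset_by_ref(dataset_ref)
        .await
        .map_err(ApiError::not_found)?;

    // Authorization is checked against the handle before any work happens.
    let authorizer = catalog.get_one::<dyn DatasetActionAuthorizer>().unwrap();
    authorizer
        .check_action_allowed(resolved_dataset.get_handle(), DatasetAction::Write)
        .await
        .map_err(|e| match e {
            DatasetActionUnauthorizedError::Access(_) => ApiError::new_forbidden(),
            DatasetActionUnauthorizedError::Internal(e) => e.api_err(),
        })?;

    Ok(resolved_dataset)
}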
-// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - use axum::extract::{Extension, Query}; use axum::response::Json; use database_common_macros::transactional_handler; diff --git a/src/adapter/http/src/data/query_types.rs b/src/adapter/http/src/data/query_types.rs index b5daa92b8d..cf1c098382 100644 --- a/src/adapter/http/src/data/query_types.rs +++ b/src/adapter/http/src/data/query_types.rs @@ -7,15 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - use std::collections::BTreeSet; use http_common::comma_separated::CommaSeparatedSet; diff --git a/src/adapter/http/src/data/router.rs b/src/adapter/http/src/data/router.rs index 69b18338e1..54a0f9703f 100644 --- a/src/adapter/http/src/data/router.rs +++ b/src/adapter/http/src/data/router.rs @@ -7,15 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - use utoipa_axum::router::OpenApiRouter; use utoipa_axum::routes; diff --git a/src/adapter/http/src/data/tail_handler.rs b/src/adapter/http/src/data/tail_handler.rs index f138f91fb2..5b73a6c9f9 100644 --- a/src/adapter/http/src/data/tail_handler.rs +++ b/src/adapter/http/src/data/tail_handler.rs @@ -7,15 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - use axum::extract::{Extension, Query}; use axum::response::Json; use database_common_macros::transactional_handler; diff --git a/src/adapter/http/src/data/verify_handler.rs b/src/adapter/http/src/data/verify_handler.rs index 428a3d9e0d..288557cbb9 100644 --- a/src/adapter/http/src/data/verify_handler.rs +++ b/src/adapter/http/src/data/verify_handler.rs @@ -7,15 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
- use std::error::Error; use axum::extract::Extension; diff --git a/src/adapter/http/src/data/verify_types.rs b/src/adapter/http/src/data/verify_types.rs index 2f6f6f7e40..d6b0c0c382 100644 --- a/src/adapter/http/src/data/verify_types.rs +++ b/src/adapter/http/src/data/verify_types.rs @@ -7,15 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - use std::borrow::Cow; use axum::response::Json; diff --git a/src/adapter/http/src/e2e/system_time_handler.rs b/src/adapter/http/src/e2e/system_time_handler.rs index 8bd6434b9a..204805de3b 100644 --- a/src/adapter/http/src/e2e/system_time_handler.rs +++ b/src/adapter/http/src/e2e/system_time_handler.rs @@ -7,15 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - use axum::extract::Extension; use axum::response::Json; use chrono::{DateTime, Utc}; diff --git a/src/adapter/http/src/general/account_handler.rs b/src/adapter/http/src/general/account_handler.rs index 23531c5799..ac6b412bf1 100644 --- a/src/adapter/http/src/general/account_handler.rs +++ b/src/adapter/http/src/general/account_handler.rs @@ -7,15 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - use axum::extract::Extension; use axum::response::Json; use database_common_macros::transactional_handler; diff --git a/src/adapter/http/src/general/dataset_info_handler.rs b/src/adapter/http/src/general/dataset_info_handler.rs index 71feb0de2f..4e4ca490ad 100644 --- a/src/adapter/http/src/general/dataset_info_handler.rs +++ b/src/adapter/http/src/general/dataset_info_handler.rs @@ -7,22 +7,13 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
- use axum::extract::{Extension, Path}; use axum::response::Json; use database_common_macros::transactional_handler; use dill::Catalog; use http_common::*; use kamu_accounts::AuthenticationService; -use kamu_core::{DatasetRepository, GetDatasetError}; +use kamu_core::{DatasetRegistry, GetDatasetError}; use opendatafabric::{AccountID, AccountName, DatasetHandle, DatasetID, DatasetName}; use crate::axum_utils::ensure_authenticated_account; @@ -108,9 +99,9 @@ async fn get_dataset_by_id( // to access dataset and not reject non-authed users ensure_authenticated_account(catalog).api_err()?; - let dataset_repo = catalog.get_one::().unwrap(); - let dataset_handle = dataset_repo - .resolve_dataset_ref(&dataset_id.clone().as_local_ref()) + let dataset_registry = catalog.get_one::().unwrap(); + let dataset_handle = dataset_registry + .resolve_dataset_handle_by_ref(&dataset_id.clone().as_local_ref()) .await .map_err(|err| match err { GetDatasetError::NotFound(e) => ApiError::not_found(e), diff --git a/src/adapter/http/src/general/node_info_handler.rs b/src/adapter/http/src/general/node_info_handler.rs index 37add0b33c..9bf10cb8a5 100644 --- a/src/adapter/http/src/general/node_info_handler.rs +++ b/src/adapter/http/src/general/node_info_handler.rs @@ -7,20 +7,11 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - use axum::extract::Extension; use axum::response::Json; use dill::Catalog; use http_common::*; -use kamu_core::DatasetRepository; +use kamu_core::TenancyConfig; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -52,10 +43,10 @@ pub async fn node_info_handler( } fn get_node_info(catalog: &Catalog) -> Json { - let dataset_repo = catalog.get_one::().unwrap(); + let tenancy_config = catalog.get_one::().unwrap(); Json(NodeInfoResponse { - is_multi_tenant: dataset_repo.is_multi_tenant(), + is_multi_tenant: *tenancy_config == TenancyConfig::MultiTenant, }) } diff --git a/src/adapter/http/src/general/router.rs b/src/adapter/http/src/general/router.rs index a6ec5a73b0..4c63adbb46 100644 --- a/src/adapter/http/src/general/router.rs +++ b/src/adapter/http/src/general/router.rs @@ -7,15 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
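A side effect of the enum is visible in `node_info_handler` above: callers no longer ask the repository whether it is multi-tenant, they read `TenancyConfig` back from the catalog. A minimal sketch of that lookup, assuming the turbofish argument to `get_one` that this rendering lost:

use dill::Catalog;
use kamu_core::TenancyConfig;

fn is_multi_tenant(catalog: &Catalog) -> bool {
    // TenancyConfig is registered via `add_value(...)`, so it resolves as a shared value.
    let tenancy_config = catalog.get_one::<TenancyConfig>().unwrap();
    *tenancy_config == TenancyConfig::MultiTenant
}

The same enum drives the dataset resolver layer further below, which now matches on `TenancyConfig` instead of branching on a boolean.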
- use utoipa_axum::router::OpenApiRouter; use utoipa_axum::routes; diff --git a/src/adapter/http/src/http_server_dataset_router.rs b/src/adapter/http/src/http_server_dataset_router.rs index e275260fd2..65792d6765 100644 --- a/src/adapter/http/src/http_server_dataset_router.rs +++ b/src/adapter/http/src/http_server_dataset_router.rs @@ -11,6 +11,7 @@ use axum::{Extension, Json}; use database_common_macros::transactional_handler; use dill::Catalog; use http_common::{ApiError, ApiErrorResponse, IntoApiError, ResultIntoApiError}; +use kamu_core::TenancyConfig; use opendatafabric as odf; use serde::{Deserialize, Serialize}; use utoipa_axum::router::OpenApiRouter; @@ -58,22 +59,21 @@ pub fn smart_transfer_protocol_router() -> OpenApiRouter { pub fn add_dataset_resolver_layer( dataset_router: OpenApiRouter, - multi_tenant: bool, + tenancy_config: TenancyConfig, ) -> OpenApiRouter { use axum::extract::Path; - if multi_tenant { - dataset_router.layer(DatasetResolverLayer::new( + match tenancy_config { + TenancyConfig::MultiTenant => dataset_router.layer(DatasetResolverLayer::new( |Path(p): Path| { odf::DatasetAlias::new(Some(p.account_name), p.dataset_name).into_local_ref() }, is_dataset_optional_for_request, - )) - } else { - dataset_router.layer(DatasetResolverLayer::new( + )), + TenancyConfig::SingleTenant => dataset_router.layer(DatasetResolverLayer::new( |Path(p): Path| p.dataset_name.as_local_ref(), is_dataset_optional_for_request, - )) + )), } } diff --git a/src/adapter/http/src/middleware/dataset_authorization_layer.rs b/src/adapter/http/src/middleware/dataset_authorization_layer.rs index 4dc488ddb7..43d3931411 100644 --- a/src/adapter/http/src/middleware/dataset_authorization_layer.rs +++ b/src/adapter/http/src/middleware/dataset_authorization_layer.rs @@ -12,9 +12,11 @@ use std::task::{Context, Poll}; use axum::body::Body; use axum::response::Response; +use database_common::DatabaseTransactionRunner; use futures::Future; +use internal_error::InternalError; use kamu_accounts::CurrentAccountSubject; -use kamu_core::GetDatasetError; +use kamu_core::{DatasetRegistry, GetDatasetError}; use opendatafabric::DatasetRef; use tower::{Layer, Service}; @@ -101,14 +103,6 @@ where .get::() .expect("Catalog not found in http server extensions"); - let dataset_action_authorizer = catalog - .get_one::() - .unwrap(); - - let dataset_repo = catalog - .get_one::() - .unwrap(); - let dataset_ref = request .extensions() .get::() @@ -116,35 +110,68 @@ where let action = dataset_action_query(&request); - match dataset_repo.resolve_dataset_ref(dataset_ref).await { - Ok(dataset_handle) => { - if let Err(err) = dataset_action_authorizer - .check_action_allowed(&dataset_handle, action) - .await - { - if let Err(err_result) = Self::check_logged_in(catalog) { - tracing::error!( - "Dataset '{}' {} access denied: user not logged in", - dataset_ref, - action - ); - return Ok(err_result); - } + enum CheckResult { + Proceed, + ErrorResponse(Response), + } - tracing::error!( - "Dataset '{}' {} access denied: {:?}", - dataset_ref, - action, - err - ); - return Ok(forbidden_access_response()); - } + let check_result: Result = + DatabaseTransactionRunner::new(catalog.clone()) + .transactional(|transaction_catalog| async move { + let dataset_registry = transaction_catalog + .get_one::() + .unwrap(); + let dataset_action_authorizer = transaction_catalog + .get_one::() + .unwrap(); + + match dataset_registry + .resolve_dataset_handle_by_ref(dataset_ref) + .await + { + Ok(dataset_handle) => { + if let Err(err) = 
dataset_action_authorizer + .check_action_allowed(&dataset_handle, action) + .await + { + if let Err(err_result) = Self::check_logged_in(catalog) { + tracing::error!( + "Dataset '{}' {} access denied: user not logged in", + dataset_ref, + action + ); + return Ok(CheckResult::ErrorResponse(err_result)); + } + + tracing::error!( + "Dataset '{}' {} access denied: {:?}", + dataset_ref, + action, + err + ); + return Ok(CheckResult::ErrorResponse( + forbidden_access_response(), + )); + } + + Ok(CheckResult::Proceed) + } + Err(GetDatasetError::NotFound(_)) => Ok(CheckResult::Proceed), + Err(GetDatasetError::Internal(_)) => { + Ok(CheckResult::ErrorResponse(internal_server_error_response())) + } + } + }) + .await; + + match check_result { + Ok(CheckResult::Proceed) => inner.call(request).await, + Ok(CheckResult::ErrorResponse(r)) => Ok(r), + Err(err) => { + tracing::error!(error=?err, error_msg=%err, "DatasetAuthorizationLayer failed"); + Ok(internal_server_error_response()) } - Err(GetDatasetError::NotFound(_)) => {} - Err(GetDatasetError::Internal(_)) => return Ok(internal_server_error_response()), } - - inner.call(request).await }) } } diff --git a/src/adapter/http/src/middleware/dataset_resolver_layer.rs b/src/adapter/http/src/middleware/dataset_resolver_layer.rs index 33b80c83ea..d3eeb665eb 100644 --- a/src/adapter/http/src/middleware/dataset_resolver_layer.rs +++ b/src/adapter/http/src/middleware/dataset_resolver_layer.rs @@ -16,7 +16,9 @@ use axum::body::Body; use axum::extract::FromRequestParts; use axum::response::Response; use axum::RequestExt; -use kamu_core::{DatasetRepository, GetDatasetError}; +use database_common::DatabaseTransactionRunner; +use internal_error::InternalError; +use kamu_core::{DatasetRegistry, DatasetRegistryExt, GetDatasetError, ResolvedDataset}; use opendatafabric::DatasetRef; use tower::{Layer, Service}; @@ -141,21 +143,45 @@ where .get::() .expect("Catalog not found in http server extensions"); - let dataset_repo = catalog.get_one::().unwrap(); + enum CheckResult { + CheckedDataset(ResolvedDataset), + ErrorResponse(Response), + } - let dataset = match dataset_repo.find_dataset_by_ref(&dataset_ref).await { - Ok(ds) => ds, - Err(GetDatasetError::NotFound(err)) => { - tracing::warn!("Dataset not found: {:?}", err); - return Ok(not_found_response()); + let dataset_ref = dataset_ref.clone(); + + let check_result: Result = + DatabaseTransactionRunner::new(catalog.clone()) + .transactional(|transational_catalog| async move { + let dataset_registry = transational_catalog + .get_one::() + .unwrap(); + match dataset_registry.get_dataset_by_ref(&dataset_ref).await { + Ok(resolved_dataset) => { + Ok(CheckResult::CheckedDataset(resolved_dataset)) + } + Err(GetDatasetError::NotFound(err)) => { + tracing::warn!("Dataset not found: {:?}", err); + Ok(CheckResult::ErrorResponse(not_found_response())) + } + Err(err) => { + tracing::error!("Could not get dataset: {:?}", err); + Ok(CheckResult::ErrorResponse(internal_server_error_response())) + } + } + }) + .await; + + match check_result { + Ok(CheckResult::CheckedDataset(target)) => { + request.extensions_mut().insert((*target).clone()); } + Ok(CheckResult::ErrorResponse(r)) => return Ok(r), Err(err) => { - tracing::error!("Could not get dataset: {:?}", err); + tracing::error!(error=?err, error_msg=%err, "DatasetResolverLayer failed"); return Ok(internal_server_error_response()); } - }; - - request.extensions_mut().insert(dataset); + } } request.extensions_mut().insert(dataset_ref); diff --git 
a/src/adapter/http/src/middleware/mod.rs b/src/adapter/http/src/middleware/mod.rs index bf69975714..b53f76e632 100644 --- a/src/adapter/http/src/middleware/mod.rs +++ b/src/adapter/http/src/middleware/mod.rs @@ -11,10 +11,8 @@ mod authentication_layer; mod dataset_authorization_layer; mod dataset_resolver_layer; mod headers; -mod run_in_database_transaction_layer; pub use authentication_layer::*; pub use dataset_authorization_layer::*; pub use dataset_resolver_layer::*; pub use headers::*; -pub use run_in_database_transaction_layer::*; diff --git a/src/adapter/http/src/middleware/run_in_database_transaction_layer.rs b/src/adapter/http/src/middleware/run_in_database_transaction_layer.rs deleted file mode 100644 index e22d188bab..0000000000 --- a/src/adapter/http/src/middleware/run_in_database_transaction_layer.rs +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - -use std::pin::Pin; -use std::task::{Context, Poll}; - -use axum::body::Body; -use axum::response::{IntoResponse, Response}; -use database_common::DatabaseTransactionRunner; -use futures::Future; -use http_common::IntoApiError; -use internal_error::InternalError; -use tower::{Layer, Service}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Clone)] -pub struct RunInDatabaseTransactionLayer {} - -impl RunInDatabaseTransactionLayer { - pub fn new() -> Self { - Self {} - } -} - -impl Layer for RunInDatabaseTransactionLayer { - type Service = RunInDatabaseTransactionMiddleware; - - fn layer(&self, inner: InnerSvc) -> Self::Service { - RunInDatabaseTransactionMiddleware { inner } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Clone)] -pub struct RunInDatabaseTransactionMiddleware { - inner: Svc, -} - -impl Service> for RunInDatabaseTransactionMiddleware -where - InnerSvc: Service, Response = Response> + Send + Clone + 'static, - InnerSvc::Error: Send, - InnerSvc::Future: Send, -{ - type Response = InnerSvc::Response; - type Error = InnerSvc::Error; - type Future = Pin> + Send>>; - - fn poll_ready(&mut self, ctx: &mut Context<'_>) -> Poll> { - self.inner.poll_ready(ctx) - } - - fn call(&mut self, mut request: http::Request) -> Self::Future { - use tracing::Instrument; - - // Inspired by https://github.com/maxcountryman/axum-login/blob/5239b38b2698a3db3f92075b6ad430aea79c215a/axum-login/src/auth.rs - // TODO: PERF: Is cloning a performance concern? 
- let mut inner = self.inner.clone(); - - Box::pin(async move { - let base_catalog = request - .extensions() - .get::() - .expect("Catalog not found in http server extensions") - .clone(); - let transaction_runner = DatabaseTransactionRunner::new(base_catalog); - - transaction_runner - .transactional(|updated_catalog| async move { - request.extensions_mut().insert(updated_catalog); - - let inner_result = inner.call(request).await; - - Ok(inner_result) - }) - .instrument(tracing::debug_span!("RunInDatabaseTransactionMiddleware")) - .await - .unwrap_or_else(|e: InternalError| Ok(e.api_err().into_response())) - }) - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/http/src/simple_protocol/handlers.rs b/src/adapter/http/src/simple_protocol/handlers.rs index 14767a54f9..71fe265fa4 100644 --- a/src/adapter/http/src/simple_protocol/handlers.rs +++ b/src/adapter/http/src/simple_protocol/handlers.rs @@ -7,20 +7,12 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - use std::str::FromStr; use std::sync::Arc; use axum::response::IntoResponse; use axum_extra::typed_header::TypedHeader; +use database_common::DatabaseTransactionRunner; use http_common::*; use internal_error::ResultIntoInternal; use kamu_accounts::CurrentAccountSubject; @@ -304,23 +296,29 @@ pub async fn dataset_push_ws_upgrade_handler( let server_url_config = catalog.get_one::().unwrap(); let dataset_url = get_base_dataset_url(uri, &server_url_config.protocols.base_url_rest, 1); - let dataset_repo = catalog.get_one::().unwrap(); - - let dataset = match dataset_repo.find_dataset_by_ref(&dataset_ref).await { - Ok(ds) => Ok(Some(ds)), - Err(GetDatasetError::NotFound(_)) => { - // Make sure account in dataset ref being created and token account match - let CurrentAccountSubject::Logged(acc) = current_account_subject.as_ref() else { - unreachable!() - }; - if let Some(ref_account_name) = dataset_ref.account_name() { - if ref_account_name != &acc.account_name { - return Err(ApiError::new_forbidden()); + let maybe_dataset = { + let dataset_ref = dataset_ref.clone(); + DatabaseTransactionRunner::new(catalog.clone()) + .transactional_with(|dataset_registry: Arc| async move { + match dataset_registry.get_dataset_by_ref(&dataset_ref).await { + Ok(resolved_dataset) => Ok(Some((*resolved_dataset).clone())), + Err(GetDatasetError::NotFound(_)) => { + // Make sure account in dataset ref being created and token account match + let CurrentAccountSubject::Logged(acc) = current_account_subject.as_ref() + else { + unreachable!() + }; + if let Some(ref_account_name) = dataset_ref.account_name() { + if ref_account_name != &acc.account_name { + return Err(ApiError::new_forbidden()); + } + } + Ok(None) + } + Err(err) => Err(err.api_err()), } - } - Ok(None) - } - Err(err) => Err(err.api_err()), + }) + .await }?; Ok(ws.on_upgrade(|socket| { @@ -328,7 +326,7 @@ pub async fn dataset_push_ws_upgrade_handler( socket, catalog, dataset_ref, - dataset, + maybe_dataset, dataset_url, maybe_bearer_header, ) diff --git 
a/src/adapter/http/src/smart_protocol/axum_server_push_protocol.rs b/src/adapter/http/src/smart_protocol/axum_server_push_protocol.rs index f485e26956..be7ab9ebd6 100644 --- a/src/adapter/http/src/smart_protocol/axum_server_push_protocol.rs +++ b/src/adapter/http/src/smart_protocol/axum_server_push_protocol.rs @@ -49,7 +49,7 @@ pub struct AxumServerPushProtocolInstance { socket: axum::extract::ws::WebSocket, catalog: Catalog, dataset_ref: DatasetRef, - dataset: Option>, + maybe_dataset: Option>, dataset_url: Url, maybe_bearer_header: Option, } @@ -59,7 +59,7 @@ impl AxumServerPushProtocolInstance { socket: axum::extract::ws::WebSocket, catalog: Catalog, dataset_ref: DatasetRef, - dataset: Option>, + maybe_dataset: Option>, dataset_url: Url, maybe_bearer_header: Option, ) -> Self { @@ -67,7 +67,7 @@ impl AxumServerPushProtocolInstance { socket, catalog, dataset_ref, - dataset, + maybe_dataset, dataset_url, maybe_bearer_header, } @@ -196,7 +196,7 @@ impl AxumServerPushProtocolInstance { let mut new_blocks = self.try_handle_push_metadata_request(push_request).await?; if !new_blocks.is_empty() { - if self.dataset.is_none() { + if self.maybe_dataset.is_none() { tracing::info!("Dataset does not exist, trying to create from Seed block"); let dataset_alias = self @@ -237,7 +237,9 @@ impl AxumServerPushProtocolInstance { )) .await; match create_result { - Ok(create_result) => self.dataset = Some(create_result.dataset), + Ok(create_result) => { + self.maybe_dataset = Some(create_result.dataset); + } Err(ref _e @ CreateDatasetError::RefCollision(ref err)) => { return Err(PushServerError::RefCollision(RefCollisionError { id: err.id.clone(), @@ -248,7 +250,7 @@ impl AxumServerPushProtocolInstance { } Err(e) => { return Err(PushServerError::Internal(PhaseInternalError { - phase: TransferPhase::Push(PushPhase::ObjectsUploadProgress), + phase: TransferPhase::Push(PushPhase::EnsuringTargetExists), error: e.int_err(), })); } @@ -257,7 +259,7 @@ impl AxumServerPushProtocolInstance { loop { let should_continue = self - .try_handle_push_objects_request(self.dataset.as_ref().unwrap().clone()) + .try_handle_push_objects_request(self.maybe_dataset.as_ref().unwrap().clone()) .await?; if !should_continue { @@ -293,7 +295,7 @@ impl AxumServerPushProtocolInstance { // TODO: consider size estimate and maybe cancel too large pushes - let actual_head = if let Some(dataset) = self.dataset.as_ref() { + let actual_head = if let Some(dataset) = self.maybe_dataset.as_ref() { match dataset .as_metadata_chain() .resolve_ref(&BlockRef::Head) @@ -469,7 +471,7 @@ impl AxumServerPushProtocolInstance { tracing::debug!("Push client sent a complete request. 
Committing the dataset"); - let dataset = self.dataset.clone().unwrap(); + let dataset = self.maybe_dataset.clone().unwrap(); DatabaseTransactionRunner::new(self.catalog.clone()) .transactional_with( |append_dataset_metadata_batch: Arc| async move { diff --git a/src/adapter/http/src/smart_protocol/phases.rs b/src/adapter/http/src/smart_protocol/phases.rs index 716fd89c56..a916daa7e4 100644 --- a/src/adapter/http/src/smart_protocol/phases.rs +++ b/src/adapter/http/src/smart_protocol/phases.rs @@ -26,6 +26,7 @@ pub enum PushPhase { InitialRequest, MetadataRequest, ObjectsRequest, + EnsuringTargetExists, ObjectsUploadProgress, CompleteRequest, } @@ -65,6 +66,7 @@ impl fmt::Display for PushPhase { PushPhase::InitialRequest => "Initial Request", PushPhase::MetadataRequest => "Metadata Request", PushPhase::ObjectsRequest => "Objects Request", + PushPhase::EnsuringTargetExists => "Ensuring Target Dataset Exists", PushPhase::ObjectsUploadProgress => "Objects Upload Progress", PushPhase::CompleteRequest => "Complete Request", }; diff --git a/src/adapter/http/src/smart_protocol/ws_tungstenite_client.rs b/src/adapter/http/src/smart_protocol/ws_tungstenite_client.rs index cf4261782e..72990ce2d8 100644 --- a/src/adapter/http/src/smart_protocol/ws_tungstenite_client.rs +++ b/src/adapter/http/src/smart_protocol/ws_tungstenite_client.rs @@ -14,13 +14,10 @@ use dill::*; use futures::SinkExt; use headers::Header; use internal_error::{ErrorIntoInternal, InternalError, ResultIntoInternal}; -use kamu::utils::smart_transfer_protocol::{ - DatasetFactoryFn, - SmartTransferProtocolClient, - TransferOptions, -}; +use kamu::utils::smart_transfer_protocol::{SmartTransferProtocolClient, TransferOptions}; use kamu_core::*; -use opendatafabric::{AsTypedBlock, Multihash}; +use odf::AsTypedBlock; +use opendatafabric as odf; use serde::de::DeserializeOwned; use serde::Serialize; use tokio::net::TcpStream; @@ -61,7 +58,7 @@ impl WsSmartTransferProtocolClient { async fn pull_send_request( &self, socket: &mut TungsteniteStream, - dst_head: Option, + dst_head: Option, force_update_if_diverged: bool, ) -> Result { let pull_request_message = DatasetPullRequest { @@ -198,7 +195,7 @@ impl WsSmartTransferProtocolClient { &self, socket: &mut TungsteniteStream, transfer_plan: TransferPlan, - dst_head: Option<&Multihash>, + dst_head: Option<&odf::Multihash>, force_update_if_diverged: bool, visibility_for_created_dataset: DatasetVisibility, ) -> Result { @@ -250,8 +247,8 @@ impl WsSmartTransferProtocolClient { &self, socket: &mut TungsteniteStream, src_dataset: &dyn Dataset, - src_head: &Multihash, - dst_head: Option<&Multihash>, + src_head: &odf::Multihash, + dst_head: Option<&odf::Multihash>, force_update_if_diverged: bool, ) -> Result { tracing::debug!("Sending push metadata request"); @@ -518,7 +515,7 @@ impl SmartTransferProtocolClient for WsSmartTransferProtocolClient { &self, http_src_url: &Url, dst: Option>, - dst_factory: Option, + dst_alias: Option<&odf::DatasetAlias>, listener: Arc, transfer_options: TransferOptions, ) -> Result { @@ -639,7 +636,21 @@ impl SmartTransferProtocolClient for WsSmartTransferProtocolClient { message: "First metadata block is not Seed".to_owned(), source: None, })?; - let create_result = dst_factory.unwrap()(seed_block).await.int_err()?; + + let create_dataset_use_case = + self.catalog.get_one::().unwrap(); + let alias = + dst_alias.ok_or_else(|| "Destination dataset alias is unknown".int_err())?; + let create_result = create_dataset_use_case + .execute( + alias, + seed_block, + 
CreateDatasetUseCaseOptions { + dataset_visibility: transfer_options.visibility_for_created_dataset, + }, + ) + .await + .int_err()?; assert_eq!(first_hash, create_result.head); create_result.dataset }; @@ -736,7 +747,7 @@ impl SmartTransferProtocolClient for WsSmartTransferProtocolClient { &self, src: Arc, http_dst_url: &Url, - dst_head: Option<&Multihash>, + dst_head: Option<&odf::Multihash>, listener: Arc, transfer_options: TransferOptions, ) -> Result { @@ -884,7 +895,7 @@ impl SmartTransferProtocolClient for WsSmartTransferProtocolClient { self.export_group_of_object_files( &mut ws_stream, push_objects_response, - src, + src.clone(), transfer_options, ) .await?; diff --git a/src/adapter/http/tests/harness/client_side_harness.rs b/src/adapter/http/tests/harness/client_side_harness.rs index 01b44ce411..9f85679a4d 100644 --- a/src/adapter/http/tests/harness/client_side_harness.rs +++ b/src/adapter/http/tests/harness/client_side_harness.rs @@ -18,6 +18,7 @@ use dill::Component; use headers::Header; use internal_error::{InternalError, ResultIntoInternal}; use kamu::domain::*; +use kamu::utils::simple_transfer_protocol::SimpleTransferProtocol; use kamu::*; use kamu_accounts::CurrentAccountSubject; use kamu_adapter_http::{OdfSmtpVersion, SmartTransferProtocolClientWs}; @@ -48,14 +49,14 @@ const CLIENT_ACCOUNT_NAME: &str = "kamu-client"; pub(crate) struct ClientSideHarness { tempdir: TempDir, catalog: dill::Catalog, - pull_service: Arc, - push_service: Arc, + pull_dataset_use_case: Arc, + push_dataset_use_case: Arc, access_token_resover: Arc, options: ClientSideHarnessOptions, } pub(crate) struct ClientSideHarnessOptions { - pub multi_tenant: bool, + pub tenancy_config: TenancyConfig, pub authenticated_remotely: bool, } @@ -98,13 +99,12 @@ impl ClientSideHarness { b.add::(); - b.add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(options.multi_tenant), - ) - .bind::() - .bind::(); + b.add_value(options.tenancy_config); + + b.add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) + .bind::() + .bind::() + .add::(); b.add::(); @@ -125,16 +125,20 @@ impl ClientSideHarness { b.add::(); b.add::(); + b.add::(); b.add::(); + b.add::(); - b.add::(); + b.add::(); + b.add::(); + b.add::(); b.add::(); - b.add::(); + b.add::(); - b.add::(); + b.add::(); b.add::(); @@ -142,6 +146,8 @@ impl ClientSideHarness { b.add::(); b.add::(); b.add::(); + b.add::(); + b.add::(); b.add_value(ContainerRuntime::default()); b.add_value(kamu::utils::ipfs_wrapper::IpfsClient::default()); @@ -151,8 +157,8 @@ impl ClientSideHarness { let catalog = b.build(); - let pull_service = catalog.get_one::().unwrap(); - let push_service = catalog.get_one::().unwrap(); + let pull_dataset_use_case = catalog.get_one::().unwrap(); + let push_dataset_use_case = catalog.get_one::().unwrap(); let access_token_resover = catalog .get_one::() .unwrap(); @@ -160,23 +166,25 @@ impl ClientSideHarness { Self { tempdir, catalog, - pull_service, - push_service, + pull_dataset_use_case, + push_dataset_use_case, access_token_resover, options, } } pub fn operating_account_name(&self) -> Option { - if self.options.multi_tenant && self.options.authenticated_remotely { + if self.options.tenancy_config == TenancyConfig::MultiTenant + && self.options.authenticated_remotely + { Some(AccountName::new_unchecked(CLIENT_ACCOUNT_NAME)) } else { None } } - pub fn dataset_repository(&self) -> Arc { - self.catalog.get_one::().unwrap() + pub fn dataset_registry(&self) -> Arc { + 
self.catalog.get_one::().unwrap() } pub fn create_dataset_from_snapshot(&self) -> Arc { @@ -197,12 +205,12 @@ impl ClientSideHarness { // TODO: accept alias or handle pub fn dataset_layout(&self, dataset_id: &DatasetID, dataset_name: &str) -> DatasetLayout { - let root_path = if self.options.multi_tenant { - self.internal_datasets_folder_path() + let root_path = match self.options.tenancy_config { + TenancyConfig::MultiTenant => self + .internal_datasets_folder_path() .join(CLIENT_ACCOUNT_NAME) - .join(dataset_id.as_multibase().to_stack_string()) - } else { - self.internal_datasets_folder_path().join(dataset_name) + .join(dataset_id.as_multibase().to_stack_string()), + TenancyConfig::SingleTenant => self.internal_datasets_folder_path().join(dataset_name), }; DatasetLayout::new(root_path.as_path()) } @@ -212,10 +220,12 @@ impl ClientSideHarness { dataset_ref: DatasetRefAny, force: bool, ) -> Vec { - self.pull_service - .pull_multi( - vec![dataset_ref], - PullMultiOptions { + self.pull_dataset_use_case + .execute_multi( + vec![PullRequest::from_any_ref(&dataset_ref, |_| { + self.options.tenancy_config == TenancyConfig::SingleTenant + })], + PullOptions { sync_options: SyncOptions { create_if_not_exists: true, force, @@ -247,9 +257,15 @@ impl ClientSideHarness { force: bool, dataset_visibility: DatasetVisibility, ) -> Vec { - self.push_service - .push_multi( - vec![dataset_local_ref], + let dataset_handle = self + .dataset_registry() + .resolve_dataset_handle_by_ref(&dataset_local_ref) + .await + .unwrap(); + + self.push_dataset_use_case + .execute_multi( + vec![dataset_handle], PushMultiOptions { sync_options: SyncOptions { create_if_not_exists: true, @@ -263,6 +279,7 @@ impl ClientSideHarness { None, ) .await + .unwrap() } pub async fn push_dataset_result( diff --git a/src/adapter/http/tests/harness/common_harness.rs b/src/adapter/http/tests/harness/common_harness.rs index 5568775a8c..aa3c7d6886 100644 --- a/src/adapter/http/tests/harness/common_harness.rs +++ b/src/adapter/http/tests/harness/common_harness.rs @@ -154,15 +154,18 @@ async fn create_random_parquet_file( //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub(crate) async fn commit_add_data_event( - dataset_repo: &dyn DatasetRepository, + dataset_registry: &dyn DatasetRegistry, dataset_ref: &DatasetRef, dataset_layout: &DatasetLayout, prev_data_block_hash: Option, ) -> CommitResult { - let dataset = dataset_repo.find_dataset_by_ref(dataset_ref).await.unwrap(); + let resolved_dataset = dataset_registry + .get_dataset_by_ref(dataset_ref) + .await + .unwrap(); let (prev_offset, prev_checkpoint) = if let Some(prev_data_block_hash) = prev_data_block_hash { - let prev_data_block = dataset + let prev_data_block = resolved_dataset .as_metadata_chain() .get_block(&prev_data_block_hash) .await @@ -182,7 +185,7 @@ pub(crate) async fn commit_add_data_event( .await .build(); - dataset + resolved_dataset .commit_event(MetadataEvent::AddData(random_data), CommitOpts::default()) .await .unwrap() diff --git a/src/adapter/http/tests/harness/server_side_harness.rs b/src/adapter/http/tests/harness/server_side_harness.rs index ccd26681d7..767c863163 100644 --- a/src/adapter/http/tests/harness/server_side_harness.rs +++ b/src/adapter/http/tests/harness/server_side_harness.rs @@ -23,12 +23,12 @@ use kamu::domain::{ CompactionService, CreateDatasetFromSnapshotUseCase, CreateDatasetUseCase, - DatasetRepository, }; use kamu::testing::MockDatasetActionAuthorizer; use 
kamu::DatasetLayout; use kamu_accounts::testing::MockAuthenticationService; use kamu_accounts::{Account, AccountType, CurrentAccountSubject, PROVIDER_PASSWORD}; +use kamu_core::{DatasetRegistry, TenancyConfig}; use opendatafabric::{AccountID, AccountName, DatasetAlias, DatasetHandle}; use reqwest::Url; use time_source::SystemTimeSourceStub; @@ -42,7 +42,8 @@ pub(crate) const SERVER_ACCOUNT_NAME: &str = "kamu-server"; #[async_trait::async_trait] pub(crate) trait ServerSideHarness { fn operating_account_name(&self) -> Option; - fn cli_dataset_repository(&self) -> Arc; + + fn cli_dataset_registry(&self) -> Arc; fn cli_create_dataset_use_case(&self) -> Arc; @@ -74,7 +75,7 @@ pub(crate) trait ServerSideHarness { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub(crate) struct ServerSideHarnessOptions { - pub multi_tenant: bool, + pub tenancy_config: TenancyConfig, pub authorized_writes: bool, pub base_catalog: Option, } diff --git a/src/adapter/http/tests/harness/server_side_local_fs_harness.rs b/src/adapter/http/tests/harness/server_side_local_fs_harness.rs index 89fd8aa1a5..119ba57ac1 100644 --- a/src/adapter/http/tests/harness/server_side_local_fs_harness.rs +++ b/src/adapter/http/tests/harness/server_side_local_fs_harness.rs @@ -31,6 +31,7 @@ use kamu::{ CreateDatasetFromSnapshotUseCaseImpl, CreateDatasetUseCaseImpl, DatasetLayout, + DatasetRegistryRepoBridge, DatasetRepositoryLocalFs, DatasetRepositoryWriter, DependencyGraphServiceInMemory, @@ -40,6 +41,7 @@ use kamu::{ }; use kamu_accounts::testing::MockAuthenticationService; use kamu_accounts::{Account, AuthenticationService}; +use kamu_core::{DatasetRegistry, TenancyConfig}; use messaging_outbox::DummyOutboxImpl; use opendatafabric::{AccountName, DatasetAlias, DatasetHandle}; use tempfile::TempDir; @@ -101,14 +103,12 @@ impl ServerSideLocalFsHarness { .add_value(time_source.clone()) .bind::() .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(options.multi_tenant), - ) + .add_value(options.tenancy_config) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() .add_value(server_authentication_mock(&account)) + .add::() .bind::() .add_value(ServerUrlConfig::new_test(Some(&base_url_rest))) .add::() @@ -128,7 +128,7 @@ impl ServerSideLocalFsHarness { let api_server = TestAPIServer::new( create_web_user_catalog(&base_catalog, &options), listener, - options.multi_tenant, + options.tenancy_config, ); Self { @@ -157,16 +157,15 @@ impl ServerSideLocalFsHarness { #[async_trait::async_trait] impl ServerSideHarness for ServerSideLocalFsHarness { fn operating_account_name(&self) -> Option { - if self.options.multi_tenant { - Some(AccountName::new_unchecked(SERVER_ACCOUNT_NAME)) - } else { - None + match self.options.tenancy_config { + TenancyConfig::MultiTenant => Some(AccountName::new_unchecked(SERVER_ACCOUNT_NAME)), + TenancyConfig::SingleTenant => None, } } - fn cli_dataset_repository(&self) -> Arc { + fn cli_dataset_registry(&self) -> Arc { let cli_catalog = create_cli_user_catalog(&self.base_catalog); - cli_catalog.get_one::().unwrap() + cli_catalog.get_one::().unwrap() } fn cli_create_dataset_use_case(&self) -> Arc { @@ -206,8 +205,8 @@ impl ServerSideHarness for ServerSideLocalFsHarness { fn dataset_url_with_scheme(&self, dataset_alias: &DatasetAlias, scheme: &str) -> Url { let api_server_address = self.api_server_addr(); Url::from_str( - if self.options.multi_tenant { - 
format!( + match self.options.tenancy_config { + TenancyConfig::MultiTenant => format!( "{}://{}/{}/{}", scheme, api_server_address, @@ -217,12 +216,11 @@ impl ServerSideHarness for ServerSideLocalFsHarness { panic!("Account name not specified in alias"); }, dataset_alias.dataset_name - ) - } else { - format!( + ), + TenancyConfig::SingleTenant => format!( "{}://{}/{}", scheme, api_server_address, dataset_alias.dataset_name - ) + ), } .as_str(), ) @@ -230,8 +228,9 @@ impl ServerSideHarness for ServerSideLocalFsHarness { } fn dataset_layout(&self, dataset_handle: &DatasetHandle) -> DatasetLayout { - let root_path = if self.options.multi_tenant { - self.internal_datasets_folder_path() + let root_path = match self.options.tenancy_config { + TenancyConfig::MultiTenant => self + .internal_datasets_folder_path() .join( if let Some(account_name) = &dataset_handle.alias.account_name { account_name.to_string() @@ -239,10 +238,10 @@ impl ServerSideHarness for ServerSideLocalFsHarness { panic!("Account name not specified in alias"); }, ) - .join(dataset_handle.id.as_multibase().to_stack_string()) - } else { - self.internal_datasets_folder_path() - .join(dataset_handle.alias.dataset_name.clone()) + .join(dataset_handle.id.as_multibase().to_stack_string()), + TenancyConfig::SingleTenant => self + .internal_datasets_folder_path() + .join(dataset_handle.alias.dataset_name.clone()), }; DatasetLayout::new(root_path.as_path()) } diff --git a/src/adapter/http/tests/harness/server_side_s3_harness.rs b/src/adapter/http/tests/harness/server_side_s3_harness.rs index a2ede8e515..d969dab80e 100644 --- a/src/adapter/http/tests/harness/server_side_s3_harness.rs +++ b/src/adapter/http/tests/harness/server_side_s3_harness.rs @@ -33,6 +33,7 @@ use kamu::{ CreateDatasetFromSnapshotUseCaseImpl, CreateDatasetUseCaseImpl, DatasetLayout, + DatasetRegistryRepoBridge, DatasetRepositoryS3, DatasetRepositoryWriter, DependencyGraphServiceInMemory, @@ -42,6 +43,7 @@ use kamu::{ }; use kamu_accounts::testing::MockAuthenticationService; use kamu_accounts::{Account, AuthenticationService}; +use kamu_core::{DatasetRegistry, TenancyConfig}; use messaging_outbox::DummyOutboxImpl; use opendatafabric::{AccountName, DatasetAlias, DatasetHandle}; use time_source::{SystemTimeSource, SystemTimeSourceStub}; @@ -96,13 +98,11 @@ impl ServerSideS3Harness { .bind::() .add::() .add::() - .add_builder( - DatasetRepositoryS3::builder() - .with_s3_context(s3_context.clone()) - .with_multi_tenant(options.multi_tenant), - ) + .add_value(options.tenancy_config) + .add_builder(DatasetRepositoryS3::builder().with_s3_context(s3_context.clone())) .bind::() .bind::() + .add::() .add_value(server_authentication_mock(&account)) .bind::() .add_value(ServerUrlConfig::new_test(Some(&base_url_rest))) @@ -124,7 +124,7 @@ impl ServerSideS3Harness { let api_server = TestAPIServer::new( create_web_user_catalog(&base_catalog, &options), listener, - options.multi_tenant, + options.tenancy_config, ); Self { @@ -146,16 +146,15 @@ impl ServerSideS3Harness { #[async_trait::async_trait] impl ServerSideHarness for ServerSideS3Harness { fn operating_account_name(&self) -> Option { - if self.options.multi_tenant { - Some(AccountName::new_unchecked(SERVER_ACCOUNT_NAME)) - } else { - None + match self.options.tenancy_config { + TenancyConfig::MultiTenant => Some(AccountName::new_unchecked(SERVER_ACCOUNT_NAME)), + TenancyConfig::SingleTenant => None, } } - fn cli_dataset_repository(&self) -> Arc { + fn cli_dataset_registry(&self) -> Arc { let cli_catalog = 
create_cli_user_catalog(&self.base_catalog); - cli_catalog.get_one::().unwrap() + cli_catalog.get_one::().unwrap() } fn cli_create_dataset_use_case(&self) -> Arc { @@ -187,19 +186,18 @@ impl ServerSideHarness for ServerSideS3Harness { fn dataset_url_with_scheme(&self, dataset_alias: &DatasetAlias, scheme: &str) -> Url { let api_server_address = self.api_server_addr(); Url::from_str( - if self.options.multi_tenant { - format!( + match self.options.tenancy_config { + TenancyConfig::MultiTenant => format!( "{}://{}/{}/{}", scheme, api_server_address, dataset_alias.account_name.as_ref().unwrap(), dataset_alias.dataset_name - ) - } else { - format!( + ), + TenancyConfig::SingleTenant => format!( "{}://{}/{}", scheme, api_server_address, dataset_alias.dataset_name - ) + ), } .as_str(), ) diff --git a/src/adapter/http/tests/harness/test_api_server.rs b/src/adapter/http/tests/harness/test_api_server.rs index 40f52f2467..2ccce1c2a7 100644 --- a/src/adapter/http/tests/harness/test_api_server.rs +++ b/src/adapter/http/tests/harness/test_api_server.rs @@ -10,6 +10,7 @@ use std::net::SocketAddr; use dill::Catalog; +use kamu_core::TenancyConfig; use utoipa_axum::router::OpenApiRouter; use utoipa_axum::routes; @@ -21,7 +22,11 @@ pub struct TestAPIServer { } impl TestAPIServer { - pub fn new(catalog: Catalog, listener: tokio::net::TcpListener, multi_tenant: bool) -> Self { + pub fn new( + catalog: Catalog, + listener: tokio::net::TcpListener, + tenancy_config: TenancyConfig, + ) -> Self { let (router, _api) = OpenApiRouter::new() .routes(routes!(kamu_adapter_http::platform_login_handler)) .routes(routes!(kamu_adapter_http::platform_token_validate_handler)) @@ -35,16 +40,15 @@ impl TestAPIServer { .merge(kamu_adapter_http::data::root_router()) .merge(kamu_adapter_http::general::root_router()) .nest( - if multi_tenant { - "/:account_name/:dataset_name" - } else { - "/:dataset_name" + match tenancy_config { + TenancyConfig::MultiTenant => "/:account_name/:dataset_name", + TenancyConfig::SingleTenant => "/:dataset_name", }, kamu_adapter_http::add_dataset_resolver_layer( OpenApiRouter::new() .merge(kamu_adapter_http::smart_transfer_protocol_router()) .merge(kamu_adapter_http::data::dataset_router()), - multi_tenant, + tenancy_config, ), ) .layer( diff --git a/src/adapter/http/tests/tests/mod.rs b/src/adapter/http/tests/tests/mod.rs index eabae3bea9..dc13b4df7c 100644 --- a/src/adapter/http/tests/tests/mod.rs +++ b/src/adapter/http/tests/tests/mod.rs @@ -35,9 +35,12 @@ macro_rules! test_client_server_local_fs_harness_permutations { #[test_log::test(tokio::test)] async fn [<$test_name "_st_client_st_local_fs_server">] () { $test_package::$test_name( - ClientSideHarness::new(ClientSideHarnessOptions { multi_tenant: false, authenticated_remotely: true }), + ClientSideHarness::new(ClientSideHarnessOptions { + tenancy_config: TenancyConfig::SingleTenant, + authenticated_remotely: true + }), ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: false, + tenancy_config: TenancyConfig::SingleTenant, authorized_writes: true, base_catalog: None, }).await, @@ -50,9 +53,12 @@ macro_rules! 
test_client_server_local_fs_harness_permutations { #[test_log::test(tokio::test)] async fn [<$test_name "_st_client_mt_local_fs_server">] () { $test_package::$test_name( - ClientSideHarness::new(ClientSideHarnessOptions { multi_tenant: false, authenticated_remotely: true }), + ClientSideHarness::new(ClientSideHarnessOptions { + tenancy_config: TenancyConfig::SingleTenant, + authenticated_remotely: true + }), ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authorized_writes: true, base_catalog: None, }).await, @@ -65,9 +71,12 @@ macro_rules! test_client_server_local_fs_harness_permutations { #[test_log::test(tokio::test)] async fn [<$test_name "_mt_client_st_local_fs_server">] () { $test_package::$test_name( - ClientSideHarness::new(ClientSideHarnessOptions { multi_tenant: true, authenticated_remotely: true }), + ClientSideHarness::new(ClientSideHarnessOptions { + tenancy_config: TenancyConfig::MultiTenant, + authenticated_remotely: true + }), ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: false, + tenancy_config: TenancyConfig::SingleTenant, authorized_writes: true, base_catalog: None, }).await, @@ -80,9 +89,12 @@ macro_rules! test_client_server_local_fs_harness_permutations { #[test_log::test(tokio::test)] async fn [<$test_name "_mt_client_mt_local_fs_server">] () { $test_package::$test_name( - ClientSideHarness::new(ClientSideHarnessOptions { multi_tenant: true, authenticated_remotely: true }), + ClientSideHarness::new(ClientSideHarnessOptions { + tenancy_config: TenancyConfig::MultiTenant, + authenticated_remotely: true + }), ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authorized_writes: true, base_catalog: None, }).await, @@ -105,9 +117,12 @@ macro_rules! test_client_server_s3_harness_permutations { #[test_log::test(tokio::test)] async fn [<$test_name "_st_client_st_s3_server">] () { $test_package::$test_name( - ClientSideHarness::new(ClientSideHarnessOptions { multi_tenant: false, authenticated_remotely: true }), + ClientSideHarness::new(ClientSideHarnessOptions { + tenancy_config: TenancyConfig::SingleTenant, + authenticated_remotely: true + }), ServerSideS3Harness::new(ServerSideHarnessOptions { - multi_tenant: false, + tenancy_config: TenancyConfig::SingleTenant, authorized_writes: true, base_catalog: None, }).await, @@ -121,9 +136,12 @@ macro_rules! test_client_server_s3_harness_permutations { #[test_log::test(tokio::test)] async fn [<$test_name "_st_client_mt_s3_server">] () { $test_package::$test_name( - ClientSideHarness::new(ClientSideHarnessOptions { multi_tenant: false, authenticated_remotely: true }), + ClientSideHarness::new(ClientSideHarnessOptions { + tenancy_config: TenancyConfig::SingleTenant, + authenticated_remotely: true + }), ServerSideS3Harness::new(ServerSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authorized_writes: true, base_catalog: None, }).await, diff --git a/src/adapter/http/tests/tests/test_account_info.rs b/src/adapter/http/tests/tests/test_account_info.rs index 4dbc859122..96e12943e1 100644 --- a/src/adapter/http/tests/tests/test_account_info.rs +++ b/src/adapter/http/tests/tests/test_account_info.rs @@ -8,7 +8,7 @@ // by the Apache License, Version 2.0. 
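
Editor's note (illustrative sketch, not part of the patch): the harness permutation macros above now pass a `TenancyConfig` value where a `multi_tenant: bool` flag used to be. The enum definition below is an assumption inferred from its usage in this diff (only the two variants `SingleTenant` and `MultiTenant` appear), and `datasets_root` is a hypothetical helper that mirrors the `dataset_layout` branching shown in the harness hunks:

    // Assumed shape of the tenancy enum; the real definition lives in kamu_core.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum TenancyConfig {
        SingleTenant,
        MultiTenant,
    }

    // Hypothetical helper: matching on the enum replaces the old
    // `if multi_tenant { .. } else { .. }` checks when deciding where a
    // dataset directory lives on disk, as in the dataset_layout() changes above.
    fn datasets_root(
        tenancy_config: TenancyConfig,
        base: &std::path::Path,
        account_name: &str,
    ) -> std::path::PathBuf {
        match tenancy_config {
            TenancyConfig::MultiTenant => base.join(account_name),
            TenancyConfig::SingleTenant => base.to_path_buf(),
        }
    }

Call sites then read `ClientSideHarnessOptions { tenancy_config: TenancyConfig::SingleTenant, .. }` instead of `multi_tenant: false`, as the permutation macros above show.
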
use kamu_accounts::DUMMY_ACCESS_TOKEN; -use kamu_core::RunInfoDir; +use kamu_core::{RunInfoDir, TenancyConfig}; use serde_json::json; use crate::harness::*; @@ -17,7 +17,7 @@ use crate::harness::*; #[test_log::test(tokio::test)] async fn test_get_account_info_with_wrong_token() { - let harness = AccountInfoHarness::new(false).await; + let harness = AccountInfoHarness::new(TenancyConfig::SingleTenant).await; let client = async move { let cl = reqwest::Client::new(); @@ -42,7 +42,7 @@ async fn test_get_account_info_with_wrong_token() { #[test_log::test(tokio::test)] async fn test_get_account_info() { - let harness = AccountInfoHarness::new(false).await; + let harness = AccountInfoHarness::new(TenancyConfig::SingleTenant).await; let expected_account = harness.server_harness.api_server_account(); let client = async move { @@ -77,7 +77,7 @@ struct AccountInfoHarness { } impl AccountInfoHarness { - async fn new(is_multi_tenant: bool) -> Self { + async fn new(tenancy_config: TenancyConfig) -> Self { let run_info_dir = tempfile::tempdir().unwrap(); let catalog = dill::CatalogBuilder::new() @@ -85,7 +85,7 @@ impl AccountInfoHarness { .build(); let server_harness = ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: is_multi_tenant, + tenancy_config, authorized_writes: true, base_catalog: Some(catalog), }) diff --git a/src/adapter/http/tests/tests/test_data_ingest.rs b/src/adapter/http/tests/tests/test_data_ingest.rs index b58048b4e8..0b25effe3e 100644 --- a/src/adapter/http/tests/tests/test_data_ingest.rs +++ b/src/adapter/http/tests/tests/test_data_ingest.rs @@ -609,7 +609,7 @@ impl DataIngestHarness { .build(); let server_harness = ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authorized_writes: true, base_catalog: Some(catalog), }) diff --git a/src/adapter/http/tests/tests/test_data_query.rs b/src/adapter/http/tests/tests/test_data_query.rs index 719d28ddb8..14b1719921 100644 --- a/src/adapter/http/tests/tests/test_data_query.rs +++ b/src/adapter/http/tests/tests/test_data_query.rs @@ -58,7 +58,7 @@ impl Harness { .build(); let server_harness = ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authorized_writes: true, base_catalog: Some(catalog), }) diff --git a/src/adapter/http/tests/tests/test_dataset_authorization_layer.rs b/src/adapter/http/tests/tests/test_dataset_authorization_layer.rs index 0af6474a74..416bc02f8c 100644 --- a/src/adapter/http/tests/tests/test_dataset_authorization_layer.rs +++ b/src/adapter/http/tests/tests/test_dataset_authorization_layer.rs @@ -18,12 +18,14 @@ use kamu::domain::{CreateDatasetUseCase, DatasetRepository}; use kamu::testing::{MetadataFactory, MockDatasetActionAuthorizer}; use kamu::{ CreateDatasetUseCaseImpl, + DatasetRegistryRepoBridge, DatasetRepositoryLocalFs, DatasetRepositoryWriter, DependencyGraphServiceInMemory, }; use kamu_accounts::testing::MockAuthenticationService; use kamu_accounts::*; +use kamu_core::TenancyConfig; use messaging_outbox::DummyOutboxImpl; use mockall::predicate::{eq, function}; use opendatafabric::{DatasetAlias, DatasetHandle, DatasetKind, DatasetName, DatasetRef}; @@ -227,13 +229,14 @@ impl ServerHarness { .bind::() .add_value(dataset_action_authorizer) .bind::() + .add_value(TenancyConfig::SingleTenant) .add_builder( DatasetRepositoryLocalFs::builder() - .with_multi_tenant(false) .with_root(datasets_dir), ) .bind::() .bind::() + .add::() .add::() 
.add::(); diff --git a/src/adapter/http/tests/tests/test_dataset_info.rs b/src/adapter/http/tests/tests/test_dataset_info.rs index cd1a66ff8a..6a8b6e36f8 100644 --- a/src/adapter/http/tests/tests/test_dataset_info.rs +++ b/src/adapter/http/tests/tests/test_dataset_info.rs @@ -10,6 +10,7 @@ use chrono::Utc; use kamu::testing::MetadataFactory; use kamu_accounts::DUMMY_ACCESS_TOKEN; +use kamu_core::TenancyConfig; use opendatafabric::{DatasetAlias, DatasetID, DatasetKind, DatasetName}; use serde_json::json; @@ -19,7 +20,7 @@ use crate::harness::*; #[test_log::test(tokio::test)] async fn test_get_dataset_info_by_id() { - let harness = DatasetInfoHarness::new(false).await; + let harness = DatasetInfoHarness::new(TenancyConfig::SingleTenant).await; let dataset_alias = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); let create_result = harness @@ -67,7 +68,7 @@ async fn test_get_dataset_info_by_id() { #[test_log::test(tokio::test)] async fn test_get_dataset_info_by_id_not_found_err() { - let harness = DatasetInfoHarness::new(false).await; + let harness = DatasetInfoHarness::new(TenancyConfig::SingleTenant).await; let client = async move { let cl = reqwest::Client::new(); @@ -100,9 +101,9 @@ struct DatasetInfoHarness { } impl DatasetInfoHarness { - async fn new(is_multi_tenant: bool) -> Self { + async fn new(tenancy_config: TenancyConfig) -> Self { let server_harness = ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: is_multi_tenant, + tenancy_config, authorized_writes: true, base_catalog: None, }) diff --git a/src/adapter/http/tests/tests/test_node_info.rs b/src/adapter/http/tests/tests/test_node_info.rs index 894bda0b0b..368f4171bf 100644 --- a/src/adapter/http/tests/tests/test_node_info.rs +++ b/src/adapter/http/tests/tests/test_node_info.rs @@ -7,6 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
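
Editor's note (illustrative sketch, not part of the patch): the pull/push scenario fixtures further below replace `DatasetRepository::resolve_dataset_ref` / `find_dataset_by_ref` with the new `DatasetRegistry` facade (`resolve_dataset_handle_by_ref`, `get_dataset_by_ref`, `get_dataset_by_handle`). The toy trait below only mirrors the shape of those calls as they appear in this diff; the type names and signatures are assumptions, not the real `kamu_core` API:

    // Toy stand-ins so the sketch is self-contained; real types live in kamu_core/odf.
    struct DatasetHandle;
    struct ResolvedDataset;
    struct DatasetRef;
    struct GetDatasetError;

    #[async_trait::async_trait]
    trait DatasetRegistry {
        // Resolves a reference (alias/ID) to a handle, replacing resolve_dataset_ref().
        async fn resolve_dataset_handle_by_ref(
            &self,
            dataset_ref: &DatasetRef,
        ) -> Result<DatasetHandle, GetDatasetError>;

        // Wraps the dataset together with its identity, replacing get_dataset_by_handle()
        // returning a bare Arc<dyn Dataset>.
        fn get_dataset_by_handle(&self, handle: &DatasetHandle) -> ResolvedDataset;
    }

    // Typical scenario flow in the tests below: resolve the handle first, then work
    // with the ResolvedDataset (e.g. commit events through its metadata chain).
    async fn extend_dataset(
        registry: &dyn DatasetRegistry,
        dataset_ref: &DatasetRef,
    ) -> Result<ResolvedDataset, GetDatasetError> {
        let handle = registry.resolve_dataset_handle_by_ref(dataset_ref).await?;
        Ok(registry.get_dataset_by_handle(&handle))
    }

Where a scenario only needs the dataset and not its handle, the hunks below call `get_dataset_by_ref()` directly, which combines both steps.
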
+use kamu_core::TenancyConfig; use serde_json::json; use crate::harness::*; @@ -15,7 +16,7 @@ use crate::harness::*; #[test_log::test(tokio::test)] async fn test_node_info_single_tenant() { - let harness = NodeInfoHarness::new(false).await; + let harness = NodeInfoHarness::new(TenancyConfig::SingleTenant).await; let client = async move { let cl = reqwest::Client::new(); @@ -41,7 +42,7 @@ async fn test_node_info_single_tenant() { #[test_log::test(tokio::test)] async fn test_node_info_multi_tenant() { - let harness = NodeInfoHarness::new(true).await; + let harness = NodeInfoHarness::new(TenancyConfig::MultiTenant).await; let client = async move { let cl = reqwest::Client::new(); @@ -73,9 +74,9 @@ struct NodeInfoHarness { } impl NodeInfoHarness { - async fn new(is_multi_tenant: bool) -> Self { + async fn new(tenancy_config: TenancyConfig) -> Self { let server_harness = ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: is_multi_tenant, + tenancy_config, authorized_writes: true, base_catalog: None, }) diff --git a/src/adapter/http/tests/tests/test_platform_login_validate.rs b/src/adapter/http/tests/tests/test_platform_login_validate.rs index 6353de8d5e..7a16df9dd0 100644 --- a/src/adapter/http/tests/tests/test_platform_login_validate.rs +++ b/src/adapter/http/tests/tests/test_platform_login_validate.rs @@ -22,6 +22,7 @@ use kamu_accounts_services::{ PredefinedAccountsRegistrator, }; use kamu_adapter_http::{LoginRequestBody, LoginResponseBody}; +use kamu_core::TenancyConfig; use opendatafabric::AccountName; use serde_json::json; use time_source::{SystemTimeSource, SystemTimeSourceStub}; @@ -85,7 +86,7 @@ impl Harness { let addr = SocketAddr::from(([127, 0, 0, 1], 0)); let listener = tokio::net::TcpListener::bind(addr).await.unwrap(); - let api_server = TestAPIServer::new(catalog, listener, true); + let api_server = TestAPIServer::new(catalog, listener, TenancyConfig::MultiTenant); Self { run_info_dir, diff --git a/src/adapter/http/tests/tests/test_protocol_dataset_helpers.rs b/src/adapter/http/tests/tests/test_protocol_dataset_helpers.rs index 4aba91785d..57a72f1b28 100644 --- a/src/adapter/http/tests/tests/test_protocol_dataset_helpers.rs +++ b/src/adapter/http/tests/tests/test_protocol_dataset_helpers.rs @@ -18,6 +18,7 @@ use kamu_accounts::DUMMY_ACCESS_TOKEN; use kamu_adapter_http::smart_protocol::messages::{self, SMART_TRANSFER_PROTOCOL_VERSION}; use kamu_adapter_http::smart_protocol::protocol_dataset_helper::*; use kamu_adapter_http::{BearerHeader, OdfSmtpVersion}; +use kamu_core::TenancyConfig; use opendatafabric::{DatasetID, DatasetKind, Multihash}; use url::Url; @@ -35,7 +36,7 @@ use crate::harness::{ #[test_log::test(tokio::test)] async fn test_object_url_local_fs() { let server_harness = ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: false, + tenancy_config: TenancyConfig::SingleTenant, authorized_writes: true, base_catalog: None, }) @@ -254,7 +255,7 @@ async fn test_object_url_local_fs() { #[test_log::test(tokio::test)] async fn test_pull_object_url_s3() { let server_harness = ServerSideS3Harness::new(ServerSideHarnessOptions { - multi_tenant: false, + tenancy_config: TenancyConfig::SingleTenant, authorized_writes: true, base_catalog: None, }) @@ -490,7 +491,7 @@ async fn create_test_case(server_harness: &dyn ServerSideHarness) -> TestCase { .unwrap(); let commit_result = commit_add_data_event( - server_harness.cli_dataset_repository().as_ref(), + server_harness.cli_dataset_registry().as_ref(), &make_dataset_ref(&None, "foo"), 
&server_harness.dataset_layout(&create_result.dataset_handle), None, diff --git a/src/adapter/http/tests/tests/test_routing.rs b/src/adapter/http/tests/tests/test_routing.rs index 7366010eeb..4d06f36f26 100644 --- a/src/adapter/http/tests/tests/test_routing.rs +++ b/src/adapter/http/tests/tests/test_routing.rs @@ -12,6 +12,7 @@ use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use ::serde::Deserialize; use axum::extract::{FromRequestParts, Path}; +use database_common::{DatabaseTransactionRunner, NoOpDatabasePlugin}; use dill::Component; use kamu::domain::*; use kamu::testing::*; @@ -38,21 +39,23 @@ async fn setup_repo() -> RepoFixture { let datasets_dir = tmp_dir.path().join("datasets"); std::fs::create_dir(&datasets_dir).unwrap(); - let catalog = dill::CatalogBuilder::new() - .add::() + let mut b = dill::CatalogBuilder::new(); + b.add::() .add::() .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add_value(CurrentAccountSubject::new_test()) .add::() .add::() - .build(); + .add::(); + + NoOpDatabasePlugin::init_database_components(&mut b); + + let catalog = b.build(); let create_dataset_from_snapshot = catalog .get_one::() diff --git a/src/adapter/http/tests/tests/test_upload_local.rs b/src/adapter/http/tests/tests/test_upload_local.rs index 92bd9cc283..548ac32f88 100644 --- a/src/adapter/http/tests/tests/test_upload_local.rs +++ b/src/adapter/http/tests/tests/test_upload_local.rs @@ -35,7 +35,7 @@ use kamu_adapter_http::{ UploadToken, UploadTokenBase64Json, }; -use kamu_core::MediaType; +use kamu_core::{MediaType, TenancyConfig}; use opendatafabric::{AccountID, AccountName}; use serde_json::json; use time_source::SystemTimeSourceDefault; @@ -97,7 +97,7 @@ impl Harness { let authentication_service = catalog.get_one::().unwrap(); - let api_server = TestAPIServer::new(catalog, listener, true); + let api_server = TestAPIServer::new(catalog, listener, TenancyConfig::MultiTenant); Self { _tempdir: tempdir, diff --git a/src/adapter/http/tests/tests/test_upload_s3.rs b/src/adapter/http/tests/tests/test_upload_s3.rs index e124ae53b5..ed4cb998c2 100644 --- a/src/adapter/http/tests/tests/test_upload_s3.rs +++ b/src/adapter/http/tests/tests/test_upload_s3.rs @@ -26,6 +26,7 @@ use kamu_accounts_services::{ PredefinedAccountsRegistrator, }; use kamu_adapter_http::{FileUploadLimitConfig, UploadContext, UploadService, UploadServiceS3}; +use kamu_core::TenancyConfig; use opendatafabric::AccountID; use serde_json::json; use time_source::SystemTimeSourceDefault; @@ -80,7 +81,7 @@ impl Harness { let authentication_service = catalog.get_one::().unwrap(); - let api_server = TestAPIServer::new(catalog, listener, true); + let api_server = TestAPIServer::new(catalog, listener, TenancyConfig::MultiTenant); Self { _s3: s3, diff --git a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_aborted_read_of_existing_evolved_dataset_reread_succeeds.rs b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_aborted_read_of_existing_evolved_dataset_reread_succeeds.rs index 6808c6037b..bd905d93eb 100644 --- a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_aborted_read_of_existing_evolved_dataset_reread_succeeds.rs +++ b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_aborted_read_of_existing_evolved_dataset_reread_succeeds.rs @@ -82,11 +82,11 @@ impl // Extend server-side 
dataset with new nodes - let server_repo = server_harness.cli_dataset_repository(); + let server_repo = server_harness.cli_dataset_registry(); let server_dataset_ref = make_dataset_ref(&server_account_name, "foo"); let server_dataset_handle = server_repo - .resolve_dataset_ref(&server_dataset_ref) + .resolve_dataset_handle_by_ref(&server_dataset_ref) .await .unwrap(); diff --git a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_aborted_read_of_new_reread_succeeds.rs b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_aborted_read_of_new_reread_succeeds.rs index f5f941ccf0..131e228d47 100644 --- a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_aborted_read_of_new_reread_succeeds.rs +++ b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_aborted_read_of_new_reread_succeeds.rs @@ -65,7 +65,7 @@ impl server_harness.dataset_layout(&server_create_result.dataset_handle); let server_commit_result = commit_add_data_event( - server_harness.cli_dataset_repository().as_ref(), + server_harness.cli_dataset_registry().as_ref(), &make_dataset_ref(&server_account_name, "foo"), &server_dataset_layout, None, diff --git a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_advanced_dataset_fails.rs b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_advanced_dataset_fails.rs index 9ae05d2df8..d30d527ddd 100644 --- a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_advanced_dataset_fails.rs +++ b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_advanced_dataset_fails.rs @@ -73,9 +73,9 @@ impl .await; // Extend client-side dataset with new node - let client_repo = client_harness.dataset_repository(); - client_repo - .find_dataset_by_ref(&make_dataset_ref(&client_account_name, "foo")) + let client_registry = client_harness.dataset_registry(); + client_registry + .get_dataset_by_ref(&make_dataset_ref(&client_account_name, "foo")) .await .unwrap() .commit_event( diff --git a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_diverged_dataset.rs b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_diverged_dataset.rs index abc4089f3f..771a68bf47 100644 --- a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_diverged_dataset.rs +++ b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_diverged_dataset.rs @@ -69,7 +69,7 @@ impl SmartPullExistingDivergedDatasetScenario for _ in 0..3 { commit_result = Some( commit_add_data_event( - server_harness.cli_dataset_repository().as_ref(), + server_harness.cli_dataset_registry().as_ref(), &server_dataset_ref, &server_dataset_layout, commit_result.map(|r| r.new_head), @@ -91,13 +91,13 @@ impl SmartPullExistingDivergedDatasetScenario ) .await; + let server_dataset = server_harness + .cli_dataset_registry() + .get_dataset_by_handle(&server_create_result.dataset_handle); + let compaction_service = server_harness.cli_compaction_service(); let server_compaction_result = compaction_service - .compact_dataset( - &server_create_result.dataset_handle, - CompactionOptions::default(), - None, - ) + .compact_dataset(server_dataset, CompactionOptions::default(), None) .await .unwrap(); diff --git a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_evolved_dataset.rs b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_evolved_dataset.rs index 84c0482544..b87686b6e3 100644 --- a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_evolved_dataset.rs +++ 
b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_evolved_dataset.rs @@ -76,11 +76,11 @@ impl SmartPullExistingEvolvedDatasetScenario< .await; // Extend server-side dataset with new nodes - let server_repo = server_harness.cli_dataset_repository(); + let server_repo = server_harness.cli_dataset_registry(); let server_dataset_ref = make_dataset_ref(&server_account_name, "foo"); let server_dataset_handle = server_repo - .resolve_dataset_ref(&server_dataset_ref) + .resolve_dataset_handle_by_ref(&server_dataset_ref) .await .unwrap(); diff --git a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_up_to_date_dataset.rs b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_up_to_date_dataset.rs index cb82450514..adf253158e 100644 --- a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_up_to_date_dataset.rs +++ b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_up_to_date_dataset.rs @@ -60,7 +60,7 @@ impl SmartPullExistingUpToDateDatasetScenario server_harness.dataset_layout(&server_create_result.dataset_handle); commit_add_data_event( - server_harness.cli_dataset_repository().as_ref(), + server_harness.cli_dataset_registry().as_ref(), &make_dataset_ref(&server_account_name, "foo"), &server_dataset_layout, None, diff --git a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_new_dataset.rs b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_new_dataset.rs index 2de4f2ee28..eb345aa467 100644 --- a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_new_dataset.rs +++ b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_new_dataset.rs @@ -60,7 +60,7 @@ impl SmartPullNewDatasetScenario SmartPushAbortedWriteOfNewWriteSucceeds< let client_dataset_ref = make_dataset_ref(&client_account_name, "foo"); let client_commit_result = commit_add_data_event( - client_harness.dataset_repository().as_ref(), + client_harness.dataset_registry().as_ref(), &client_dataset_ref, &client_dataset_layout, None, diff --git a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_aborted_write_of_updated_rewrite_succeeds.rs b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_aborted_write_of_updated_rewrite_succeeds.rs index 51b8b4f958..6063001267 100644 --- a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_aborted_write_of_updated_rewrite_succeeds.rs +++ b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_aborted_write_of_updated_rewrite_succeeds.rs @@ -81,12 +81,12 @@ impl ) .await; - let client_repo = client_harness.dataset_repository(); + let client_registry = client_harness.dataset_registry(); // Extend client-side dataset with new nodes let client_dataset_ref = make_dataset_ref(&client_account_name, "foo"); - let client_dataset_handle = client_repo - .resolve_dataset_ref(&client_dataset_ref) + let client_dataset_handle = client_registry + .resolve_dataset_handle_by_ref(&client_dataset_ref) .await .unwrap(); client_harness @@ -104,7 +104,7 @@ impl .unwrap(); let client_commit_result = commit_add_data_event( - client_repo.as_ref(), + client_registry.as_ref(), &client_dataset_ref, &client_dataset_layout, None, diff --git a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_dataset_fails_as_server_advanced.rs b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_dataset_fails_as_server_advanced.rs index e119dd9701..eedbeb1839 100644 --- 
a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_dataset_fails_as_server_advanced.rs +++ b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_dataset_fails_as_server_advanced.rs @@ -40,7 +40,7 @@ impl let client_account_name = client_harness.operating_account_name(); let server_account_name = server_harness.operating_account_name(); - let server_repo = server_harness.cli_dataset_repository(); + let server_repo = server_harness.cli_dataset_registry(); let client_create_result = client_harness .create_dataset_from_snapshot() @@ -82,7 +82,7 @@ impl // Extend server-side dataset with new node server_repo - .find_dataset_by_ref(&make_dataset_ref(&server_account_name, "foo")) + .get_dataset_by_ref(&make_dataset_ref(&server_account_name, "foo")) .await .unwrap() .commit_event( diff --git a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_diverged_dataset.rs b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_diverged_dataset.rs index 979181ba32..4010d4ec2a 100644 --- a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_diverged_dataset.rs +++ b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_diverged_dataset.rs @@ -69,7 +69,7 @@ impl SmartPushExistingDivergedDatasetScenario for _ in 0..3 { commit_result = Some( commit_add_data_event( - client_harness.dataset_repository().as_ref(), + client_harness.dataset_registry().as_ref(), &client_dataset_ref, &client_dataset_layout, commit_result.map(|r| r.new_head), @@ -93,14 +93,14 @@ impl SmartPushExistingDivergedDatasetScenario ) .await; + let client_dataset = client_harness + .dataset_registry() + .get_dataset_by_handle(&client_create_result.dataset_handle); + // Compact at client side let compaction_service = client_harness.compaction_service(); let client_compaction_result = compaction_service - .compact_dataset( - &client_create_result.dataset_handle, - CompactionOptions::default(), - None, - ) + .compact_dataset(client_dataset, CompactionOptions::default(), None) .await .unwrap(); diff --git a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_evolved_dataset.rs b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_evolved_dataset.rs index fe3bd35d92..8569e192b0 100644 --- a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_evolved_dataset.rs +++ b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_evolved_dataset.rs @@ -79,10 +79,10 @@ impl SmartPushExistingEvolvedDatasetScenario< .await; // Extend client-side dataset with new nodes - let client_repo = client_harness.dataset_repository(); + let client_registry = client_harness.dataset_registry(); let client_dataset_ref = make_dataset_ref(&client_account_name, "foo"); - client_repo - .find_dataset_by_ref(&client_dataset_ref) + client_registry + .get_dataset_by_ref(&client_dataset_ref) .await .unwrap() .commit_event( @@ -97,7 +97,7 @@ impl SmartPushExistingEvolvedDatasetScenario< .unwrap(); let client_commit_result = commit_add_data_event( - client_repo.as_ref(), + client_registry.as_ref(), &client_dataset_ref, &client_dataset_layout, None, diff --git a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_ref_collision.rs b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_ref_collision.rs index ff5b283aa9..fc37675f41 100644 --- a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_ref_collision.rs +++ 
b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_ref_collision.rs @@ -65,7 +65,7 @@ impl SmartPushExistingRefCollisionScenarion SmartPushExistingUpToDateDatasetScenario let client_dataset_ref: DatasetRef = make_dataset_ref(&client_account_name, "foo"); commit_add_data_event( - client_harness.dataset_repository().as_ref(), + client_harness.dataset_registry().as_ref(), &client_dataset_ref, &client_dataset_layout, None, diff --git a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_new_dataset.rs b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_new_dataset.rs index bc941cc4eb..b70e109273 100644 --- a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_new_dataset.rs +++ b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_new_dataset.rs @@ -69,7 +69,7 @@ impl SmartPushNewDatasetScenario SmartPushNewDatasetViaRepoRefScenario panic!(), + Err(e) => assert_matches!( + e, + PushError::SyncError(SyncError::Access(AccessError::Unauthorized(_))), + ), + } }; await_client_server_flow!(api_server_handle, client_handle); @@ -75,11 +75,11 @@ async fn test_smart_push_new_dataset_unauthenticated() { async fn test_smart_push_new_dataset_wrong_user() { let scenario = SmartPushNewDatasetScenario::prepare( ClientSideHarness::new(ClientSideHarnessOptions { - multi_tenant: false, + tenancy_config: TenancyConfig::SingleTenant, authenticated_remotely: true, }), ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authorized_writes: true, base_catalog: None, }) @@ -108,13 +108,13 @@ async fn test_smart_push_new_dataset_wrong_user() { .await; let dataset_result = &push_result.first().unwrap().result; - - assert_matches!( - dataset_result, - Err(PushError::SyncError(SyncError::Access( - AccessError::Forbidden(_) - ))) - ); + match dataset_result { + Ok(_) => panic!(), + Err(e) => assert_matches!( + e, + PushError::SyncError(SyncError::Access(AccessError::Forbidden(_))) + ), + } }; await_client_server_flow!(api_server_handle, client_handle); @@ -126,11 +126,11 @@ async fn test_smart_push_new_dataset_wrong_user() { async fn test_smart_push_existing_dataset_unauthenticated() { let scenario = SmartPushExistingEvolvedDatasetScenario::prepare( ClientSideHarness::new(ClientSideHarnessOptions { - multi_tenant: false, + tenancy_config: TenancyConfig::SingleTenant, authenticated_remotely: false, }), ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authorized_writes: false, base_catalog: None, }) @@ -152,13 +152,13 @@ async fn test_smart_push_existing_dataset_unauthenticated() { .await; let dataset_result = &push_result.first().unwrap().result; - - assert_matches!( - dataset_result, - Err(PushError::SyncError(SyncError::Access( - AccessError::Unauthorized(_) - ))) - ); + match dataset_result { + Ok(_) => panic!(), + Err(e) => assert_matches!( + e, + PushError::SyncError(SyncError::Access(AccessError::Unauthorized(_))) + ), + } }; await_client_server_flow!(api_server_handle, client_handle); @@ -170,11 +170,11 @@ async fn test_smart_push_existing_dataset_unauthenticated() { async fn test_smart_push_existing_dataset_unauthorized() { let scenario = SmartPushExistingEvolvedDatasetScenario::prepare( ClientSideHarness::new(ClientSideHarnessOptions { - multi_tenant: false, + tenancy_config: TenancyConfig::SingleTenant, authenticated_remotely: true, }), ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: true, + 
tenancy_config: TenancyConfig::MultiTenant, authorized_writes: false, base_catalog: None, }) @@ -196,13 +196,13 @@ async fn test_smart_push_existing_dataset_unauthorized() { .await; let dataset_result = &push_result.first().unwrap().result; - - assert_matches!( - dataset_result, - Err(PushError::SyncError(SyncError::Access( - AccessError::Forbidden(_) - ))) - ); + match dataset_result { + Ok(_) => panic!(), + Err(e) => assert_matches!( + e, + PushError::SyncError(SyncError::Access(AccessError::Forbidden(_))) + ), + } }; await_client_server_flow!(api_server_handle, client_handle); @@ -214,11 +214,11 @@ async fn test_smart_push_existing_dataset_unauthorized() { async fn test_smart_push_existing_ref_collision() { let scenario = SmartPushExistingRefCollisionScenarion::prepare( ClientSideHarness::new(ClientSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authenticated_remotely: true, }), ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authorized_writes: true, base_catalog: None, }) @@ -240,11 +240,10 @@ async fn test_smart_push_existing_ref_collision() { .await; let dataset_result = &push_result.first().unwrap().result; - - assert_matches!( - dataset_result, - Err(PushError::SyncError(SyncError::RefCollision(_))) - ); + match dataset_result { + Ok(_) => panic!(), + Err(e) => assert_matches!(e, PushError::SyncError(SyncError::RefCollision(_))), + } }; await_client_server_flow!(api_server_handle, client_handle); @@ -256,11 +255,11 @@ async fn test_smart_push_existing_ref_collision() { async fn test_smart_push_incompatible_version_err() { let scenario = SmartPushExistingRefCollisionScenarion::prepare( ClientSideHarness::new(ClientSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authenticated_remotely: true, }), ServerSideLocalFsHarness::new(ServerSideHarnessOptions { - multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, authorized_writes: true, base_catalog: None, }) diff --git a/src/adapter/odata/src/context.rs b/src/adapter/odata/src/context.rs index 81ef2a7012..bdf38673ca 100644 --- a/src/adapter/odata/src/context.rs +++ b/src/adapter/odata/src/context.rs @@ -18,6 +18,7 @@ use std::sync::Arc; +use auth::{DatasetAction, DatasetActionAuthorizer}; use axum::async_trait; use chrono::{DateTime, Utc}; use datafusion::arrow::datatypes::{Schema, SchemaRef}; @@ -58,7 +59,6 @@ impl ODataServiceContext { } } -// TODO: Authorization checks #[async_trait] impl ServiceContext for ODataServiceContext { fn service_base_url(&self) -> String { @@ -68,19 +68,28 @@ impl ServiceContext for ODataServiceContext { async fn list_collections(&self) -> Result>, ODataError> { use futures::TryStreamExt; - let repo: Arc = self.catalog.get_one().unwrap(); + let registry: Arc = self.catalog.get_one().unwrap(); + let authorizer: Arc = self.catalog.get_one().unwrap(); - let datasets = if let Some(account_name) = &self.account_name { - repo.get_datasets_by_owner(account_name) + let dataset_handles = if let Some(account_name) = &self.account_name { + registry.all_dataset_handles_by_owner(account_name) } else { - repo.get_all_datasets() + registry.all_dataset_handles() }; - let datasets: Vec<_> = datasets.try_collect().await.unwrap(); + let dataset_handles: Vec<_> = dataset_handles + .try_collect() + .await + .map_err(ODataError::internal)?; + + let dataset_handles = authorizer + .filter_datasets_allowing(dataset_handles, DatasetAction::Read) + .await + 
.map_err(ODataError::internal)?; let mut collections: Vec> = Vec::new(); - for dataset_handle in datasets { - let dataset = repo.get_dataset_by_handle(&dataset_handle); + for dataset_handle in dataset_handles { + let resolved_dataset = registry.get_dataset_by_handle(&dataset_handle); collections.push(Arc::new(ODataCollectionContext { catalog: self.catalog.clone(), @@ -88,8 +97,7 @@ impl ServiceContext for ODataServiceContext { name: dataset_handle.alias.dataset_name.to_string(), key: None, }, - dataset_handle, - dataset, + resolved_dataset, service_base_url: self.service_base_url.clone(), })); } @@ -107,8 +115,7 @@ impl ServiceContext for ODataServiceContext { pub(crate) struct ODataCollectionContext { catalog: Catalog, addr: CollectionAddr, - dataset_handle: DatasetHandle, - dataset: Arc, + resolved_dataset: ResolvedDataset, service_base_url: String, } @@ -116,8 +123,7 @@ impl ODataCollectionContext { pub(crate) fn new( catalog: Catalog, addr: CollectionAddr, - dataset_handle: DatasetHandle, - dataset: Arc, + resolved_dataset: ResolvedDataset, ) -> Self { let config = catalog.get_one::().unwrap(); let service_base_url = config.protocols.odata_base_url(); @@ -125,8 +131,7 @@ impl ODataCollectionContext { Self { catalog, addr, - dataset_handle, - dataset, + resolved_dataset, service_base_url, } } @@ -155,14 +160,14 @@ impl CollectionContext for ODataCollectionContext { } fn collection_name(&self) -> Result { - Ok(self.dataset_handle.alias.dataset_name.to_string()) + Ok(self.resolved_dataset.get_alias().dataset_name.to_string()) } async fn last_updated_time(&self) -> DateTime { use futures::TryStreamExt; let (_, last_block) = self - .dataset + .resolved_dataset .as_metadata_chain() .iter_blocks() .try_next() @@ -178,7 +183,7 @@ impl CollectionContext for ODataCollectionContext { // See: https://github.com/kamu-data/kamu-cli/issues/306 let set_data_schema = self - .dataset + .resolved_dataset .as_metadata_chain() .iter_blocks() .filter_map_ok(|(_, b)| b.event.into_variant::()) @@ -206,7 +211,7 @@ impl CollectionContext for ODataCollectionContext { .unwrap_or(DEFAULT_RECORDS_PER_PAGE); let vocab: DatasetVocabulary = self - .dataset + .resolved_dataset .as_metadata_chain() .accept_one(SearchSetVocabVisitor::new()) .await @@ -218,7 +223,7 @@ impl CollectionContext for ODataCollectionContext { let query_svc: Arc = self.catalog.get_one().unwrap(); let df = query_svc - .get_data(&self.dataset_handle.as_local_ref()) + .get_data(&self.resolved_dataset.get_handle().as_local_ref()) .await .unwrap(); diff --git a/src/adapter/odata/src/handler.rs b/src/adapter/odata/src/handler.rs index 90a1335f8a..ce45a51a56 100644 --- a/src/adapter/odata/src/handler.rs +++ b/src/adapter/odata/src/handler.rs @@ -218,10 +218,10 @@ pub async fn odata_collection_handler_common( return Err(ApiError::not_found_without_body()); }; - let repo: Arc = catalog.get_one().unwrap(); + let registry: Arc = catalog.get_one().unwrap(); - let dataset_handle = match repo - .resolve_dataset_ref(&DatasetAlias::new(account_name, dataset_name).into()) + let dataset_handle = match registry + .resolve_dataset_handle_by_ref(&DatasetAlias::new(account_name, dataset_name).into()) .await { Ok(hdl) => Ok(hdl), @@ -232,9 +232,9 @@ pub async fn odata_collection_handler_common( } .unwrap(); - let dataset = repo.get_dataset_by_handle(&dataset_handle); + let resolved_dataset = registry.get_dataset_by_handle(&dataset_handle); - let ctx = ODataCollectionContext::new(catalog, addr, dataset_handle, dataset); + let ctx = ODataCollectionContext::new(catalog, 
addr, resolved_dataset); let response = datafusion_odata::handlers::odata_collection_handler( Extension(Arc::new(ctx)), query, diff --git a/src/adapter/odata/tests/tests/test_api_server.rs b/src/adapter/odata/tests/tests/test_api_server.rs index 9ebe3ea644..523e50c631 100644 --- a/src/adapter/odata/tests/tests/test_api_server.rs +++ b/src/adapter/odata/tests/tests/test_api_server.rs @@ -10,6 +10,7 @@ use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use dill::Catalog; +use kamu_core::TenancyConfig; use utoipa_axum::router::OpenApiRouter; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -24,15 +25,14 @@ impl TestAPIServer { catalog: Catalog, address: Option, port: Option, - multi_tenant: bool, + tenancy_config: TenancyConfig, ) -> Self { let (router, _api) = OpenApiRouter::new() .nest( "/odata", - if multi_tenant { - kamu_adapter_odata::router_multi_tenant() - } else { - kamu_adapter_odata::router_single_tenant() + match tenancy_config { + TenancyConfig::MultiTenant => kamu_adapter_odata::router_multi_tenant(), + TenancyConfig::SingleTenant => kamu_adapter_odata::router_single_tenant(), }, ) .layer( diff --git a/src/adapter/odata/tests/tests/test_handlers.rs b/src/adapter/odata/tests/tests/test_handlers.rs index 9440cbfe4a..da77b232b7 100644 --- a/src/adapter/odata/tests/tests/test_handlers.rs +++ b/src/adapter/odata/tests/tests/test_handlers.rs @@ -346,13 +346,11 @@ impl TestHarness { .add_value(CurrentAccountSubject::new_test()) .add_value(dataset_action_authorizer) .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .add_value(SystemTimeSourceStub::new_set( Utc.with_ymd_and_hms(2050, 1, 1, 12, 0, 0).unwrap(), @@ -370,7 +368,8 @@ impl TestHarness { let push_ingest_svc = catalog.get_one::().unwrap(); - let api_server = TestAPIServer::new(catalog.clone(), None, None, false).await; + let api_server = + TestAPIServer::new(catalog.clone(), None, None, TenancyConfig::SingleTenant).await; Self { temp_dir, @@ -432,7 +431,7 @@ impl TestHarness { self.push_ingest_svc .ingest_from_url( - &ds.dataset_handle.as_local_ref(), + ResolvedDataset::from(&ds), None, url::Url::from_file_path(&src_path).unwrap(), PushIngestOpts::default(), diff --git a/src/app/cli/src/app.rs b/src/app/cli/src/app.rs index bfed1ce7df..29b4f97036 100644 --- a/src/app/cli/src/app.rs +++ b/src/app/cli/src/app.rs @@ -25,7 +25,7 @@ use kamu_adapter_http::{FileUploadLimitConfig, UploadServiceLocal}; use kamu_adapter_oauth::GithubAuthenticationConfig; use kamu_auth_rebac_services::{MultiTenantRebacDatasetLifecycleMessageConsumer, RebacServiceImpl}; use kamu_datasets::DatasetEnvVar; -use kamu_datasets_services::{DatasetEntryIndexer, DatasetEntryService}; +use kamu_datasets_services::{DatasetEntryIndexer, DatasetEntryServiceImpl}; use kamu_flow_system_inmem::domain::{FlowConfigurationUpdatedMessage, FlowProgressMessage}; use kamu_flow_system_services::{ MESSAGE_PRODUCER_KAMU_FLOW_CONFIGURATION_SERVICE, @@ -97,23 +97,24 @@ pub async fn run(workspace_layout: WorkspaceLayout, args: cli::Cli) -> Result<() ); let workspace_version = workspace_svc.workspace_version()?; - let is_multi_tenant_workspace = workspace_svc.is_multi_tenant_workspace(); + let tenancy_config = if workspace_svc.is_multi_tenant_workspace() { + TenancyConfig::MultiTenant + } 
else { + TenancyConfig::SingleTenant + }; + let config = load_config(&workspace_layout); let current_account = AccountService::current_account_indication( args.account.clone(), - is_multi_tenant_workspace, + tenancy_config, config.users.as_ref().unwrap(), ); prepare_run_dir(&workspace_layout.run_info_dir); let is_init_command = maybe_init_command.is_some(); - let app_database_config = get_app_database_config( - &workspace_layout, - &config, - is_multi_tenant_workspace, - is_init_command, - ); + let app_database_config = + get_app_database_config(&workspace_layout, &config, tenancy_config, is_init_command); let (database_config, maybe_temp_database_path) = app_database_config.into_inner(); let maybe_db_connection_settings = database_config .as_ref() @@ -123,13 +124,13 @@ pub async fn run(workspace_layout: WorkspaceLayout, args: cli::Cli) -> Result<() let (guards, base_catalog, cli_catalog, maybe_server_catalog, output_config) = { let dependencies_graph_repository = prepare_dependencies_graph_repository( &workspace_layout, - is_multi_tenant_workspace, + tenancy_config, current_account.to_current_account_subject(), ); let mut base_catalog_builder = configure_base_catalog( &workspace_layout, - is_multi_tenant_workspace, + tenancy_config, args.system_time.map(Into::into), args.e2e_output_data_path.is_some(), ); @@ -169,11 +170,7 @@ pub async fn run(workspace_layout: WorkspaceLayout, args: cli::Cli) -> Result<() "Initializing {BINARY_NAME}" ); - register_config_in_catalog( - &config, - &mut base_catalog_builder, - is_multi_tenant_workspace, - ); + register_config_in_catalog(&config, &mut base_catalog_builder, tenancy_config); let base_catalog = base_catalog_builder.build(); @@ -199,7 +196,7 @@ pub async fn run(workspace_layout: WorkspaceLayout, args: cli::Cli) -> Result<() let cli_catalog = configure_cli_catalog( maybe_server_catalog.as_ref().unwrap_or(&final_base_catalog), - is_multi_tenant_workspace, + tenancy_config, ) .add_value(current_account.to_current_account_subject()) .build(); @@ -243,7 +240,7 @@ pub async fn run(workspace_layout: WorkspaceLayout, args: cli::Cli) -> Result<() if command.needs_workspace() && is_workspace_upgrade_needed { Err(CLIError::usage_error_from(WorkspaceUpgradeRequired))?; } - if current_account.is_explicit() && !is_multi_tenant_workspace { + if current_account.is_explicit() && tenancy_config == TenancyConfig::SingleTenant { Err(CLIError::usage_error_from(NotInMultiTenantWorkspace))?; } @@ -354,7 +351,7 @@ where pub fn prepare_dependencies_graph_repository( workspace_layout: &WorkspaceLayout, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, current_account_subject: CurrentAccountSubject, ) -> DependencyGraphRepositoryInMemory { // Construct a special catalog just to create 1 object, but with a repository @@ -362,10 +359,9 @@ pub fn prepare_dependencies_graph_repository( let special_catalog_for_graph = CatalogBuilder::new() .add::() + .add_value(tenancy_config) .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(workspace_layout.datasets_dir.clone()) - .with_multi_tenant(multi_tenant_workspace), + DatasetRepositoryLocalFs::builder().with_root(workspace_layout.datasets_dir.clone()), ) .bind::() .bind::() @@ -383,7 +379,7 @@ pub fn prepare_dependencies_graph_repository( // Public only for tests pub fn configure_base_catalog( workspace_layout: &WorkspaceLayout, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, system_time: Option>, is_e2e_testing: bool, ) -> CatalogBuilder { @@ -398,6 +394,8 @@ pub fn 
configure_base_catalog( b.add::(); + b.add_value(tenancy_config); + if let Some(system_time) = system_time { b.add_value(SystemTimeSourceStub::new_set(system_time)); b.bind::(); @@ -406,9 +404,7 @@ pub fn configure_base_catalog( } b.add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(workspace_layout.datasets_dir.clone()) - .with_multi_tenant(multi_tenant_workspace), + DatasetRepositoryLocalFs::builder().with_root(workspace_layout.datasets_dir.clone()), ); b.bind::(); b.bind::(); @@ -431,7 +427,9 @@ pub fn configure_base_catalog( b.add::(); - b.add::(); + b.add::(); + b.add::(); + b.add::(); b.add::(); @@ -440,10 +438,13 @@ pub fn configure_base_catalog( b.add::(); b.add::(); + b.add::(); + + b.add::(); - b.add::(); + b.add::(); - b.add::(); + b.add::(); b.add::(); @@ -457,21 +458,28 @@ pub fn configure_base_catalog( b.add::(); + b.add::(); b.add::(); b.add::(); b.add::(); b.add::(); + b.add::(); b.add::(); b.add::(); b.add::(); + b.add::(); + b.add::(); b.add::(); + b.add::(); + b.add::(); + b.add::(); b.add::(); // No GitHub login possible for single-tenant workspace - if multi_tenant_workspace { + if tenancy_config == TenancyConfig::MultiTenant { if is_e2e_testing { b.add::(); } else { @@ -494,15 +502,15 @@ pub fn configure_base_catalog( b.add::(); - if multi_tenant_workspace { + if tenancy_config == TenancyConfig::MultiTenant { b.add::(); } - b.add::(); + b.add::(); b.add_builder( messaging_outbox::OutboxImmediateImpl::builder() - .with_consumer_filter(messaging_outbox::ConsumerFilter::BestEffortConsumers), + .with_consumer_filter(messaging_outbox::ConsumerFilter::ImmediateConsumers), ); b.add::(); b.add::(); @@ -521,13 +529,15 @@ pub fn configure_base_catalog( // Public only for tests pub fn configure_cli_catalog( base_catalog: &Catalog, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, ) -> CatalogBuilder { let mut b = CatalogBuilder::new_chained(base_catalog); b.add::(); b.add::(); - b.add_builder(WorkspaceService::builder().with_multi_tenant(multi_tenant_workspace)); + b.add_builder( + WorkspaceService::builder().with_multi_tenant(tenancy_config == TenancyConfig::MultiTenant), + ); b.add::(); b @@ -598,7 +608,7 @@ fn load_config(workspace_layout: &WorkspaceLayout) -> config::CLIConfig { pub fn register_config_in_catalog( config: &config::CLIConfig, catalog_builder: &mut CatalogBuilder, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, ) { let network_ns = config.engine.as_ref().unwrap().network_ns.unwrap(); @@ -712,13 +722,15 @@ pub fn register_config_in_catalog( }); catalog_builder.add_value(kamu::utils::ipfs_wrapper::IpfsClient::default()); - if multi_tenant_workspace { + if tenancy_config == TenancyConfig::MultiTenant { let mut implicit_user_config = PredefinedAccountsConfig::new(); implicit_user_config.predefined.push( AccountConfig::from_name(opendatafabric::AccountName::new_unchecked( - AccountService::default_account_name(true).as_str(), + AccountService::default_account_name(TenancyConfig::MultiTenant).as_str(), )) - .set_display_name(AccountService::default_user_name(true)), + .set_display_name(AccountService::default_user_name( + TenancyConfig::MultiTenant, + )), ); use merge::Merge; diff --git a/src/app/cli/src/cli_commands.rs b/src/app/cli/src/cli_commands.rs index 711a89d98f..d25f82c3b8 100644 --- a/src/app/cli/src/cli_commands.rs +++ b/src/app/cli/src/cli_commands.rs @@ -9,6 +9,7 @@ use clap::CommandFactory as _; use dill::Catalog; +use kamu::domain::TenancyConfig; use kamu_accounts::CurrentAccountSubject; use 
opendatafabric::*; @@ -22,6 +23,13 @@ pub fn get_command( cli_catalog: &Catalog, args: cli::Cli, ) -> Result, CLIError> { + let workspace_svc = cli_catalog.get_one::()?; + let tenancy_config = if workspace_svc.is_multi_tenant_workspace() { + TenancyConfig::MultiTenant + } else { + TenancyConfig::SingleTenant + }; + let command: Box = match args.command { cli::Command::Add(c) => Box::new(AddCommand::new( cli_catalog.get_one()?, @@ -36,9 +44,9 @@ pub fn get_command( c.stdin, c.visibility.into(), cli_catalog.get_one()?, + tenancy_config, )), cli::Command::Complete(c) => { - let workspace_svc = cli_catalog.get_one::()?; let in_workspace = workspace_svc.is_in_workspace() && !workspace_svc.is_upgrade_needed()?; @@ -122,7 +130,7 @@ pub fn get_command( cli_catalog.get_one()?, cli_catalog.get_one()?, c.exists_ok, - c.multi_tenant, + tenancy_config, )) } } @@ -150,7 +158,6 @@ pub fn get_command( )), }, cli::Command::List(c) => { - let workspace_svc = cli_catalog.get_one::()?; let user_config = cli_catalog.get_one::()?; Box::new(ListCommand::new( @@ -158,7 +165,7 @@ pub fn get_command( cli_catalog.get_one()?, accounts::AccountService::current_account_indication( args.account, - workspace_svc.is_multi_tenant_workspace(), + tenancy_config, user_config.as_ref(), ), accounts::AccountService::related_account_indication( @@ -166,6 +173,7 @@ pub fn get_command( c.all_accounts, ), cli_catalog.get_one()?, + tenancy_config, c.wide, )) } @@ -254,7 +262,7 @@ pub fn get_command( Box::new(SetWatermarkCommand::new( cli_catalog.get_one()?, cli_catalog.get_one()?, - cli_catalog.get_one()?, + tenancy_config, c.dataset.unwrap_or_default(), c.all, c.recursive, @@ -266,6 +274,7 @@ pub fn get_command( cli_catalog.get_one()?, cli_catalog.get_one()?, cli_catalog.get_one()?, + tenancy_config, c.dataset.unwrap_or_default(), cli_catalog.get_one()?, c.all, @@ -289,6 +298,7 @@ pub fn get_command( c.to, c.visibility.into(), cli_catalog.get_one()?, + tenancy_config, )), cli::Command::Rename(c) => Box::new(RenameCommand::new( cli_catalog.get_one()?, @@ -313,6 +323,7 @@ pub fn get_command( )), cli::RepoSubCommand::Alias(sc) => match sc.subcommand { cli::RepoAliasSubCommand::Add(ssc) => Box::new(AliasAddCommand::new( + cli_catalog.get_one()?, cli_catalog.get_one()?, cli_catalog.get_one()?, ssc.dataset, @@ -397,24 +408,20 @@ pub fn get_command( }, cli::Command::System(c) => match c.subcommand { cli::SystemSubCommand::ApiServer(sc) => match sc.subcommand { - None => { - let workspace_svc = cli_catalog.get_one::()?; - - Box::new(APIServerRunCommand::new( - base_catalog.clone(), - cli_catalog.clone(), - workspace_svc.is_multi_tenant_workspace(), - cli_catalog.get_one()?, - sc.address, - sc.http_port, - sc.external_address, - sc.get_token, - cli_catalog.get_one()?, - cli_catalog.get_one()?, - cli_catalog.get_one()?, - args.e2e_output_data_path, - )) - } + None => Box::new(APIServerRunCommand::new( + base_catalog.clone(), + cli_catalog.clone(), + tenancy_config, + cli_catalog.get_one()?, + sc.address, + sc.http_port, + sc.external_address, + sc.get_token, + cli_catalog.get_one()?, + cli_catalog.get_one()?, + cli_catalog.get_one()?, + args.e2e_output_data_path, + )), Some(cli::SystemApiServerSubCommand::GqlQuery(ssc)) => Box::new( APIServerGqlQueryCommand::new(base_catalog.clone(), ssc.query, ssc.full), ), @@ -463,6 +470,7 @@ pub fn get_command( )), cli::SystemSubCommand::Ipfs(sc) => match sc.subcommand { cli::SystemIpfsSubCommand::Add(ssc) => Box::new(SystemIpfsAddCommand::new( + cli_catalog.get_one()?, cli_catalog.get_one()?, ssc.dataset, 
)), @@ -479,7 +487,6 @@ pub fn get_command( cli_catalog.get_one()?, )), cli::Command::Ui(c) => { - let workspace_svc = cli_catalog.get_one::()?; let current_account_subject = cli_catalog.get_one::()?; let current_account_name = match current_account_subject.as_ref() { @@ -491,7 +498,7 @@ pub fn get_command( Box::new(UICommand::new( base_catalog.clone(), - workspace_svc.is_multi_tenant_workspace(), + tenancy_config, current_account_name, cli_catalog.get_one()?, cli_catalog.get_one()?, @@ -525,15 +532,11 @@ pub fn get_command( pub fn command_needs_transaction(args: &cli::Cli) -> bool { match &args.command { cli::Command::System(c) => match &c.subcommand { - cli::SystemSubCommand::GenerateToken(_) => true, - _ => false, + cli::SystemSubCommand::ApiServer(_) => false, + _ => true, }, - cli::Command::Add(_) - | cli::Command::Delete(_) - | cli::Command::Rename(_) - | cli::Command::Push(_) - | cli::Command::Pull(_) => true, - _ => false, + cli::Command::Ui(_) => false, + _ => true, } } diff --git a/src/app/cli/src/commands/add_command.rs b/src/app/cli/src/commands/add_command.rs index 1445c10b58..161563c8a5 100644 --- a/src/app/cli/src/commands/add_command.rs +++ b/src/app/cli/src/commands/add_command.rs @@ -21,7 +21,7 @@ use crate::{Interact, OutputConfig}; pub struct AddCommand { interact: Arc, resource_loader: Arc, - dataset_repo: Arc, + dataset_registry: Arc, create_dataset_from_snapshot: Arc, delete_dataset: Arc, snapshot_refs: Vec, @@ -31,13 +31,14 @@ pub struct AddCommand { stdin: bool, dataset_visibility: DatasetVisibility, output_config: Arc, + tenancy_config: TenancyConfig, } impl AddCommand { pub fn new( interact: Arc, resource_loader: Arc, - dataset_repo: Arc, + dataset_registry: Arc, create_dataset_from_snapshot: Arc, delete_dataset: Arc, snapshot_refs_iter: I, @@ -47,6 +48,7 @@ impl AddCommand { stdin: bool, dataset_visibility: DatasetVisibility, output_config: Arc, + tenancy_config: TenancyConfig, ) -> Self where I: IntoIterator, @@ -55,7 +57,7 @@ impl AddCommand { Self { interact, resource_loader, - dataset_repo, + dataset_registry, create_dataset_from_snapshot, delete_dataset, snapshot_refs: snapshot_refs_iter.into_iter().map(Into::into).collect(), @@ -65,6 +67,7 @@ impl AddCommand { stdin, dataset_visibility, output_config, + tenancy_config, } } @@ -229,7 +232,9 @@ impl Command for AddCommand { "Name override can be used only when adding a single manifest", )); } - if !self.dataset_repo.is_multi_tenant() && !self.dataset_visibility.is_private() { + if self.tenancy_config == TenancyConfig::SingleTenant + && !self.dataset_visibility.is_private() + { return Err(CLIError::usage_error( "Only multi-tenant workspaces support non-private dataset visibility", )); @@ -274,8 +279,8 @@ impl Command for AddCommand { let mut already_exist = Vec::new(); for s in &snapshots { if let Some(hdl) = self - .dataset_repo - .try_resolve_dataset_ref(&s.name.as_local_ref()) + .dataset_registry + .try_resolve_dataset_handle_by_ref(&s.name.as_local_ref()) .await? 
{ already_exist.push(hdl); diff --git a/src/app/cli/src/commands/alias_add_command.rs b/src/app/cli/src/commands/alias_add_command.rs index d0da043d80..c52a749534 100644 --- a/src/app/cli/src/commands/alias_add_command.rs +++ b/src/app/cli/src/commands/alias_add_command.rs @@ -15,9 +15,10 @@ use opendatafabric::*; use super::{CLIError, Command}; pub struct AliasAddCommand { + dataset_registry: Arc, remote_repo_reg: Arc, remote_alias_reg: Arc, - dataset: DatasetRef, + dataset_ref: DatasetRef, alias: DatasetRefRemote, pull: bool, push: bool, @@ -25,6 +26,7 @@ pub struct AliasAddCommand { impl AliasAddCommand { pub fn new( + dataset_registry: Arc, remote_repo_reg: Arc, remote_alias_reg: Arc, dataset: DatasetRef, @@ -33,9 +35,10 @@ impl AliasAddCommand { push: bool, ) -> Self { Self { + dataset_registry, remote_repo_reg, remote_alias_reg, - dataset, + dataset_ref: dataset, alias, pull, push, @@ -58,9 +61,15 @@ impl Command for AliasAddCommand { .map_err(CLIError::failure)?; } + let dataset_handle = self + .dataset_registry + .resolve_dataset_handle_by_ref(&self.dataset_ref) + .await + .map_err(CLIError::failure)?; + let mut aliases = self .remote_alias_reg - .get_remote_aliases(&self.dataset) + .get_remote_aliases(&dataset_handle) .await .map_err(CLIError::failure)?; diff --git a/src/app/cli/src/commands/alias_delete_command.rs b/src/app/cli/src/commands/alias_delete_command.rs index afaf8c6f9f..3cab787e92 100644 --- a/src/app/cli/src/commands/alias_delete_command.rs +++ b/src/app/cli/src/commands/alias_delete_command.rs @@ -16,10 +16,10 @@ use opendatafabric::*; use super::{CLIError, Command}; pub struct AliasDeleteCommand { - dataset_repo: Arc, + dataset_registry: Arc, remote_alias_reg: Arc, - dataset: Option, - alias: Option, + maybe_dataset_ref: Option, + maybe_alias: Option, all: bool, pull: bool, push: bool, @@ -27,19 +27,19 @@ pub struct AliasDeleteCommand { impl AliasDeleteCommand { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, remote_alias_reg: Arc, - dataset: Option, - alias: Option, + maybe_dataset_ref: Option, + maybe_alias: Option, all: bool, pull: bool, push: bool, ) -> Self { Self { - dataset_repo, + dataset_registry, remote_alias_reg, - dataset, - alias, + maybe_dataset_ref, + maybe_alias, all, pull, push, @@ -47,9 +47,15 @@ impl AliasDeleteCommand { } async fn delete_dataset_alias(&self) -> Result { + let dataset_handle = self + .dataset_registry + .resolve_dataset_handle_by_ref(self.maybe_dataset_ref.as_ref().unwrap()) + .await + .map_err(CLIError::failure)?; + let mut aliases = self .remote_alias_reg - .get_remote_aliases(self.dataset.as_ref().unwrap()) + .get_remote_aliases(&dataset_handle) .await .map_err(CLIError::failure)?; @@ -58,7 +64,7 @@ impl AliasDeleteCommand { if self.all { count += aliases.clear(RemoteAliasKind::Pull).await?; count += aliases.clear(RemoteAliasKind::Push).await?; - } else if let Some(alias) = &self.alias { + } else if let Some(alias) = &self.maybe_alias { let both = !self.pull && !self.push; if (self.pull || both) && aliases.delete(alias, RemoteAliasKind::Pull).await? { @@ -77,14 +83,9 @@ impl AliasDeleteCommand { async fn delete_all_aliases(&self) -> Result { let mut count = 0; - let mut stream = self.dataset_repo.get_all_datasets(); - while let Some(dataset_handle) = - stream.next().await.transpose().map_err(CLIError::failure)? 
- { - let mut aliases = self - .remote_alias_reg - .get_remote_aliases(&dataset_handle.into_local_ref()) - .await?; + let mut stream = self.dataset_registry.all_dataset_handles(); + while let Some(hdl) = stream.next().await.transpose().map_err(CLIError::failure)? { + let mut aliases = self.remote_alias_reg.get_remote_aliases(&hdl).await?; // --all --push - clears all push aliases only // --all --pull - clears all pull aliases only @@ -104,7 +105,7 @@ impl AliasDeleteCommand { #[async_trait::async_trait(?Send)] impl Command for AliasDeleteCommand { async fn run(&mut self) -> Result<(), CLIError> { - let count = if self.dataset.is_some() { + let count = if self.maybe_dataset_ref.is_some() { self.delete_dataset_alias().await } else if self.all { self.delete_all_aliases().await diff --git a/src/app/cli/src/commands/alias_list_command.rs b/src/app/cli/src/commands/alias_list_command.rs index 3b8114174d..6b06dd82dc 100644 --- a/src/app/cli/src/commands/alias_list_command.rs +++ b/src/app/cli/src/commands/alias_list_command.rs @@ -20,24 +20,24 @@ use super::{CLIError, Command}; use crate::output::*; pub struct AliasListCommand { - dataset_repo: Arc, + dataset_registry: Arc, remote_alias_reg: Arc, output_config: Arc, - dataset_ref: Option, + maybe_dataset_ref: Option, } impl AliasListCommand { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, remote_alias_reg: Arc, output_config: Arc, - dataset_ref: Option, + maybe_dataset_ref: Option, ) -> Self { Self { - dataset_repo, + dataset_registry, remote_alias_reg, output_config, - dataset_ref, + maybe_dataset_ref, } } @@ -49,10 +49,10 @@ impl AliasListCommand { let mut col_kind = Vec::new(); let mut col_alias = Vec::new(); - for ds in datasets { + for hdl in datasets { let aliases = self .remote_alias_reg - .get_remote_aliases(&ds.as_local_ref()) + .get_remote_aliases(hdl) .await .int_err()?; let mut pull_aliases: Vec<_> = aliases @@ -68,13 +68,13 @@ impl AliasListCommand { push_aliases.sort(); for alias in pull_aliases { - col_dataset.push(ds.alias.to_string()); + col_dataset.push(hdl.alias.to_string()); col_kind.push("Pull"); col_alias.push(alias); } for alias in push_aliases { - col_dataset.push(ds.alias.to_string()); + col_dataset.push(hdl.alias.to_string()); col_kind.push("Push"); col_alias.push(alias); } @@ -93,11 +93,17 @@ impl AliasListCommand { #[async_trait::async_trait(?Send)] impl Command for AliasListCommand { async fn run(&mut self) -> Result<(), CLIError> { - let mut datasets: Vec<_> = if let Some(dataset_ref) = &self.dataset_ref { - let hdl = self.dataset_repo.resolve_dataset_ref(dataset_ref).await?; + let mut datasets: Vec<_> = if let Some(dataset_ref) = &self.maybe_dataset_ref { + let hdl = self + .dataset_registry + .resolve_dataset_handle_by_ref(dataset_ref) + .await?; vec![hdl] } else { - self.dataset_repo.get_all_datasets().try_collect().await? + self.dataset_registry + .all_dataset_handles() + .try_collect() + .await? 
}; datasets.sort_by(|a, b| a.alias.cmp(&b.alias)); diff --git a/src/app/cli/src/commands/compact_command.rs b/src/app/cli/src/commands/compact_command.rs index 435954d956..812b50f159 100644 --- a/src/app/cli/src/commands/compact_command.rs +++ b/src/app/cli/src/commands/compact_command.rs @@ -11,12 +11,13 @@ use std::sync::Arc; use futures::TryStreamExt as _; use kamu::domain::{ + CompactDatasetUseCase, CompactionOptions, - CompactionService, - DatasetRepository, + DatasetRegistry, VerificationMultiListener, VerificationOptions, - VerificationService, + VerificationRequest, + VerifyDatasetUseCase, }; use opendatafabric::{DatasetHandle, DatasetRefPattern}; @@ -31,9 +32,9 @@ use crate::{ pub struct CompactCommand { interact: Arc, - dataset_repo: Arc, - verification_svc: Arc, - compaction_svc: Arc, + compact_dataset_use_case: Arc, + verify_dataset_use_case: Arc, + dataset_registry: Arc, dataset_ref_patterns: Vec, max_slice_size: u64, max_slice_records: u64, @@ -45,9 +46,9 @@ pub struct CompactCommand { impl CompactCommand { pub fn new( interact: Arc, - dataset_repo: Arc, - verification_svc: Arc, - compaction_svc: Arc, + compact_dataset_use_case: Arc, + verify_dataset_use_case: Arc, + dataset_registry: Arc, dataset_ref_patterns: Vec, max_slice_size: u64, max_slice_records: u64, @@ -57,9 +58,9 @@ impl CompactCommand { ) -> Self { Self { interact, - dataset_repo, - verification_svc, - compaction_svc, + compact_dataset_use_case, + verify_dataset_use_case, + dataset_registry, dataset_ref_patterns, max_slice_size, max_slice_records, @@ -77,11 +78,13 @@ impl CompactCommand { }); let result = self - .verification_svc - .verify( - &dataset_handle.as_local_ref(), - (None, None), - VerificationOptions::default(), + .verify_dataset_use_case + .execute( + VerificationRequest { + target: dataset_handle.clone(), + block_range: (None, None), + options: VerificationOptions::default(), + }, listener.begin_verify(dataset_handle), ) .await; @@ -108,7 +111,7 @@ impl Command for CompactCommand { let dataset_handles: Vec = { kamu::utils::datasets_filtering::filter_datasets_by_local_pattern( - self.dataset_repo.as_ref(), + self.dataset_registry.as_ref(), self.dataset_ref_patterns.clone(), ) .try_collect() @@ -146,9 +149,9 @@ impl Command for CompactCommand { }); let compaction_results = self - .compaction_svc - .compact_multi( - dataset_handles.into_iter().map(Into::into).collect(), + .compact_dataset_use_case + .execute_multi( + dataset_handles, CompactionOptions { max_slice_size: Some(self.max_slice_size), max_slice_records: Some(self.max_slice_records), diff --git a/src/app/cli/src/commands/complete_command.rs b/src/app/cli/src/commands/complete_command.rs index 3c66694b0d..968415a7cf 100644 --- a/src/app/cli/src/commands/complete_command.rs +++ b/src/app/cli/src/commands/complete_command.rs @@ -21,7 +21,7 @@ use super::{CLIError, Command}; use crate::config::ConfigService; pub struct CompleteCommand { - dataset_repo: Option>, + dataset_registry: Option>, remote_repo_reg: Option>, remote_alias_reg: Option>, config_service: Arc, @@ -34,7 +34,7 @@ pub struct CompleteCommand { // but we have to do this until clap supports custom completer functions impl CompleteCommand { pub fn new( - dataset_repo: Option>, + dataset_registry: Option>, remote_repo_reg: Option>, remote_alias_reg: Option>, config_service: Arc, @@ -46,7 +46,7 @@ impl CompleteCommand { S: Into, { Self { - dataset_repo, + dataset_registry, remote_repo_reg, remote_alias_reg, config_service, @@ -74,8 +74,8 @@ impl CompleteCommand { } async fn 
complete_dataset(&self, output: &mut impl Write, prefix: &str) { - if let Some(repo) = self.dataset_repo.as_ref() { - let mut datasets = repo.get_all_datasets(); + if let Some(registry) = self.dataset_registry.as_ref() { + let mut datasets = registry.all_dataset_handles(); while let Some(dataset_handle) = datasets.try_next().await.unwrap() { if dataset_handle.alias.dataset_name.starts_with(prefix) { writeln!(output, "{}", dataset_handle.alias).unwrap(); @@ -95,14 +95,11 @@ impl CompleteCommand { } async fn complete_alias(&self, output: &mut impl Write, prefix: &str) { - if let Some(repo) = self.dataset_repo.as_ref() { + if let Some(registry) = self.dataset_registry.as_ref() { if let Some(reg) = self.remote_alias_reg.as_ref() { - let mut datasets = repo.get_all_datasets(); - while let Some(dataset_handle) = datasets.try_next().await.unwrap() { - let aliases = reg - .get_remote_aliases(&dataset_handle.as_local_ref()) - .await - .unwrap(); + let mut datasets = registry.all_dataset_handles(); + while let Some(hdl) = datasets.try_next().await.unwrap() { + let aliases = reg.get_remote_aliases(&hdl).await.unwrap(); for alias in aliases.get_by_kind(RemoteAliasKind::Pull) { if alias.to_string().starts_with(prefix) { writeln!(output, "{alias}").unwrap(); diff --git a/src/app/cli/src/commands/delete_command.rs b/src/app/cli/src/commands/delete_command.rs index 14053f8d80..74a04c99c3 100644 --- a/src/app/cli/src/commands/delete_command.rs +++ b/src/app/cli/src/commands/delete_command.rs @@ -22,7 +22,7 @@ use crate::Interact; pub struct DeleteCommand { interact: Arc, - dataset_repo: Arc, + dataset_registry: Arc, delete_dataset: Arc, dataset_ref_patterns: Vec, dependency_graph_service: Arc, @@ -33,7 +33,7 @@ pub struct DeleteCommand { impl DeleteCommand { pub fn new( interact: Arc, - dataset_repo: Arc, + dataset_registry: Arc, delete_dataset: Arc, dataset_ref_patterns: I, dependency_graph_service: Arc, @@ -45,7 +45,7 @@ impl DeleteCommand { { Self { interact, - dataset_repo, + dataset_registry, delete_dataset, dataset_ref_patterns: dataset_ref_patterns.into_iter().collect(), dependency_graph_service, @@ -63,10 +63,13 @@ impl Command for DeleteCommand { } let dataset_handles: Vec = if self.all { - self.dataset_repo.get_all_datasets().try_collect().await? + self.dataset_registry + .all_dataset_handles() + .try_collect() + .await? } else { filter_datasets_by_local_pattern( - self.dataset_repo.as_ref(), + self.dataset_registry.as_ref(), self.dataset_ref_patterns.clone(), ) .try_collect() @@ -84,8 +87,8 @@ impl Command for DeleteCommand { .int_err()? .map(DatasetID::into_local_ref) .then(|hdl| { - let repo = self.dataset_repo.clone(); - async move { repo.resolve_dataset_ref(&hdl).await } + let registry = self.dataset_registry.clone(); + async move { registry.resolve_dataset_handle_by_ref(&hdl).await } }) .try_collect() .await? 
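
The command changes above (and the ones that follow) all consume the same narrow `DatasetRegistry` surface in place of `DatasetRepository`. For orientation, here is a minimal sketch of that surface as these call sites exercise it; the method names are taken from this patch, while the exact signatures, stream and error types, and the extension-trait split are assumptions made for illustration only:

    // Sketch only: signatures and bounds are assumed, not copied from the crate.
    #[async_trait::async_trait]
    pub trait DatasetRegistry: Send + Sync {
        // Enumeration, used by `list`, `delete --all`, `log`, completions, diagnostics
        fn all_dataset_handles(&self) -> DatasetHandleStream<'_>;
        fn all_dataset_handles_by_owner(&self, owner: &AccountName) -> DatasetHandleStream<'_>;

        // Strict vs. lenient resolution of a local reference into a handle
        async fn resolve_dataset_handle_by_ref(
            &self,
            dataset_ref: &DatasetRef,
        ) -> Result<DatasetHandle, GetDatasetError>;
        async fn try_resolve_dataset_handle_by_ref(
            &self,
            dataset_ref: &DatasetRef,
        ) -> Result<Option<DatasetHandle>, InternalError>;

        // Access once a handle is known; returns a ResolvedDataset
        fn get_dataset_by_handle(&self, dataset_handle: &DatasetHandle) -> ResolvedDataset;
    }

    // Convenience used by the e2e and ipfs commands: resolve + get in one call.
    #[async_trait::async_trait]
    pub trait DatasetRegistryExt: DatasetRegistry {
        async fn get_dataset_by_ref(
            &self,
            dataset_ref: &DatasetRef,
        ) -> Result<ResolvedDataset, GetDatasetError>;
    }

Usage mirrors the call sites above: commands resolve a `DatasetHandle` first (so authorization checks can run against it) and only then obtain a `ResolvedDataset` for metadata-chain or data access.
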
diff --git a/src/app/cli/src/commands/ingest_command.rs b/src/app/cli/src/commands/ingest_command.rs index 67471f9f09..a55c16cfae 100644 --- a/src/app/cli/src/commands/ingest_command.rs +++ b/src/app/cli/src/commands/ingest_command.rs @@ -25,7 +25,7 @@ use crate::OutputConfig; pub struct IngestCommand { data_format_reg: Arc, - dataset_repo: Arc, + dataset_registry: Arc, push_ingest_svc: Arc, output_config: Arc, remote_alias_reg: Arc, @@ -41,7 +41,7 @@ pub struct IngestCommand { impl IngestCommand { pub fn new( data_format_reg: Arc, - dataset_repo: Arc, + dataset_registry: Arc, push_ingest_svc: Arc, output_config: Arc, remote_alias_reg: Arc, @@ -59,7 +59,7 @@ impl IngestCommand { { Self { data_format_reg, - dataset_repo, + dataset_registry, push_ingest_svc, output_config, remote_alias_reg, @@ -87,7 +87,7 @@ impl IngestCommand { ) -> Result<(), CLIError> { let aliases = self .remote_alias_reg - .get_remote_aliases(&dataset_handle.as_local_ref()) + .get_remote_aliases(dataset_handle) .await .map_err(CLIError::failure)?; let pull_aliases: Vec<_> = aliases @@ -102,8 +102,8 @@ impl IngestCommand { ))); } - let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); - let dataset_kind = dataset + let resolved_dataset = self.dataset_registry.get_dataset_by_handle(dataset_handle); + let dataset_kind = resolved_dataset .get_summary(GetSummaryOpts::default()) .await .int_err()? @@ -152,8 +152,8 @@ impl Command for IngestCommand { } let dataset_handle = self - .dataset_repo - .resolve_dataset_ref(&self.dataset_ref) + .dataset_registry + .resolve_dataset_handle_by_ref(&self.dataset_ref) .await .map_err(CLIError::failure)?; @@ -194,7 +194,7 @@ impl Command for IngestCommand { let result = self .push_ingest_svc .ingest_from_url( - &self.dataset_ref, + self.dataset_registry.get_dataset_by_handle(&dataset_handle), self.source_name.as_deref(), url, PushIngestOpts { diff --git a/src/app/cli/src/commands/init_command.rs b/src/app/cli/src/commands/init_command.rs index 34d5a88284..5cf60d2d51 100644 --- a/src/app/cli/src/commands/init_command.rs +++ b/src/app/cli/src/commands/init_command.rs @@ -9,6 +9,8 @@ use std::sync::Arc; +use kamu::domain::TenancyConfig; + use super::{CLIError, Command}; use crate::{AlreadyInWorkspace, OutputConfig, WorkspaceLayout}; @@ -16,7 +18,7 @@ pub struct InitCommand { output_config: Arc, workspace_layout: Arc, exists_ok: bool, - multi_tenant: bool, + tenancy_config: TenancyConfig, } impl InitCommand { @@ -24,13 +26,13 @@ impl InitCommand { output_config: Arc, workspace_layout: Arc, exists_ok: bool, - multi_tenant: bool, + tenancy_config: TenancyConfig, ) -> Self { Self { output_config, workspace_layout, exists_ok, - multi_tenant, + tenancy_config, } } } @@ -53,17 +55,16 @@ impl Command for InitCommand { }; } - WorkspaceLayout::create(&self.workspace_layout.root_dir, self.multi_tenant)?; + WorkspaceLayout::create(&self.workspace_layout.root_dir, self.tenancy_config)?; // TODO, write a workspace config if !self.output_config.quiet { eprintln!( "{}", - console::style(if self.multi_tenant { - "Initialized an empty multi-tenant workspace" - } else { - "Initialized an empty workspace" + console::style(match self.tenancy_config { + TenancyConfig::MultiTenant => "Initialized an empty multi-tenant workspace", + TenancyConfig::SingleTenant => "Initialized an empty workspace", }) .green() .bold() diff --git a/src/app/cli/src/commands/inspect_lineage_command.rs b/src/app/cli/src/commands/inspect_lineage_command.rs index c1200342b7..cf5b177800 100644 --- 
a/src/app/cli/src/commands/inspect_lineage_command.rs +++ b/src/app/cli/src/commands/inspect_lineage_command.rs @@ -32,7 +32,7 @@ pub enum LineageOutputFormat { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct InspectLineageCommand { - dataset_repo: Arc, + dataset_registry: Arc, provenance_svc: Arc, workspace_layout: Arc, dataset_refs: Vec, @@ -43,7 +43,7 @@ pub struct InspectLineageCommand { impl InspectLineageCommand { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, provenance_svc: Arc, workspace_layout: Arc, dataset_refs: I, @@ -55,7 +55,7 @@ impl InspectLineageCommand { I: IntoIterator, { Self { - dataset_repo, + dataset_registry, provenance_svc, workspace_layout, dataset_refs: dataset_refs.into_iter().collect(), @@ -98,10 +98,13 @@ impl Command for InspectLineageCommand { async fn run(&mut self) -> Result<(), CLIError> { use futures::{StreamExt, TryStreamExt}; let mut dataset_handles: Vec<_> = if self.dataset_refs.is_empty() { - self.dataset_repo.get_all_datasets().try_collect().await? + self.dataset_registry + .all_dataset_handles() + .try_collect() + .await? } else { futures::stream::iter(&self.dataset_refs) - .then(|r| self.dataset_repo.resolve_dataset_ref(r)) + .then(|r| self.dataset_registry.resolve_dataset_handle_by_ref(r)) .try_collect() .await .map_err(CLIError::failure)? diff --git a/src/app/cli/src/commands/inspect_query_command.rs b/src/app/cli/src/commands/inspect_query_command.rs index d627dc863c..9c25d875e0 100644 --- a/src/app/cli/src/commands/inspect_query_command.rs +++ b/src/app/cli/src/commands/inspect_query_command.rs @@ -20,7 +20,7 @@ use super::{CLIError, Command}; use crate::{OutputConfig, WritePager}; pub struct InspectQueryCommand { - dataset_repo: Arc, + dataset_registry: Arc, dataset_action_authorizer: Arc, dataset_ref: DatasetRef, output_config: Arc, @@ -28,13 +28,13 @@ pub struct InspectQueryCommand { impl InspectQueryCommand { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, dataset_action_authorizer: Arc, dataset_ref: DatasetRef, output_config: Arc, ) -> Self { Self { - dataset_repo, + dataset_registry, dataset_action_authorizer, dataset_ref, output_config, @@ -46,9 +46,9 @@ impl InspectQueryCommand { output: &mut impl Write, dataset_handle: &DatasetHandle, ) -> Result<(), CLIError> { - let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); + let resolved_dataset = self.dataset_registry.get_dataset_by_handle(dataset_handle); - let mut blocks = dataset.as_metadata_chain().iter_blocks(); + let mut blocks = resolved_dataset.as_metadata_chain().iter_blocks(); while let Some((block_hash, block)) = blocks.try_next().await? 
{ match &block.event { MetadataEvent::SetTransform(SetTransform { inputs, transform }) => { @@ -174,8 +174,8 @@ impl InspectQueryCommand { impl Command for InspectQueryCommand { async fn run(&mut self) -> Result<(), CLIError> { let dataset_handle = self - .dataset_repo - .resolve_dataset_ref(&self.dataset_ref) + .dataset_registry + .resolve_dataset_handle_by_ref(&self.dataset_ref) .await?; self.dataset_action_authorizer diff --git a/src/app/cli/src/commands/list_command.rs b/src/app/cli/src/commands/list_command.rs index 435c06c65e..92cd8b5694 100644 --- a/src/app/cli/src/commands/list_command.rs +++ b/src/app/cli/src/commands/list_command.rs @@ -22,29 +22,32 @@ use crate::output::*; use crate::{accounts, NotInMultiTenantWorkspace}; pub struct ListCommand { - dataset_repo: Arc, + dataset_registry: Arc, remote_alias_reg: Arc, current_account: accounts::CurrentAccountIndication, related_account: accounts::RelatedAccountIndication, output_config: Arc, + tenancy_config: TenancyConfig, detail_level: u8, } impl ListCommand { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, remote_alias_reg: Arc, current_account: accounts::CurrentAccountIndication, related_account: accounts::RelatedAccountIndication, output_config: Arc, + tenancy_config: TenancyConfig, detail_level: u8, ) -> Self { Self { - dataset_repo, + dataset_registry, remote_alias_reg, current_account, related_account, output_config, + tenancy_config, detail_level, } } @@ -76,7 +79,7 @@ impl ListCommand { ) -> Result { let is_remote = self .remote_alias_reg - .get_remote_aliases(&handle.as_local_ref()) + .get_remote_aliases(handle) .await? .get_by_kind(RemoteAliasKind::Pull) .next() @@ -176,20 +179,21 @@ impl ListCommand { } fn stream_datasets(&self) -> DatasetHandleStream { - if self.dataset_repo.is_multi_tenant() { - match &self.related_account.target_account { + match self.tenancy_config { + TenancyConfig::MultiTenant => match &self.related_account.target_account { accounts::TargetAccountSelection::Current => self - .dataset_repo - .get_datasets_by_owner(&self.current_account.account_name), + .dataset_registry + .all_dataset_handles_by_owner(&self.current_account.account_name), accounts::TargetAccountSelection::Specific { account_name: user_name, - } => self - .dataset_repo - .get_datasets_by_owner(&AccountName::from_str(user_name.as_str()).unwrap()), - accounts::TargetAccountSelection::AllUsers => self.dataset_repo.get_all_datasets(), - } - } else { - self.dataset_repo.get_all_datasets() + } => self.dataset_registry.all_dataset_handles_by_owner( + &AccountName::from_str(user_name.as_str()).unwrap(), + ), + accounts::TargetAccountSelection::AllUsers => { + self.dataset_registry.all_dataset_handles() + } + }, + TenancyConfig::SingleTenant => self.dataset_registry.all_dataset_handles(), } } } @@ -206,7 +210,7 @@ impl Command for ListCommand { use datafusion::arrow::datatypes::Schema; use datafusion::arrow::record_batch::RecordBatch; - let show_owners = if self.dataset_repo.is_multi_tenant() { + let show_owners = if self.tenancy_config == TenancyConfig::MultiTenant { self.current_account.is_explicit() || self.related_account.is_explicit() } else if self.related_account.is_explicit() { return Err(CLIError::usage_error_from(NotInMultiTenantWorkspace)); @@ -240,12 +244,14 @@ impl Command for ListCommand { datasets.sort_by(|a, b| a.alias.cmp(&b.alias)); for hdl in &datasets { - let dataset = self.dataset_repo.get_dataset_by_handle(hdl); - let current_head = dataset + let resolved_dataset = 
self.dataset_registry.get_dataset_by_handle(hdl); + let current_head = resolved_dataset .as_metadata_chain() .resolve_ref(&BlockRef::Head) .await?; - let summary = dataset.get_summary(GetSummaryOpts::default()).await?; + let summary = resolved_dataset + .get_summary(GetSummaryOpts::default()) + .await?; name.push(hdl.alias.dataset_name.to_string()); @@ -262,14 +268,14 @@ impl Command for ListCommand { size.push(summary.data_size); if self.detail_level > 0 { - let num_blocks = dataset + let num_blocks = resolved_dataset .as_metadata_chain() .get_block(¤t_head) .await .int_err()? .sequence_number + 1; - let last_watermark = dataset + let last_watermark = resolved_dataset .as_metadata_chain() .last_data_block() .await diff --git a/src/app/cli/src/commands/log_command.rs b/src/app/cli/src/commands/log_command.rs index ca5852b206..28ba2399e3 100644 --- a/src/app/cli/src/commands/log_command.rs +++ b/src/app/cli/src/commands/log_command.rs @@ -36,7 +36,7 @@ pub enum MetadataLogOutputFormat { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct LogCommand { - dataset_repo: Arc, + dataset_registry: Arc, dataset_action_authorizer: Arc, dataset_ref: DatasetRef, output_format: Option, @@ -47,7 +47,7 @@ pub struct LogCommand { impl LogCommand { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, dataset_action_authorizer: Arc, dataset_ref: DatasetRef, output_format: Option, @@ -56,7 +56,7 @@ impl LogCommand { output_config: Arc, ) -> Self { Self { - dataset_repo, + dataset_registry, dataset_action_authorizer, dataset_ref, output_format, @@ -94,8 +94,8 @@ impl LogCommand { impl Command for LogCommand { async fn run(&mut self) -> Result<(), CLIError> { let id_to_alias_lookup: BTreeMap<_, _> = self - .dataset_repo - .get_all_datasets() + .dataset_registry + .all_dataset_handles() .map_ok(|h| (h.id, h.alias)) .try_collect() .await?; @@ -117,8 +117,8 @@ impl Command for LogCommand { }; let dataset_handle = self - .dataset_repo - .resolve_dataset_ref(&self.dataset_ref) + .dataset_registry + .resolve_dataset_handle_by_ref(&self.dataset_ref) .await?; self.dataset_action_authorizer @@ -129,10 +129,10 @@ impl Command for LogCommand { auth::DatasetActionUnauthorizedError::Internal(e) => CLIError::critical(e), })?; - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); + let resolved_dataset = self.dataset_registry.get_dataset_by_handle(&dataset_handle); let blocks = Box::pin( - dataset + resolved_dataset .as_metadata_chain() .iter_blocks() .filter_ok(|(_, b)| self.filter_block(b)), diff --git a/src/app/cli/src/commands/pull_command.rs b/src/app/cli/src/commands/pull_command.rs index be3be6eaa7..6ef44342a5 100644 --- a/src/app/cli/src/commands/pull_command.rs +++ b/src/app/cli/src/commands/pull_command.rs @@ -7,6 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
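// Note on the `TenancyConfig` value threaded through the commands above and below: it
// replaces the various `is_multi_tenant` / `multi_tenant_workspace` booleans. A minimal
// sketch of the enum; only the two variants and the by-value, `==`-comparable usage are
// taken from this patch, the derives are assumptions:
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TenancyConfig {
    SingleTenant,
    MultiTenant,
}
// Call sites either `match` on it (e.g. picking single- vs multi-tenant routing or the
// default account name) or compare, e.g. `tenancy_config == TenancyConfig::MultiTenant`.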
+use std::borrow::Cow; use std::collections::HashMap; use std::sync::{Arc, Mutex}; use std::time::Duration; @@ -26,10 +27,11 @@ use crate::output::OutputConfig; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct PullCommand { - pull_svc: Arc, - dataset_repo: Arc, + pull_dataset_use_case: Arc, + dataset_registry: Arc, search_svc: Arc, output_config: Arc, + tenancy_config: TenancyConfig, refs: Vec, current_account_subject: Arc, all: bool, @@ -43,10 +45,11 @@ pub struct PullCommand { impl PullCommand { pub fn new( - pull_svc: Arc, - dataset_repo: Arc, + pull_dataset_use_case: Arc, + dataset_registry: Arc, search_svc: Arc, output_config: Arc, + tenancy_config: TenancyConfig, refs: I, current_account_subject: Arc, all: bool, @@ -61,10 +64,11 @@ impl PullCommand { I: IntoIterator, { Self { - pull_svc, - dataset_repo, + pull_dataset_use_case, + dataset_registry, search_svc, output_config, + tenancy_config, refs: refs.into_iter().collect(), current_account_subject, all, @@ -93,20 +97,28 @@ impl PullCommand { } }; - Ok(self - .pull_svc - .pull_multi_ext( - vec![PullRequest { - local_ref: Some(local_name.into()), - remote_ref: Some(remote_ref), - }], - PullMultiOptions { + self.pull_dataset_use_case + .execute_multi( + vec![PullRequest::remote( + remote_ref, + Some(DatasetAlias::new( + match self.tenancy_config { + TenancyConfig::MultiTenant => { + Some(self.current_account_subject.account_name().clone()) + } + TenancyConfig::SingleTenant => None, + }, + local_name.clone(), + )), + )], + PullOptions { add_aliases: self.add_aliases, ..Default::default() }, listener, ) - .await?) + .await + .map_err(CLIError::failure) } async fn pull_multi( @@ -114,12 +126,13 @@ impl PullCommand { listener: Option>, current_account_name: &AccountName, ) -> Result, CLIError> { - let dataset_refs: Vec<_> = if !self.all { + let dataset_any_refs: Vec<_> = if !self.all { filter_datasets_by_any_pattern( - self.dataset_repo.as_ref(), + self.dataset_registry.as_ref(), self.search_svc.clone(), self.refs.clone(), current_account_name, + self.tenancy_config, ) .try_collect() .await? @@ -127,34 +140,48 @@ impl PullCommand { vec![] }; - Ok(self - .pull_svc - .pull_multi( - dataset_refs, - PullMultiOptions { - recursive: self.recursive, - all: self.all, - add_aliases: self.add_aliases, - reset_derivatives_on_diverged_input: self.reset_derivatives_on_diverged_input, - ingest_options: PollingIngestOptions { - fetch_uncacheable: self.fetch_uncacheable, - exhaust_sources: true, - dataset_env_vars: HashMap::new(), - schema_inference: SchemaInferenceOpts::default(), - }, - sync_options: SyncOptions { - force: self.force, - ..SyncOptions::default() - }, - }, - listener, - ) - .await?) 
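// For context on the request model used just below: `PullRequest` is now built through
// constructors rather than a struct literal. Illustration only; the constructor names and
// the closure argument come from the call sites in this patch, the exact signatures are
// assumptions:
#[allow(dead_code)]
fn example_pull_requests(
    remote_ref: DatasetRefRemote,
    local_alias: DatasetAlias,
    any_ref: DatasetRefAny,
    single_tenant_workspace: bool,
) -> (PullRequest, PullRequest) {
    // Explicit "sync this remote into that local alias" request (the `sync_from` path).
    let from_remote = PullRequest::remote(remote_ref, Some(local_alias));
    // Generic reference: the closure decides whether an ambiguous ref is treated as local.
    let from_any = PullRequest::from_any_ref(&any_ref, |_| single_tenant_workspace);
    (from_remote, from_any)
}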
+ let options = PullOptions { + recursive: self.recursive, + add_aliases: self.add_aliases, + ingest_options: PollingIngestOptions { + fetch_uncacheable: self.fetch_uncacheable, + exhaust_sources: true, + dataset_env_vars: HashMap::new(), + schema_inference: SchemaInferenceOpts::default(), + }, + transform_options: TransformOptions { + reset_derivatives_on_diverged_input: self.reset_derivatives_on_diverged_input, + }, + sync_options: SyncOptions { + force: self.force, + ..SyncOptions::default() + }, + }; + + if self.all { + self.pull_dataset_use_case + .execute_all_owned(options, listener) + .await + .map_err(CLIError::failure) + } else { + let requests = dataset_any_refs + .into_iter() + .map(|r| { + PullRequest::from_any_ref(&r, |_| { + self.tenancy_config == TenancyConfig::SingleTenant + }) + }) + .collect(); + + self.pull_dataset_use_case + .execute_multi(requests, options, listener) + .await + .map_err(CLIError::failure) + } } async fn pull_with_progress(&self) -> Result, CLIError> { - let pull_progress = - PrettyPullProgress::new(self.fetch_uncacheable, self.dataset_repo.is_multi_tenant()); + let pull_progress = PrettyPullProgress::new(self.fetch_uncacheable, self.tenancy_config); let listener = Arc::new(pull_progress.clone()); self.pull(Some(listener)).await } @@ -163,30 +190,23 @@ impl PullCommand { &self, listener: Option>, ) -> Result, CLIError> { - let current_account_name = match self.current_account_subject.as_ref() { - CurrentAccountSubject::Anonymous(_) => { - return Err(CLIError::usage_error( - "Anonymous account misused, use multi-tenant alias", - )) - } - CurrentAccountSubject::Logged(l) => &l.account_name, - }; if self.as_name.is_some() { self.sync_from(listener).await } else { + let current_account_name = self.current_account_subject.account_name(); self.pull_multi(listener, current_account_name).await } } fn describe_response(&self, pr: &PullResponse) -> String { - let local_ref = pr.local_ref.as_ref().or(pr - .original_request + let local_ref = pr.maybe_local_ref.as_ref().map(Cow::Borrowed).or(pr + .maybe_original_request .as_ref() - .and_then(|r| r.local_ref.as_ref())); - let remote_ref = pr.remote_ref.as_ref().or(pr - .original_request + .and_then(PullRequest::local_ref)); + let remote_ref = pr.maybe_remote_ref.as_ref().or(pr + .maybe_original_request .as_ref() - .and_then(|r| r.remote_ref.as_ref())); + .and_then(|r| r.remote_ref())); match (local_ref, remote_ref) { (Some(local_ref), Some(remote_ref)) => { format!("sync {local_ref} from {remote_ref}") @@ -278,15 +298,15 @@ impl Command for PullCommand { struct PrettyPullProgress { multi_progress: Arc, fetch_uncacheable: bool, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, } impl PrettyPullProgress { - fn new(fetch_uncacheable: bool, multi_tenant_workspace: bool) -> Self { + fn new(fetch_uncacheable: bool, tenancy_config: TenancyConfig) -> Self { Self { multi_progress: Arc::new(indicatif::MultiProgress::new()), fetch_uncacheable, - multi_tenant_workspace, + tenancy_config, } } } @@ -337,7 +357,7 @@ impl SyncMultiListener for PrettyPullProgress { dst: &DatasetRefAny, ) -> Option> { Some(Arc::new(PrettySyncProgress::new( - dst.as_local_ref(|_| !self.multi_tenant_workspace) + dst.as_local_ref(|_| self.tenancy_config == TenancyConfig::SingleTenant) .expect("Expected local ref"), src.as_remote_ref(|_| true).expect("Expected remote ref"), self.multi_progress.clone(), @@ -693,14 +713,25 @@ impl TransformListener for PrettyTransformProgress { .finish_with_message(Self::spinner_message(&self.dataset_handle, 
0, msg)); } - fn error(&self, _error: &TransformError) { + fn elaborate_error(&self, _error: &TransformElaborateError) { + self.curr_progress + .lock() + .unwrap() + .finish_with_message(Self::spinner_message( + &self.dataset_handle, + 0, + console::style("Failed to update derivative dataset (elaborate phase)").red(), + )); + } + + fn execute_error(&self, _error: &TransformExecuteError) { self.curr_progress .lock() .unwrap() .finish_with_message(Self::spinner_message( &self.dataset_handle, 0, - console::style("Failed to update derivative dataset").red(), + console::style("Failed to update derivative dataset (execute phase)").red(), )); } diff --git a/src/app/cli/src/commands/push_command.rs b/src/app/cli/src/commands/push_command.rs index 33231f4e4f..4d2e475105 100644 --- a/src/app/cli/src/commands/push_command.rs +++ b/src/app/cli/src/commands/push_command.rs @@ -24,8 +24,8 @@ use crate::output::OutputConfig; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct PushCommand { - push_svc: Arc, - dataset_repo: Arc, + push_dataset_use_case: Arc, + dataset_registry: Arc, refs: Vec, all: bool, recursive: bool, @@ -34,12 +34,13 @@ pub struct PushCommand { to: Option, dataset_visibility: DatasetVisibility, output_config: Arc, + tenancy_config: TenancyConfig, } impl PushCommand { pub fn new( - push_svc: Arc, - dataset_repo: Arc, + push_dataset_use_case: Arc, + dataset_registry: Arc, refs: I, all: bool, recursive: bool, @@ -48,13 +49,14 @@ impl PushCommand { to: Option, dataset_visibility: DatasetVisibility, output_config: Arc, + tenancy_config: TenancyConfig, ) -> Self where I: IntoIterator, { Self { - push_svc, - dataset_repo, + push_dataset_use_case, + dataset_registry, refs: refs.into_iter().collect(), all, recursive, @@ -63,6 +65,7 @@ impl PushCommand { to, dataset_visibility, output_config, + tenancy_config, } } @@ -71,15 +74,42 @@ impl PushCommand { listener: Option>, ) -> Result, CLIError> { let dataset_refs: Vec<_> = - filter_datasets_by_local_pattern(self.dataset_repo.as_ref(), self.refs.clone()) + filter_datasets_by_local_pattern(self.dataset_registry.as_ref(), self.refs.clone()) .map_ok(|dataset_handle| dataset_handle.as_local_ref()) .try_collect() .await?; - Ok(self - .push_svc - .push_multi( - dataset_refs, + let mut dataset_handles = Vec::new(); + let mut error_responses = Vec::new(); + // TODO: batch resolution + for dataset_ref in &dataset_refs { + match self + .dataset_registry + .resolve_dataset_handle_by_ref(dataset_ref) + .await + { + Ok(hdl) => dataset_handles.push(hdl), + Err(e) => { + let push_error = match e { + GetDatasetError::NotFound(e) => PushError::SourceNotFound(e), + GetDatasetError::Internal(e) => PushError::Internal(e), + }; + error_responses.push(PushResponse { + local_handle: None, + target: None, + result: Err(push_error), + }); + } + } + } + + if !error_responses.is_empty() { + return Ok(error_responses); + } + + self.push_dataset_use_case + .execute_multi( + dataset_handles, PushMultiOptions { all: self.all, recursive: self.recursive, @@ -89,7 +119,8 @@ impl PushCommand { }, listener, ) - .await) + .await + .map_err(CLIError::failure) } fn sync_options(&self) -> SyncOptions { @@ -101,7 +132,7 @@ impl PushCommand { } async fn push_with_progress(&self) -> Result, CLIError> { - let progress = PrettyPushProgress::new(self.dataset_repo.is_multi_tenant()); + let progress = PrettyPushProgress::new(self.tenancy_config); let listener = Arc::new(progress.clone()); 
self.do_push(Some(listener.clone())).await } @@ -139,7 +170,7 @@ impl Command for PushCommand { for res in &push_results { match &res.result { - Ok(r) => match r { + Ok(sync_result) => match sync_result { SyncResult::UpToDate => up_to_date += 1, SyncResult::Updated { .. } => updated += 1, }, @@ -196,14 +227,14 @@ impl Command for PushCommand { #[derive(Clone)] struct PrettyPushProgress { pub multi_progress: Arc, - pub multi_tenant_workspace: bool, + pub tenancy_config: TenancyConfig, } impl PrettyPushProgress { - fn new(multi_tenant_workspace: bool) -> Self { + fn new(tenancy_config: TenancyConfig) -> Self { Self { multi_progress: Arc::new(indicatif::MultiProgress::new()), - multi_tenant_workspace, + tenancy_config, } } } @@ -215,7 +246,7 @@ impl SyncMultiListener for PrettyPushProgress { dst: &DatasetRefAny, ) -> Option> { Some(Arc::new(PrettySyncProgress::new( - src.as_local_ref(|_| !self.multi_tenant_workspace) + src.as_local_ref(|_| self.tenancy_config == TenancyConfig::SingleTenant) .expect("Expected local ref"), dst.as_remote_ref(|_| true).expect("Expected remote ref"), self.multi_progress.clone(), diff --git a/src/app/cli/src/commands/reset_command.rs b/src/app/cli/src/commands/reset_command.rs index 32f69a4df1..84d6029404 100644 --- a/src/app/cli/src/commands/reset_command.rs +++ b/src/app/cli/src/commands/reset_command.rs @@ -19,8 +19,8 @@ use crate::Interact; pub struct ResetCommand { interact: Arc, - dataset_repo: Arc, - reset_svc: Arc, + dataset_registry: Arc, + reset_dataset_use_case: Arc, dataset_ref: DatasetRef, block_hash: Multihash, } @@ -28,15 +28,15 @@ pub struct ResetCommand { impl ResetCommand { pub fn new( interact: Arc, - dataset_repo: Arc, - reset_svc: Arc, + dataset_registry: Arc, + reset_dataset_use_case: Arc, dataset_ref: DatasetRef, block_hash: Multihash, ) -> Self { Self { interact, - dataset_repo, - reset_svc, + dataset_registry, + reset_dataset_use_case, dataset_ref, block_hash, } @@ -47,8 +47,8 @@ impl ResetCommand { impl Command for ResetCommand { async fn run(&mut self) -> Result<(), CLIError> { let dataset_handle = self - .dataset_repo - .resolve_dataset_ref(&self.dataset_ref) + .dataset_registry + .resolve_dataset_handle_by_ref(&self.dataset_ref) .await?; self.interact.require_confirmation(format!( @@ -58,8 +58,8 @@ impl Command for ResetCommand { console::style("This operation is irreversible!").yellow(), ))?; - self.reset_svc - .reset_dataset(&dataset_handle, Some(&self.block_hash), None) + self.reset_dataset_use_case + .execute(&dataset_handle, Some(&self.block_hash), None) .await .map_err(CLIError::failure)?; diff --git a/src/app/cli/src/commands/set_watermark_command.rs b/src/app/cli/src/commands/set_watermark_command.rs index d122bd57d0..2e7f8256e7 100644 --- a/src/app/cli/src/commands/set_watermark_command.rs +++ b/src/app/cli/src/commands/set_watermark_command.rs @@ -16,9 +16,9 @@ use opendatafabric::*; use super::{CLIError, Command}; pub struct SetWatermarkCommand { - dataset_repo: Arc, - remote_alias_reg: Arc, - pull_svc: Arc, + dataset_registry: Arc, + set_watermark_use_case: Arc, + tenancy_config: TenancyConfig, refs: Vec, all: bool, recursive: bool, @@ -27,9 +27,9 @@ pub struct SetWatermarkCommand { impl SetWatermarkCommand { pub fn new( - dataset_repo: Arc, - remote_alias_reg: Arc, - pull_svc: Arc, + dataset_registry: Arc, + set_watermark_use_case: Arc, + tenancy_config: TenancyConfig, refs: I, all: bool, recursive: bool, @@ -40,9 +40,9 @@ impl SetWatermarkCommand { I: IntoIterator, { Self { - dataset_repo, - remote_alias_reg, - pull_svc, + 
dataset_registry, + set_watermark_use_case, + tenancy_config, refs: refs.into_iter().collect(), all, recursive, @@ -80,38 +80,25 @@ impl Command for SetWatermarkCommand { let dataset_ref = self.refs[0] .as_dataset_ref_any() .unwrap() - .as_local_ref(|_| !self.dataset_repo.is_multi_tenant()) + .as_local_ref(|_| self.tenancy_config == TenancyConfig::SingleTenant) .map_err(|_| CLIError::usage_error("Expected a local dataset reference"))?; - let aliases = self - .remote_alias_reg - .get_remote_aliases(&dataset_ref) + let dataset_handle = self + .dataset_registry + .resolve_dataset_handle_by_ref(&dataset_ref) .await - .map_err(CLIError::failure)?; - let pull_aliases: Vec<_> = aliases - .get_by_kind(RemoteAliasKind::Pull) - .map(ToString::to_string) - .collect(); - - if !pull_aliases.is_empty() { - // TODO: Should this check be performed at domain model level? - return Err(CLIError::usage_error(format!( - "Setting watermark on a remote dataset will cause histories to diverge. Existing \ - pull aliases:\n{}", - pull_aliases.join("\n- ") - ))); - } + .map_err(CLIError::critical)?; match self - .pull_svc - .set_watermark(&dataset_ref, watermark.into()) + .set_watermark_use_case + .execute(&dataset_handle, watermark.into()) .await { - Ok(PullResult::UpToDate(_)) => { + Ok(SetWatermarkResult::UpToDate) => { eprintln!("{}", console::style("Watermark was up-to-date").yellow()); Ok(()) } - Ok(PullResult::Updated { new_head, .. }) => { + Ok(SetWatermarkResult::Updated { new_head, .. }) => { eprintln!( "{}", console::style(format!( @@ -125,7 +112,6 @@ impl Command for SetWatermarkCommand { Err( e @ (SetWatermarkError::IsDerivative | SetWatermarkError::IsRemote - | SetWatermarkError::NotFound(_) | SetWatermarkError::Access(_)), ) => Err(CLIError::failure(e)), Err(e @ SetWatermarkError::Internal(_)) => Err(CLIError::critical(e)), diff --git a/src/app/cli/src/commands/system_api_server_run_command.rs b/src/app/cli/src/commands/system_api_server_run_command.rs index 459f57be9d..a8abb052a7 100644 --- a/src/app/cli/src/commands/system_api_server_run_command.rs +++ b/src/app/cli/src/commands/system_api_server_run_command.rs @@ -15,6 +15,7 @@ use console::style as s; use database_common::DatabaseTransactionRunner; use dill::Catalog; use internal_error::ResultIntoInternal; +use kamu::domain::TenancyConfig; use kamu_accounts::*; use kamu_accounts_services::PasswordLoginCredentials; use kamu_adapter_oauth::*; @@ -28,7 +29,7 @@ use crate::OutputConfig; pub struct APIServerRunCommand { base_catalog: Catalog, cli_catalog: Catalog, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, output_config: Arc, address: Option, port: Option, @@ -44,7 +45,7 @@ impl APIServerRunCommand { pub fn new( base_catalog: Catalog, cli_catalog: Catalog, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, output_config: Arc, address: Option, port: Option, @@ -58,7 +59,7 @@ impl APIServerRunCommand { Self { base_catalog, cli_catalog, - multi_tenant_workspace, + tenancy_config, output_config, address, port, @@ -113,7 +114,7 @@ impl APIServerRunCommand { #[async_trait::async_trait(?Send)] impl Command for APIServerRunCommand { async fn validate_args(&self) -> Result<(), CLIError> { - if self.multi_tenant_workspace { + if self.tenancy_config == TenancyConfig::MultiTenant { if self.github_auth_config.client_id.is_empty() { return Err(CLIError::missed_env_var(ENV_VAR_KAMU_AUTH_GITHUB_CLIENT_ID)); } @@ -134,7 +135,7 @@ impl Command for APIServerRunCommand { let api_server = crate::explore::APIServer::new( 
&self.base_catalog, &self.cli_catalog, - self.multi_tenant_workspace, + self.tenancy_config, self.address, self.port, self.external_address, diff --git a/src/app/cli/src/commands/system_diagnose_command.rs b/src/app/cli/src/commands/system_diagnose_command.rs index 790f17fbc2..f4a812fbb9 100644 --- a/src/app/cli/src/commands/system_diagnose_command.rs +++ b/src/app/cli/src/commands/system_diagnose_command.rs @@ -19,11 +19,11 @@ use futures::TryStreamExt; use internal_error::{InternalError, ResultIntoInternal}; use kamu::domain::engine::normalize_logs; use kamu::domain::{ - DatasetRepository, + DatasetRegistry, OwnedFile, VerificationOptions, VerificationRequest, - VerificationService, + VerifyDatasetUseCase, }; use kamu::utils::docker_images::BUSYBOX; use random_names::get_random_name; @@ -40,22 +40,22 @@ const FAILED_MESSAGE: &str = "failed"; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct SystemDiagnoseCommand { - dataset_repo: Arc, - verification_svc: Arc, + dataset_registry: Arc, + verify_dataset_use_case: Arc, container_runtime: Arc, workspace_svc: Arc, } impl SystemDiagnoseCommand { pub fn new( - dataset_repo: Arc, - verification_svc: Arc, + dataset_registry: Arc, + verify_dataset_use_case: Arc, container_runtime: Arc, workspace_svc: Arc, ) -> Self { Self { - dataset_repo, - verification_svc, + dataset_registry, + verify_dataset_use_case, container_runtime, workspace_svc, } @@ -98,8 +98,8 @@ impl Command for SystemDiagnoseCommand { // Add checks which required workspace initialization if self.workspace_svc.is_in_workspace() { diagnostic_checks.push(Box::new(CheckWorkspaceConsistent { - dataset_repo: self.dataset_repo.clone(), - verification_svc: self.verification_svc.clone(), + dataset_registry: self.dataset_registry.clone(), + verify_dataset_use_case: self.verify_dataset_use_case.clone(), })); } @@ -330,8 +330,8 @@ impl DiagnosticCheck for CheckContainerRuntimeVolumeMount { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct CheckWorkspaceConsistent { - dataset_repo: Arc, - verification_svc: Arc, + dataset_registry: Arc, + verify_dataset_use_case: Arc, } #[async_trait::async_trait] @@ -352,29 +352,28 @@ impl DiagnosticCheck for CheckWorkspaceConsistent { progress_cloned.draw(); }); - let verification_requests: Vec<_> = self - .dataset_repo - .get_all_datasets() - .map_ok(|hdl| VerificationRequest { - dataset_ref: hdl.as_local_ref(), - block_range: (None, None), - }) + let dataset_handles: Vec<_> = self + .dataset_registry + .all_dataset_handles() .try_collect() .await?; - let verify_options = VerificationOptions { - check_integrity: true, - check_logical_hashes: false, - replay_transformations: false, - }; + let mut verification_tasks = Vec::new(); + for dataset_handle in dataset_handles { + verification_tasks.push(VerificationRequest { + target: dataset_handle, + block_range: (None, None), + options: VerificationOptions { + check_integrity: true, + check_logical_hashes: false, + replay_transformations: false, + }, + }); + } let results = self - .verification_svc - .verify_multi( - verification_requests, - verify_options.clone(), - Some(progress.clone()), - ) + .verify_dataset_use_case + .execute_multi(verification_tasks, Some(progress.clone())) .await; for result in results { diff --git a/src/app/cli/src/commands/system_e2e_command.rs b/src/app/cli/src/commands/system_e2e_command.rs index 180f6484fa..b14a3f0bdc 100644 --- 
a/src/app/cli/src/commands/system_e2e_command.rs +++ b/src/app/cli/src/commands/system_e2e_command.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use internal_error::ResultIntoInternal; -use kamu::domain::{DatasetRepository, MetadataChainExt}; +use kamu::domain::{DatasetRegistry, DatasetRegistryExt, MetadataChainExt}; use opendatafabric::DatasetRef; use super::{CLIError, Command}; @@ -20,14 +20,14 @@ use super::{CLIError, Command}; pub struct SystemE2ECommand { action: String, dataset_ref: Option, - dataset_repo: Arc, + dataset_registry: Arc, } impl SystemE2ECommand { pub fn new( action: S, dataset_ref: Option, - dataset_repo: Arc, + dataset_registry: Arc, ) -> Self where S: Into, @@ -35,7 +35,7 @@ impl SystemE2ECommand { Self { action: action.into(), dataset_ref, - dataset_repo, + dataset_registry, } } } @@ -49,9 +49,12 @@ impl Command for SystemE2ECommand { return Err(CLIError::usage_error("dataset required")); }; - let dataset = self.dataset_repo.find_dataset_by_ref(dataset_ref).await?; + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(dataset_ref) + .await?; - let maybe_physical_hash = dataset + let maybe_physical_hash = resolved_dataset .as_metadata_chain() .last_data_block_with_new_data() .await? @@ -63,7 +66,7 @@ impl Command for SystemE2ECommand { return Err(CLIError::usage_error("DataSlice not found")); }; - let internal_url = dataset + let internal_url = resolved_dataset .as_data_repo() .get_internal_url(&physical_hash) .await; diff --git a/src/app/cli/src/commands/system_ipfs_add_command.rs b/src/app/cli/src/commands/system_ipfs_add_command.rs index a982565e90..5e71b1c8f8 100644 --- a/src/app/cli/src/commands/system_ipfs_add_command.rs +++ b/src/app/cli/src/commands/system_ipfs_add_command.rs @@ -19,13 +19,19 @@ use super::{CLIError, Command}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct SystemIpfsAddCommand { + dataset_registry: Arc, sync_svc: Arc, dataset_ref: DatasetRef, } impl SystemIpfsAddCommand { - pub fn new(sync_svc: Arc, dataset_ref: DatasetRef) -> Self { + pub fn new( + dataset_registry: Arc, + sync_svc: Arc, + dataset_ref: DatasetRef, + ) -> Self { Self { + dataset_registry, sync_svc, dataset_ref, } @@ -35,9 +41,15 @@ impl SystemIpfsAddCommand { #[async_trait::async_trait(?Send)] impl Command for SystemIpfsAddCommand { async fn run(&mut self) -> Result<(), CLIError> { + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(&self.dataset_ref) + .await + .map_err(CLIError::failure)?; + let cid = self .sync_svc - .ipfs_add(&self.dataset_ref) + .ipfs_add(resolved_dataset) .await .map_err(CLIError::failure)?; diff --git a/src/app/cli/src/commands/ui_command.rs b/src/app/cli/src/commands/ui_command.rs index 3aa56dc67e..c369be37aa 100644 --- a/src/app/cli/src/commands/ui_command.rs +++ b/src/app/cli/src/commands/ui_command.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use console::style as s; use dill::Catalog; use internal_error::ResultIntoInternal; +use kamu::domain::TenancyConfig; use kamu_accounts::PredefinedAccountsConfig; use kamu_adapter_http::FileUploadLimitConfig; use kamu_datasets::DatasetEnvVarsConfig; @@ -29,7 +30,7 @@ use crate::OutputConfig; pub struct UICommand { server_catalog: Catalog, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, current_account_name: AccountName, predefined_accounts_config: Arc, file_upload_limit_config: Arc, @@ -43,7 +44,7 @@ pub struct UICommand { impl UICommand { pub fn new( server_catalog: Catalog, - 
multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, current_account_name: AccountName, predefined_accounts_config: Arc, file_upload_limit_config: Arc, @@ -55,7 +56,7 @@ impl UICommand { ) -> Self { Self { server_catalog, - multi_tenant_workspace, + tenancy_config, current_account_name, predefined_accounts_config, file_upload_limit_config, @@ -74,7 +75,7 @@ impl Command for UICommand { async fn run(&mut self) -> Result<(), CLIError> { let web_server = crate::explore::WebUIServer::new( self.server_catalog.clone(), - self.multi_tenant_workspace, + self.tenancy_config, self.current_account_name.clone(), self.predefined_accounts_config.clone(), self.file_upload_limit_config.clone(), diff --git a/src/app/cli/src/commands/verify_command.rs b/src/app/cli/src/commands/verify_command.rs index ac3d257789..7b0d140d7c 100644 --- a/src/app/cli/src/commands/verify_command.rs +++ b/src/app/cli/src/commands/verify_command.rs @@ -24,8 +24,8 @@ type GenericVerificationResult = Result, CLIError>; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct VerifyCommand { - dataset_repo: Arc, - verification_svc: Arc, + verify_dataset_use_case: Arc, + dataset_registry: Arc, dependency_graph_service: Arc, remote_alias_reg: Arc, output_config: Arc, @@ -41,8 +41,8 @@ struct RemoteRefDependency { impl VerifyCommand { pub fn new( - dataset_repo: Arc, - verification_svc: Arc, + verify_dataset_use_case: Arc, + dataset_registry: Arc, dependency_graph_service: Arc, remote_alias_reg: Arc, output_config: Arc, @@ -54,8 +54,8 @@ impl VerifyCommand { I: Iterator, { Self { - dataset_repo, - verification_svc, + verify_dataset_use_case, + dataset_registry, dependency_graph_service, remote_alias_reg, output_config, @@ -87,41 +87,47 @@ impl VerifyCommand { async fn verify( &self, options: VerificationOptions, - listener: Option>, + multi_listener: Option>, ) -> GenericVerificationResult { let dataset_ref_pattern = self.refs.first().unwrap(); - let dataset_ids: Vec<_> = filter_datasets_by_local_pattern( - self.dataset_repo.as_ref(), + let dataset_handles: Vec<_> = filter_datasets_by_local_pattern( + self.dataset_registry.as_ref(), vec![dataset_ref_pattern.clone()], ) - .map_ok(|dataset_handle| dataset_handle.id) .try_collect() .await?; - let requests: Vec<_> = if self.recursive { - self.dependency_graph_service - .get_recursive_upstream_dependencies(dataset_ids) + let mut verification_results = Vec::new(); + + let dataset_handles: Vec<_> = if self.recursive { + let input_dataset_ids = dataset_handles.into_iter().map(|hdl| hdl.id).collect(); + + let all_dataset_ids = self + .dependency_graph_service + .get_recursive_upstream_dependencies(input_dataset_ids) .await .int_err()? 
- .map(|dataset_id| VerificationRequest { - dataset_ref: DatasetRef::ID(dataset_id), - block_range: (None, None), - }) .collect() + .await; + + let resolution_results = self + .dataset_registry + .resolve_multiple_dataset_handles_by_ids(all_dataset_ids) .await + .int_err()?; + + for (_, e) in resolution_results.unresolved_datasets { + verification_results.push(VerificationResult::err_no_handle(e)); + } + + resolution_results.resolved_handles } else { - dataset_ids - .iter() - .map(|dataset_id| VerificationRequest { - dataset_ref: DatasetRef::ID(dataset_id.clone()), - block_range: (None, None), - }) - .collect() + dataset_handles }; - let (filtered_requests, missed_remote_dependencies) = - self.check_remote_datasets(requests).await; + let (filtered_dataset_handles, missed_remote_dependencies) = + self.detect_remote_datasets(dataset_handles).await; if !missed_remote_dependencies.is_empty() { let missed_dependency_warnings: Vec = missed_remote_dependencies @@ -146,71 +152,78 @@ impl VerifyCommand { ); } - if filtered_requests.is_empty() { + if filtered_dataset_handles.is_empty() { return Ok(vec![]); } - Ok(self - .verification_svc - .verify_multi(filtered_requests, options, listener) - .await) + let filtered_requests = filtered_dataset_handles + .into_iter() + .map(|hdl| VerificationRequest { + target: hdl, + block_range: (None, None), + options: options.clone(), + }) + .collect(); + + let mut main_verification_results = self + .verify_dataset_use_case + .clone() + .execute_multi(filtered_requests, multi_listener) + .await; + + verification_results.append(&mut main_verification_results); + Ok(verification_results) } // Return tuple with filtered VerificationRequests(check existing) // with a list of missed remote dependencies - async fn check_remote_datasets( + async fn detect_remote_datasets( &self, - verification_requests: Vec, - ) -> (Vec, Vec) { + dataset_handles: Vec, + ) -> (Vec, Vec) { let mut result = vec![]; let mut missed_dependencies = vec![]; - for verification_request in &verification_requests { - if let Ok(dataset_handle) = self - .dataset_repo - .resolve_dataset_ref(&verification_request.dataset_ref) + for hdl in dataset_handles { + let is_remote = self + .remote_alias_reg + .get_remote_aliases(&hdl) .await - { - let is_remote = self - .remote_alias_reg - .get_remote_aliases(&dataset_handle.as_local_ref()) - .await - .unwrap() - .get_by_kind(RemoteAliasKind::Pull) - .next() - .is_some(); - if !is_remote || self.integrity { - result.push(verification_request.clone()); - continue; - } + .unwrap() + .get_by_kind(RemoteAliasKind::Pull) + .next() + .is_some(); + if !is_remote || self.integrity { + result.push(hdl); + continue; + } - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); - let summary = dataset - .get_summary(GetSummaryOpts::default()) + let resolved_dataset = self.dataset_registry.get_dataset_by_handle(&hdl); + let summary = resolved_dataset + .get_summary(GetSummaryOpts::default()) + .await + .unwrap(); + + let mut current_missed_dependencies = vec![]; + + for dependency in summary.dependencies { + if self + .dataset_registry + .resolve_dataset_handle_by_ref(&DatasetRef::ID(dependency.clone())) .await - .unwrap(); - - let mut current_missed_dependencies = vec![]; - - for dependency in summary.dependencies { - if self - .dataset_repo - .resolve_dataset_ref(&DatasetRef::ID(dependency.clone())) - .await - .is_err() - { - current_missed_dependencies.push(dependency.to_string()); - } - } - if !current_missed_dependencies.is_empty() { - 
missed_dependencies.push(RemoteRefDependency { - source_dataset: dataset_handle.alias.to_string(), - dependencies: current_missed_dependencies, - }); - } else { - result.push(verification_request.clone()); + .is_err() + { + current_missed_dependencies.push(dependency.to_string()); } } + if !current_missed_dependencies.is_empty() { + missed_dependencies.push(RemoteRefDependency { + source_dataset: hdl.alias.to_string(), + dependencies: current_missed_dependencies, + }); + } else { + result.push(hdl); + } } (result, missed_dependencies) diff --git a/src/app/cli/src/database.rs b/src/app/cli/src/database.rs index eaf1d36a74..c4cfc81ef5 100644 --- a/src/app/cli/src/database.rs +++ b/src/app/cli/src/database.rs @@ -12,6 +12,7 @@ use std::path::{Path, PathBuf}; use database_common::*; use dill::{Catalog, CatalogBuilder, Component}; use internal_error::{InternalError, ResultIntoInternal}; +use kamu::domain::TenancyConfig; use secrecy::SecretString; use tempfile::TempDir; @@ -50,14 +51,14 @@ impl AppDatabaseConfig { pub fn get_app_database_config( workspace_layout: &WorkspaceLayout, config: &config::CLIConfig, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, init_command: bool, ) -> AppDatabaseConfig { if let Some(database_config) = config.database.clone() { return AppDatabaseConfig::Explicit(database_config); } - if !multi_tenant_workspace { + if tenancy_config == TenancyConfig::SingleTenant { // Default for multi-tenant workspace only return AppDatabaseConfig::None; }; diff --git a/src/app/cli/src/error.rs b/src/app/cli/src/error.rs index 636d388b77..3d2cdb1e9c 100644 --- a/src/app/cli/src/error.rs +++ b/src/app/cli/src/error.rs @@ -181,7 +181,6 @@ impl From for CLIError { impl From for CLIError { fn from(v: GetAliasesError) -> Self { match v { - e @ GetAliasesError::DatasetNotFound(_) => Self::failure(e), e @ GetAliasesError::Internal(_) => Self::critical(e), } } diff --git a/src/app/cli/src/explore/api_server.rs b/src/app/cli/src/explore/api_server.rs index 1bc1c89d4b..1365eb2c2e 100644 --- a/src/app/cli/src/explore/api_server.rs +++ b/src/app/cli/src/explore/api_server.rs @@ -20,7 +20,7 @@ use dill::{Catalog, CatalogBuilder}; use http_common::ApiError; use indoc::indoc; use internal_error::*; -use kamu::domain::{Protocols, ServerUrlConfig}; +use kamu::domain::{Protocols, ServerUrlConfig, TenancyConfig}; use kamu_adapter_http::e2e::e2e_router; use kamu_flow_system_inmem::domain::FlowExecutor; use kamu_task_system_inmem::domain::TaskExecutor; @@ -46,7 +46,7 @@ impl APIServer { pub async fn new( base_catalog: &Catalog, cli_catalog: &Catalog, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, address: Option, port: Option, external_address: Option, @@ -115,23 +115,21 @@ impl APIServer { .merge(kamu_adapter_http::general::root_router()) .nest( "/odata", - if multi_tenant_workspace { - kamu_adapter_odata::router_multi_tenant() - } else { - kamu_adapter_odata::router_single_tenant() + match tenancy_config { + TenancyConfig::MultiTenant => kamu_adapter_odata::router_multi_tenant(), + TenancyConfig::SingleTenant => kamu_adapter_odata::router_single_tenant(), }, ) .nest( - if multi_tenant_workspace { - "/:account_name/:dataset_name" - } else { - "/:dataset_name" + match tenancy_config { + TenancyConfig::MultiTenant => "/:account_name/:dataset_name", + TenancyConfig::SingleTenant => "/:dataset_name", }, kamu_adapter_http::add_dataset_resolver_layer( OpenApiRouter::new() .merge(kamu_adapter_http::smart_transfer_protocol_router()) 
.merge(kamu_adapter_http::data::dataset_router()), - multi_tenant_workspace, + tenancy_config, ), ); diff --git a/src/app/cli/src/explore/web_ui_server.rs b/src/app/cli/src/explore/web_ui_server.rs index 3bb64478ca..050e0c3fd3 100644 --- a/src/app/cli/src/explore/web_ui_server.rs +++ b/src/app/cli/src/explore/web_ui_server.rs @@ -18,7 +18,7 @@ use database_common_macros::transactional_handler; use dill::{Catalog, CatalogBuilder}; use http_common::ApiError; use internal_error::*; -use kamu::domain::{Protocols, ServerUrlConfig}; +use kamu::domain::{Protocols, ServerUrlConfig, TenancyConfig}; use kamu_accounts::{ AccountConfig, AuthenticationService, @@ -83,7 +83,7 @@ pub struct WebUIServer { impl WebUIServer { pub async fn new( server_catalog: Catalog, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, current_account_name: AccountName, predefined_accounts_config: Arc, file_upload_limit_config: Arc, @@ -115,10 +115,10 @@ impl WebUIServer { login_credentials_json: serde_json::to_string(&login_credentials).unwrap(), }; - let web_ui_url = format!("http://{}", local_addr); + let web_ui_url = format!("http://{local_addr}"); let web_ui_config = WebUIConfig { - api_server_gql_url: format!("http://{}/graphql", local_addr), + api_server_gql_url: format!("http://{local_addr}/graphql"), api_server_http_url: web_ui_url.clone(), login_instructions: Some(login_instructions.clone()), ingest_upload_file_limit_mb: file_upload_limit_config.max_file_size_in_mb(), @@ -170,23 +170,21 @@ impl WebUIServer { )) .nest( "/odata", - if multi_tenant_workspace { - kamu_adapter_odata::router_multi_tenant() - } else { - kamu_adapter_odata::router_single_tenant() + match tenancy_config { + TenancyConfig::MultiTenant => kamu_adapter_odata::router_multi_tenant(), + TenancyConfig::SingleTenant => kamu_adapter_odata::router_single_tenant(), }, ) .nest( - if multi_tenant_workspace { - "/:account_name/:dataset_name" - } else { - "/:dataset_name" + match tenancy_config { + TenancyConfig::MultiTenant => "/:account_name/:dataset_name", + TenancyConfig::SingleTenant => "/:dataset_name", }, kamu_adapter_http::add_dataset_resolver_layer( OpenApiRouter::new() .merge(kamu_adapter_http::smart_transfer_protocol_router()) .merge(kamu_adapter_http::data::dataset_router()), - multi_tenant_workspace, + tenancy_config, ), ) .fallback(app_handler) diff --git a/src/app/cli/src/services/accounts/account_service.rs b/src/app/cli/src/services/accounts/account_service.rs index 686c67c3fc..7814a276c4 100644 --- a/src/app/cli/src/services/accounts/account_service.rs +++ b/src/app/cli/src/services/accounts/account_service.rs @@ -7,6 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
+use kamu::domain::TenancyConfig; use kamu_accounts::{PredefinedAccountsConfig, DEFAULT_ACCOUNT_NAME_STR}; use crate::accounts::models::*; @@ -16,29 +17,27 @@ use crate::accounts::models::*; pub struct AccountService {} impl AccountService { - pub fn default_account_name(multi_tenant_workspace: bool) -> String { - if multi_tenant_workspace { - whoami::username() - } else { - String::from(DEFAULT_ACCOUNT_NAME_STR) + pub fn default_account_name(tenancy_config: TenancyConfig) -> String { + match tenancy_config { + TenancyConfig::MultiTenant => whoami::username(), + TenancyConfig::SingleTenant => String::from(DEFAULT_ACCOUNT_NAME_STR), } } - pub fn default_user_name(multi_tenant_workspace: bool) -> String { - if multi_tenant_workspace { - whoami::realname() - } else { - String::from(DEFAULT_ACCOUNT_NAME_STR) + pub fn default_user_name(tenancy_config: TenancyConfig) -> String { + match tenancy_config { + TenancyConfig::MultiTenant => whoami::realname(), + TenancyConfig::SingleTenant => String::from(DEFAULT_ACCOUNT_NAME_STR), } } pub fn current_account_indication( account: Option, - multi_tenant_workspace: bool, + tenancy_config: TenancyConfig, predefined_accounts_config: &PredefinedAccountsConfig, ) -> CurrentAccountIndication { let (current_account, user_name, specified_explicitly) = { - let default_account_name = AccountService::default_account_name(multi_tenant_workspace); + let default_account_name = AccountService::default_account_name(tenancy_config); if let Some(account) = account { ( @@ -52,13 +51,13 @@ impl AccountService { true, ) } else { - let default_user_name = AccountService::default_user_name(multi_tenant_workspace); + let default_user_name = AccountService::default_user_name(tenancy_config); (default_account_name, default_user_name, false) } }; - let is_admin = if multi_tenant_workspace { + let is_admin = if tenancy_config == TenancyConfig::MultiTenant { predefined_accounts_config .predefined .iter() diff --git a/src/app/cli/src/services/workspace/workspace_layout.rs b/src/app/cli/src/services/workspace/workspace_layout.rs index e0c74d9ff2..1f3de5924d 100644 --- a/src/app/cli/src/services/workspace/workspace_layout.rs +++ b/src/app/cli/src/services/workspace/workspace_layout.rs @@ -10,6 +10,7 @@ use std::path::{Path, PathBuf}; use internal_error::{InternalError, ResultIntoInternal}; +use kamu::domain::TenancyConfig; use opendatafabric::serde::yaml::Manifest; use serde::{Deserialize, Serialize}; @@ -53,7 +54,10 @@ impl WorkspaceLayout { } } - pub fn create(root: impl Into, multi_tenant: bool) -> Result { + pub fn create( + root: impl Into, + tenancy_config: TenancyConfig, + ) -> Result { let ws = Self::new(root); if !ws.root_dir.exists() || ws.root_dir.read_dir().int_err()?.next().is_some() { std::fs::create_dir(&ws.root_dir).int_err()?; @@ -65,7 +69,7 @@ impl WorkspaceLayout { std::fs::write(&ws.version_path, WorkspaceVersion::LATEST.to_string()).int_err()?; // Only save the workspace configuration if it is different from default - let ws_config = WorkspaceConfig::new(multi_tenant); + let ws_config = WorkspaceConfig::new(tenancy_config == TenancyConfig::MultiTenant); if ws_config != WorkspaceConfig::default() { ws_config.save_to(&ws.config_path).int_err()?; } diff --git a/src/app/cli/tests/tests/test_di_graph.rs b/src/app/cli/tests/tests/test_di_graph.rs index 6dcae8fc12..1daf27e0ff 100644 --- a/src/app/cli/tests/tests/test_di_graph.rs +++ b/src/app/cli/tests/tests/test_di_graph.rs @@ -7,63 +7,64 @@ // the Business Source License, use of this software will be governed // by the 
Apache License, Version 2.0. -use kamu::domain::ServerUrlConfig; +use kamu::domain::{ServerUrlConfig, TenancyConfig}; use kamu_accounts::{CurrentAccountSubject, JwtAuthenticationConfig}; use kamu_adapter_http::AccessToken; +use kamu_adapter_oauth::GithubAuthenticationConfig; use kamu_cli::{self, OutputConfig, WorkspaceLayout}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] async fn test_di_cli_graph_validates_st() { - test_di_cli_graph_validates(false); + test_di_cli_graph_validates(TenancyConfig::SingleTenant); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] async fn test_di_cli_graph_validates_mt() { - test_di_cli_graph_validates(true); + test_di_cli_graph_validates(TenancyConfig::MultiTenant); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] async fn test_di_server_graph_validates_st() { - test_di_server_graph_validates(false); + test_di_server_graph_validates(TenancyConfig::SingleTenant); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] async fn test_di_server_graph_validates_mt() { - test_di_server_graph_validates(true); + test_di_server_graph_validates(TenancyConfig::MultiTenant); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Tests //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -fn test_di_cli_graph_validates(multi_tenant_workspace: bool) { +fn test_di_cli_graph_validates(tenancy_config: TenancyConfig) { let temp_dir = tempfile::tempdir().unwrap(); let workspace_layout = WorkspaceLayout::new(temp_dir.path()); let mut base_catalog_builder = - kamu_cli::configure_base_catalog(&workspace_layout, false, None, false); + kamu_cli::configure_base_catalog(&workspace_layout, tenancy_config, None, false); kamu_cli::configure_in_memory_components(&mut base_catalog_builder); base_catalog_builder.add_value(OutputConfig::default()); kamu_cli::register_config_in_catalog( &kamu_cli::config::CLIConfig::default(), &mut base_catalog_builder, - multi_tenant_workspace, + tenancy_config, ); let base_catalog = base_catalog_builder.build(); - let mut cli_catalog_builder = - kamu_cli::configure_cli_catalog(&base_catalog, multi_tenant_workspace); + let mut cli_catalog_builder = kamu_cli::configure_cli_catalog(&base_catalog, tenancy_config); cli_catalog_builder.add_value(CurrentAccountSubject::new_test()); cli_catalog_builder.add_value(JwtAuthenticationConfig::default()); + cli_catalog_builder.add_value(GithubAuthenticationConfig::default()); let validate_result = cli_catalog_builder.validate(); @@ -76,25 +77,25 @@ fn test_di_cli_graph_validates(multi_tenant_workspace: bool) { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -fn test_di_server_graph_validates(multi_tenant_workspace: bool) { +fn test_di_server_graph_validates(tenancy_config: TenancyConfig) { let temp_dir = tempfile::tempdir().unwrap(); let workspace_layout = WorkspaceLayout::new(temp_dir.path()); let mut base_catalog_builder = - kamu_cli::configure_base_catalog(&workspace_layout, false, None, false); + 
kamu_cli::configure_base_catalog(&workspace_layout, tenancy_config, None, false); kamu_cli::configure_in_memory_components(&mut base_catalog_builder); base_catalog_builder.add_value(OutputConfig::default()); kamu_cli::register_config_in_catalog( &kamu_cli::config::CLIConfig::default(), &mut base_catalog_builder, - multi_tenant_workspace, + tenancy_config, ); let base_catalog = base_catalog_builder.build(); let mut cli_catalog_builder = kamu_cli::configure_server_catalog(&base_catalog); - cli_catalog_builder.add_value(CurrentAccountSubject::new_test()); cli_catalog_builder.add_value(JwtAuthenticationConfig::default()); + cli_catalog_builder.add_value(GithubAuthenticationConfig::default()); cli_catalog_builder.add_value(ServerUrlConfig::new_test(None)); cli_catalog_builder.add_value(AccessToken::new("some-test-token")); diff --git a/src/domain/accounts/domain/src/entities/current_account_subject.rs b/src/domain/accounts/domain/src/entities/current_account_subject.rs index 2c3c8c8a8f..e8b39e982f 100644 --- a/src/domain/accounts/domain/src/entities/current_account_subject.rs +++ b/src/domain/accounts/domain/src/entities/current_account_subject.rs @@ -55,6 +55,22 @@ impl CurrentAccountSubject { is_admin, ) } + + pub fn account_name(&self) -> &AccountName { + match self { + CurrentAccountSubject::Anonymous(_) => { + panic!("Anonymous account misused"); + } + CurrentAccountSubject::Logged(l) => &l.account_name, + } + } + + pub fn account_name_or_default(&self) -> &AccountName { + match self { + CurrentAccountSubject::Logged(l) => &l.account_name, + CurrentAccountSubject::Anonymous(_) => &DEFAULT_ACCOUNT_NAME, + } + } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/auth-rebac/services/src/multi_tenant_rebac_dataset_lifecycle_message_consumer.rs b/src/domain/auth-rebac/services/src/multi_tenant_rebac_dataset_lifecycle_message_consumer.rs index 41634a735d..2c8fb3108d 100644 --- a/src/domain/auth-rebac/services/src/multi_tenant_rebac_dataset_lifecycle_message_consumer.rs +++ b/src/domain/auth-rebac/services/src/multi_tenant_rebac_dataset_lifecycle_message_consumer.rs @@ -22,7 +22,7 @@ use messaging_outbox::{ MessageConsumer, MessageConsumerMeta, MessageConsumerT, - MessageConsumptionDurability, + MessageDeliveryMechanism, }; use crate::{RebacServiceImpl, MESSAGE_CONSUMER_KAMU_REBAC_SERVICE}; @@ -41,7 +41,7 @@ pub struct MultiTenantRebacDatasetLifecycleMessageConsumer { feeding_producers: &[ MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, ], - durability: MessageConsumptionDurability::Durable, + delivery: MessageDeliveryMechanism::Immediate, })] impl MultiTenantRebacDatasetLifecycleMessageConsumer { pub fn new(rebac_service: Arc) -> Self { diff --git a/src/domain/core/src/auth/dataset_action_authorizer.rs b/src/domain/core/src/auth/dataset_action_authorizer.rs index 20c6cccafd..7e7f09d23b 100644 --- a/src/domain/core/src/auth/dataset_action_authorizer.rs +++ b/src/domain/core/src/auth/dataset_action_authorizer.rs @@ -40,6 +40,18 @@ pub trait DatasetActionAuthorizer: Sync + Send { } async fn get_allowed_actions(&self, dataset_handle: &DatasetHandle) -> HashSet; + + async fn filter_datasets_allowing( + &self, + dataset_handles: Vec, + action: DatasetAction, + ) -> Result, InternalError>; + + async fn classify_datasets_by_allowance( + &self, + dataset_handles: Vec, + action: DatasetAction, + ) -> Result; } 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -93,6 +105,14 @@ pub struct DatasetActionNotEnoughPermissionsError { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[derive(Debug)] +pub struct ClassifyByAllowanceResponse { + pub authorized_handles: Vec, + pub unauthorized_handles_with_errors: Vec<(DatasetHandle, DatasetActionUnauthorizedError)>, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[component(pub)] #[interface(dyn DatasetActionAuthorizer)] pub struct AlwaysHappyDatasetActionAuthorizer {} @@ -117,6 +137,25 @@ impl DatasetActionAuthorizer for AlwaysHappyDatasetActionAuthorizer { async fn get_allowed_actions(&self, _dataset_handle: &DatasetHandle) -> HashSet { HashSet::from([DatasetAction::Read, DatasetAction::Write]) } + + async fn filter_datasets_allowing( + &self, + dataset_handles: Vec, + _action: DatasetAction, + ) -> Result, InternalError> { + Ok(dataset_handles) + } + + async fn classify_datasets_by_allowance( + &self, + dataset_handles: Vec, + _action: DatasetAction, + ) -> Result { + Ok(ClassifyByAllowanceResponse { + authorized_handles: dataset_handles, + unauthorized_handles_with_errors: vec![], + }) + } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/entities/dataset.rs b/src/domain/core/src/entities/dataset.rs index b7fa029797..26cf1da4a7 100644 --- a/src/domain/core/src/entities/dataset.rs +++ b/src/domain/core/src/entities/dataset.rs @@ -14,6 +14,7 @@ use chrono::{DateTime, Utc}; use internal_error::*; use opendatafabric::*; use thiserror::Error; +use url::Url; pub use crate::utils::owned_file::OwnedFile; use crate::*; @@ -64,6 +65,8 @@ pub trait Dataset: Send + Sync { checkpoint: Option<&CheckpointRef>, ) -> Result; + fn get_storage_internal_url(&self) -> &Url; + fn as_metadata_chain(&self) -> &dyn MetadataChain; fn as_data_repo(&self) -> &dyn ObjectRepository; fn as_checkpoint_repo(&self) -> &dyn ObjectRepository; diff --git a/src/domain/core/src/entities/engine.rs b/src/domain/core/src/entities/engine.rs index 808747147e..00e45c8f7b 100644 --- a/src/domain/core/src/entities/engine.rs +++ b/src/domain/core/src/entities/engine.rs @@ -17,7 +17,7 @@ use internal_error::*; use opendatafabric::*; use thiserror::Error; -use crate::{BlockRef, OwnedFile}; +use crate::{BlockRef, OwnedFile, ResolvedDatasetsMap}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Engine @@ -33,6 +33,7 @@ pub trait Engine: Send + Sync { async fn execute_transform( &self, request: TransformRequestExt, + datasets_map: &ResolvedDatasetsMap, ) -> Result; } diff --git a/src/domain/core/src/entities/metadata_chain.rs b/src/domain/core/src/entities/metadata_chain.rs index 092b318754..b63c6727c6 100644 --- a/src/domain/core/src/entities/metadata_chain.rs +++ b/src/domain/core/src/entities/metadata_chain.rs @@ -558,6 +558,7 @@ impl From for IterBlocksError { pub enum AcceptVisitorError { #[error(transparent)] Traversal(IterBlocksError), + #[error(transparent)] Visitor(E), } diff --git a/src/domain/core/src/entities/mod.rs b/src/domain/core/src/entities/mod.rs index a9a5137a6e..ae32ef0e86 100644 --- a/src/domain/core/src/entities/mod.rs +++ b/src/domain/core/src/entities/mod.rs @@ -12,8 +12,12 @@ 
pub mod dataset_summary; pub mod engine; pub mod metadata_chain; pub mod metadata_stream; +pub mod resolved_dataset; +pub mod resolved_datasets_map; pub use dataset::*; pub use dataset_summary::*; pub use metadata_chain::*; pub use metadata_stream::*; +pub use resolved_dataset::*; +pub use resolved_datasets_map::*; diff --git a/src/domain/core/src/entities/resolved_dataset.rs b/src/domain/core/src/entities/resolved_dataset.rs new file mode 100644 index 0000000000..bc01b32dc9 --- /dev/null +++ b/src/domain/core/src/entities/resolved_dataset.rs @@ -0,0 +1,74 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use opendatafabric::{self as odf}; + +use crate::{CreateDatasetResult, Dataset}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Clone)] +pub struct ResolvedDataset { + dataset: Arc, + handle: odf::DatasetHandle, +} + +impl ResolvedDataset { + pub fn new(dataset: Arc, handle: odf::DatasetHandle) -> Self { + Self { dataset, handle } + } + + pub fn from(create_dataset_result: &CreateDatasetResult) -> Self { + Self { + dataset: create_dataset_result.dataset.clone(), + handle: create_dataset_result.dataset_handle.clone(), + } + } + + #[inline] + pub fn get_id(&self) -> &odf::DatasetID { + &self.handle.id + } + + #[inline] + pub fn get_alias(&self) -> &odf::DatasetAlias { + &self.handle.alias + } + + #[inline] + pub fn get_handle(&self) -> &odf::DatasetHandle { + &self.handle + } + + #[inline] + pub fn take_handle(self) -> odf::DatasetHandle { + self.handle + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +impl std::ops::Deref for ResolvedDataset { + type Target = Arc; + fn deref(&self) -> &Self::Target { + &self.dataset + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +impl std::fmt::Debug for ResolvedDataset { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.handle.fmt(f) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/entities/resolved_datasets_map.rs b/src/domain/core/src/entities/resolved_datasets_map.rs new file mode 100644 index 0000000000..87c1d7be01 --- /dev/null +++ b/src/domain/core/src/entities/resolved_datasets_map.rs @@ -0,0 +1,64 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::collections::HashMap; + +use opendatafabric::{DatasetHandle, DatasetID}; + +use crate::ResolvedDataset; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default)] +pub struct ResolvedDatasetsMap { + resolved_datasets_by_id: HashMap, +} + +impl ResolvedDatasetsMap { + pub fn get_by_id(&self, id: &DatasetID) -> &ResolvedDataset { + self.resolved_datasets_by_id + .get(id) + .expect("Dataset must be present") + } + + #[inline] + pub fn get_by_handle(&self, handle: &DatasetHandle) -> &ResolvedDataset { + self.get_by_id(&handle.id) + } + + pub fn iterate_all_handles(&self) -> impl Iterator { + self.resolved_datasets_by_id + .values() + .map(ResolvedDataset::get_handle) + } + + pub fn register(&mut self, resolved_dataset: ResolvedDataset) { + if !self + .resolved_datasets_by_id + .contains_key(resolved_dataset.get_id()) + { + self.resolved_datasets_by_id + .insert(resolved_dataset.get_id().clone(), resolved_dataset); + } + } + + pub fn register_with( + &mut self, + handle: &DatasetHandle, + dataset_fn: impl Fn(&DatasetHandle) -> ResolvedDataset, + ) { + if !self.resolved_datasets_by_id.contains_key(&handle.id) { + let resolved_dataset = dataset_fn(handle); + self.resolved_datasets_by_id + .insert(handle.id.clone(), resolved_dataset); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/lib.rs b/src/domain/core/src/lib.rs index ca244ca588..909dfc60bd 100644 --- a/src/domain/core/src/lib.rs +++ b/src/domain/core/src/lib.rs @@ -28,3 +28,4 @@ pub use repos::{DatasetNotFoundError, *}; pub use services::*; pub use use_cases::*; pub use utils::paths::*; +pub use utils::TenancyConfig; diff --git a/src/domain/core/src/repos/dataset_registry.rs b/src/domain/core/src/repos/dataset_registry.rs deleted file mode 100644 index 0cf49b620c..0000000000 --- a/src/domain/core/src/repos/dataset_registry.rs +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
- -use internal_error::InternalError; -use opendatafabric::*; -use url::Url; - -use super::*; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[async_trait::async_trait] -pub trait DatasetRegistry: Send + Sync { - async fn get_dataset_url(&self, dataset_ref: &DatasetRef) -> Result; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Errors -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(thiserror::Error, Debug)] -pub enum GetDatasetUrlError { - #[error(transparent)] - NotFound( - #[from] - #[backtrace] - DatasetNotFoundError, - ), - #[error(transparent)] - Access( - #[from] - #[backtrace] - AccessError, - ), - #[error(transparent)] - Internal( - #[from] - #[backtrace] - InternalError, - ), -} - -impl From for GetDatasetUrlError { - fn from(v: GetDatasetError) -> Self { - match v { - GetDatasetError::NotFound(e) => Self::NotFound(e), - GetDatasetError::Internal(e) => Self::Internal(e), - } - } -} diff --git a/src/domain/core/src/repos/dataset_repository.rs b/src/domain/core/src/repos/dataset_repository.rs index 6f56c3d7d2..df640e4190 100644 --- a/src/domain/core/src/repos/dataset_repository.rs +++ b/src/domain/core/src/repos/dataset_repository.rs @@ -46,23 +46,17 @@ pub struct CreateDatasetFromSnapshotResult { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Abstraction of datasets storage repository #[async_trait] -pub trait DatasetRepository: DatasetRegistry + Sync + Send { - fn is_multi_tenant(&self) -> bool; - - async fn resolve_dataset_ref( +pub trait DatasetRepository: Sync + Send { + async fn resolve_dataset_handle_by_ref( &self, dataset_ref: &DatasetRef, ) -> Result; - fn get_all_datasets(&self) -> DatasetHandleStream<'_>; - - fn get_datasets_by_owner(&self, account_name: &AccountName) -> DatasetHandleStream<'_>; + fn all_dataset_handles(&self) -> DatasetHandleStream<'_>; - async fn find_dataset_by_ref( - &self, - dataset_ref: &DatasetRef, - ) -> Result, GetDatasetError>; + fn all_dataset_handles_by_owner(&self, account_name: &AccountName) -> DatasetHandleStream<'_>; fn get_dataset_by_handle(&self, dataset_handle: &DatasetHandle) -> Arc; } @@ -72,56 +66,6 @@ pub trait DatasetRepository: DatasetRegistry + Sync + Send { pub type DatasetHandleStream<'a> = Pin> + Send + 'a>>; -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Extensions -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[async_trait] -pub trait DatasetRepositoryExt: DatasetRepository { - async fn try_resolve_dataset_ref( - &self, - dataset_ref: &DatasetRef, - ) -> Result, InternalError>; - - async fn try_get_dataset( - &self, - dataset_ref: &DatasetRef, - ) -> Result>, InternalError>; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[async_trait] -impl DatasetRepositoryExt for T -where - T: DatasetRepository, - T: ?Sized, -{ - async fn try_resolve_dataset_ref( - &self, - dataset_ref: &DatasetRef, - ) -> Result, InternalError> { - match self.resolve_dataset_ref(dataset_ref).await { - Ok(hdl) => Ok(Some(hdl)), - Err(GetDatasetError::NotFound(_)) => Ok(None), - 
Err(GetDatasetError::Internal(e)) => Err(e), - } - } - - async fn try_get_dataset( - &self, - dataset_ref: &DatasetRef, - ) -> Result>, InternalError> { - match self.find_dataset_by_ref(dataset_ref).await { - Ok(ds) => Ok(Some(ds)), - Err(GetDatasetError::NotFound(_)) => Ok(None), - Err(GetDatasetError::Internal(e)) => Err(e), - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Errors //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/repos/mod.rs b/src/domain/core/src/repos/mod.rs index d361df5783..ac80a414cb 100644 --- a/src/domain/core/src/repos/mod.rs +++ b/src/domain/core/src/repos/mod.rs @@ -8,7 +8,6 @@ // by the Apache License, Version 2.0. pub mod dataset_factory; -pub mod dataset_registry; pub mod dataset_repository; pub mod metadata_block_repository; pub mod metadata_chain_visitor; @@ -19,7 +18,6 @@ pub mod object_store_registry; pub mod reference_repository; pub use dataset_factory::*; -pub use dataset_registry::*; pub use dataset_repository::*; pub use metadata_block_repository::*; pub use metadata_chain_visitor::*; diff --git a/src/domain/core/src/services/compaction_service.rs b/src/domain/core/src/services/compaction_service.rs index ee93523791..30e7420078 100644 --- a/src/domain/core/src/services/compaction_service.rs +++ b/src/domain/core/src/services/compaction_service.rs @@ -16,24 +16,21 @@ use thiserror::Error; use crate::*; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + pub const DEFAULT_MAX_SLICE_SIZE: u64 = 300_000_000; pub const DEFAULT_MAX_SLICE_RECORDS: u64 = 10_000; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[async_trait::async_trait] pub trait CompactionService: Send + Sync { async fn compact_dataset( &self, - dataset_handle: &DatasetHandle, + target: ResolvedDataset, options: CompactionOptions, listener: Option>, ) -> Result; - - async fn compact_multi( - &self, - dataset_refs: Vec, - options: CompactionOptions, - listener: Option>, - ) -> Vec; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -117,9 +114,9 @@ impl From for CompactionError { } #[derive(Error, Debug)] -#[error("Dataset {dataset_name} in not root kind")] +#[error("Dataset '{dataset_alias}' in not root kind")] pub struct InvalidDatasetKindError { - pub dataset_name: DatasetName, + pub dataset_alias: DatasetAlias, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -191,3 +188,5 @@ impl Default for CompactionOptions { } } } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/dataset_registry.rs b/src/domain/core/src/services/dataset_registry.rs new file mode 100644 index 0000000000..8953b6bf73 --- /dev/null +++ b/src/domain/core/src/services/dataset_registry.rs @@ -0,0 +1,99 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use internal_error::InternalError; +use opendatafabric::{AccountName, DatasetHandle, DatasetID, DatasetRef}; +use thiserror::Error; + +use crate::{DatasetHandleStream, GetDatasetError, ResolvedDataset}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait DatasetRegistry: Send + Sync { + fn all_dataset_handles(&self) -> DatasetHandleStream<'_>; + + fn all_dataset_handles_by_owner(&self, owner_name: &AccountName) -> DatasetHandleStream<'_>; + + async fn resolve_dataset_handle_by_ref( + &self, + dataset_ref: &DatasetRef, + ) -> Result; + + async fn resolve_multiple_dataset_handles_by_ids( + &self, + dataset_ids: Vec, + ) -> Result; + + fn get_dataset_by_handle(&self, dataset_handle: &DatasetHandle) -> ResolvedDataset; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Extensions +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait DatasetRegistryExt: DatasetRegistry { + async fn try_resolve_dataset_handle_by_ref( + &self, + dataset_ref: &DatasetRef, + ) -> Result, InternalError>; + + async fn get_dataset_by_ref( + &self, + dataset_ref: &DatasetRef, + ) -> Result; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl DatasetRegistryExt for T +where + T: DatasetRegistry, + T: ?Sized, +{ + async fn try_resolve_dataset_handle_by_ref( + &self, + dataset_ref: &DatasetRef, + ) -> Result, InternalError> { + match self.resolve_dataset_handle_by_ref(dataset_ref).await { + Ok(hdl) => Ok(Some(hdl)), + Err(GetDatasetError::NotFound(_)) => Ok(None), + Err(GetDatasetError::Internal(e)) => Err(e), + } + } + + async fn get_dataset_by_ref( + &self, + dataset_ref: &DatasetRef, + ) -> Result { + let dataset_handle = self.resolve_dataset_handle_by_ref(dataset_ref).await?; + let dataset = self.get_dataset_by_handle(&dataset_handle); + Ok(dataset) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default)] +pub struct DatasetHandlesResolution { + pub resolved_handles: Vec, + pub unresolved_datasets: Vec<(DatasetID, GetDatasetError)>, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum GetMultipleDatasetsError { + #[error(transparent)] + Internal(#[from] InternalError), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/ingest/polling_ingest_service.rs b/src/domain/core/src/services/ingest/polling_ingest_service.rs index bead1ad946..be5e18ed1a 100644 --- a/src/domain/core/src/services/ingest/polling_ingest_service.rs +++ b/src/domain/core/src/services/ingest/polling_ingest_service.rs @@ -31,25 +31,17 @@ pub trait PollingIngestService: Send + Sync { /// Returns an active polling source, if any async fn get_active_polling_source( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, ) -> Result)>, 
GetDatasetError>; /// Uses polling source definition in metadata to ingest data from an /// external source async fn ingest( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, options: PollingIngestOptions, - listener: Option>, + maybe_listener: Option>, ) -> Result; - - /// A batch version of [PollingIngestService::ingest] - async fn ingest_multi( - &self, - dataset_refs: Vec, - options: PollingIngestOptions, - listener: Option>, - ) -> Vec; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -90,12 +82,6 @@ impl Default for SchemaInferenceOpts { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[derive(Debug)] -pub struct PollingIngestResponse { - pub dataset_ref: DatasetRef, - pub result: Result, -} - #[derive(Debug)] pub enum PollingIngestResult { UpToDate { @@ -233,13 +219,6 @@ impl TemplateInvalidPatternError { // TODO: Revisit error granularity #[derive(Debug, Error)] pub enum PollingIngestError { - #[error(transparent)] - DatasetNotFound( - #[from] - #[backtrace] - DatasetNotFoundError, - ), - #[error("Source is unreachable at {path}")] Unreachable { path: String, @@ -331,13 +310,6 @@ pub enum PollingIngestError { CommitError, ), - #[error(transparent)] - Access( - #[from] - #[backtrace] - AccessError, - ), - #[error(transparent)] InvalidParameterFormat( #[from] @@ -360,24 +332,6 @@ pub enum PollingIngestError { ), } -impl From for PollingIngestError { - fn from(v: GetDatasetError) -> Self { - match v { - GetDatasetError::NotFound(e) => Self::DatasetNotFound(e), - GetDatasetError::Internal(e) => Self::Internal(e), - } - } -} - -impl From for PollingIngestError { - fn from(v: auth::DatasetActionUnauthorizedError) -> Self { - match v { - auth::DatasetActionUnauthorizedError::Access(e) => Self::Access(e), - auth::DatasetActionUnauthorizedError::Internal(e) => Self::Internal(e), - } - } -} - impl From for PollingIngestError { fn from(value: FindDatasetEnvVarError) -> Self { match value { diff --git a/src/domain/core/src/services/ingest/push_ingest_service.rs b/src/domain/core/src/services/ingest/push_ingest_service.rs index 2b1ba8e725..b284d042cb 100644 --- a/src/domain/core/src/services/ingest/push_ingest_service.rs +++ b/src/domain/core/src/services/ingest/push_ingest_service.rs @@ -26,7 +26,7 @@ pub trait PushIngestService: Send + Sync { /// Returns the set of active push sources async fn get_active_push_sources( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, ) -> Result)>, GetDatasetError>; /// Uses push source definition in metadata to ingest data from the @@ -35,7 +35,7 @@ pub trait PushIngestService: Send + Sync { /// See also [MediaType]. async fn ingest_from_url( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, source_name: Option<&str>, url: url::Url, opts: PushIngestOpts, @@ -48,7 +48,7 @@ pub trait PushIngestService: Send + Sync { /// See also [MediaType]. 
async fn ingest_from_file_stream( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, source_name: Option<&str>, data: Box, opts: PushIngestOpts, @@ -122,13 +122,6 @@ impl PushIngestListener for NullPushIngestListener {} // TODO: Revisit error granularity #[derive(Debug, Error)] pub enum PushIngestError { - #[error(transparent)] - DatasetNotFound( - #[from] - #[backtrace] - DatasetNotFoundError, - ), - #[error(transparent)] SourceNotFound( #[from] @@ -200,24 +193,6 @@ pub enum PushIngestError { ), } -impl From for PushIngestError { - fn from(v: GetDatasetError) -> Self { - match v { - GetDatasetError::NotFound(e) => Self::DatasetNotFound(e), - GetDatasetError::Internal(e) => Self::Internal(e), - } - } -} - -impl From for PushIngestError { - fn from(v: auth::DatasetActionUnauthorizedError) -> Self { - match v { - auth::DatasetActionUnauthorizedError::Access(e) => Self::Access(e), - auth::DatasetActionUnauthorizedError::Internal(e) => Self::Internal(e), - } - } -} - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[derive(Debug, Error, Default)] diff --git a/src/domain/core/src/services/mod.rs b/src/domain/core/src/services/mod.rs index 6c56d7617f..a9beda7b29 100644 --- a/src/domain/core/src/services/mod.rs +++ b/src/domain/core/src/services/mod.rs @@ -13,13 +13,14 @@ pub use container_runtime::{NullPullImageListener, PullImageListener}; pub mod compaction_service; pub mod dataset_changes_service; pub mod dataset_ownership_service; +pub mod dataset_registry; pub mod dependency_graph_repository; pub mod dependency_graph_service; pub mod engine_provisioner; pub mod ingest; pub mod provenance_service; -pub mod pull_service; -pub mod push_service; +pub mod pull_request_planner; +pub mod push_request_planner; pub mod query_service; pub mod remote_aliases; pub mod remote_aliases_registry; @@ -29,19 +30,21 @@ pub mod resource_loader; pub mod search_service; pub mod server_url_config; pub mod sync_service; -pub mod transform_service; +pub mod transform; pub mod verification_service; +pub mod watermark_service; pub use compaction_service::*; pub use dataset_changes_service::*; pub use dataset_ownership_service::*; +pub use dataset_registry::*; pub use dependency_graph_repository::*; pub use dependency_graph_service::*; pub use engine_provisioner::*; pub use ingest::*; pub use provenance_service::*; -pub use pull_service::*; -pub use push_service::*; +pub use pull_request_planner::*; +pub use push_request_planner::*; pub use query_service::*; pub use remote_aliases::*; pub use remote_aliases_registry::*; @@ -51,5 +54,6 @@ pub use resource_loader::*; pub use search_service::*; pub use server_url_config::*; pub use sync_service::*; -pub use transform_service::*; +pub use transform::*; pub use verification_service::*; +pub use watermark_service::*; diff --git a/src/domain/core/src/services/pull_request_planner.rs b/src/domain/core/src/services/pull_request_planner.rs new file mode 100644 index 0000000000..c5b3c6953a --- /dev/null +++ b/src/domain/core/src/services/pull_request_planner.rs @@ -0,0 +1,427 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::borrow::Cow; +use std::sync::Arc; + +use ::serde::{Deserialize, Serialize}; +use internal_error::InternalError; +use opendatafabric::*; +use thiserror::Error; + +use crate::*; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Service +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait PullRequestPlanner: Send + Sync { + async fn build_pull_plan( + &self, + request: PullRequest, + options: &PullOptions, + tenancy_config: TenancyConfig, + ) -> Result; + + // This function descends down the dependency tree of datasets (starting with + // provided references) assigning depth index to every dataset in the + // graph(s). Datasets that share the same depth level are independent and + // can be pulled in parallel. + async fn build_pull_multi_plan( + &self, + requests: &[PullRequest], + options: &PullOptions, + tenancy_config: TenancyConfig, + ) -> (Vec, Vec); + + async fn build_pull_plan_all_owner_datasets( + &self, + options: &PullOptions, + tenancy_config: TenancyConfig, + ) -> Result<(Vec, Vec), InternalError>; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct PullPlanIteration { + pub depth: i32, + pub jobs: Vec, +} + +#[derive(Debug)] +pub enum PullPlanIterationJob { + Ingest(PullIngestItem), + Transform(PullTransformItem), + Sync(PullSyncItem), +} + +impl PullPlanIterationJob { + pub fn as_common_item(&self) -> &dyn PullItemCommon { + match self { + Self::Ingest(pii) => pii, + Self::Transform(pti) => pti, + Self::Sync(psi) => psi, + } + } + + pub fn into_original_pull_request(self) -> Option { + match self { + Self::Ingest(pii) => pii.maybe_original_request, + Self::Transform(pti) => pti.maybe_original_request, + Self::Sync(psi) => psi.maybe_original_request, + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct PullIngestItem { + pub depth: i32, + pub target: ResolvedDataset, + pub maybe_original_request: Option, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct PullTransformItem { + pub depth: i32, + pub target: ResolvedDataset, + pub maybe_original_request: Option, + pub plan: TransformPreliminaryPlan, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct PullSyncItem { + pub depth: i32, + pub local_target: PullLocalTarget, + pub remote_ref: DatasetRefRemote, + pub maybe_original_request: Option, + pub sync_request: Box, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum PullLocalTarget { + Existing(DatasetHandle), + ToCreate(DatasetAlias), +} + +impl PullLocalTarget { + pub fn existing(hdl: DatasetHandle) -> Self { + Self::Existing(hdl) + } + + pub fn to_create(alias: DatasetAlias) -> Self { + Self::ToCreate(alias) + } + + pub fn alias(&self) -> &DatasetAlias { + match self { + Self::Existing(hdl) => &hdl.alias, + Self::ToCreate(alias) => alias, + } + } + + pub fn as_local_ref(&self) -> DatasetRef { + match self { + Self::Existing(hdl) 
=> hdl.as_local_ref(), + Self::ToCreate(alias) => alias.as_local_ref(), + } + } + + pub fn as_any_ref(&self) -> DatasetRefAny { + match self { + Self::Existing(hdl) => hdl.as_any_ref(), + Self::ToCreate(alias) => alias.as_any_ref(), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum PullRequest { + Local(DatasetRef), + Remote(PullRequestRemote), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PullRequestRemote { + pub remote_ref: DatasetRefRemote, + pub maybe_local_alias: Option, +} + +impl PullRequest { + pub fn local(dataset_ref: DatasetRef) -> Self { + Self::Local(dataset_ref) + } + + pub fn remote(remote_ref: DatasetRefRemote, maybe_local_alias: Option) -> Self { + Self::Remote(PullRequestRemote { + remote_ref, + maybe_local_alias, + }) + } + + pub fn from_any_ref(dataset_ref: &DatasetRefAny, is_repo: impl Fn(&RepoName) -> bool) -> Self { + // Single-tenant workspace => treat all repo-like references as repos. + // Multi-tenant workspace => treat all repo-like references as accounts, use + // repo:// for repos + match dataset_ref.as_local_ref(is_repo) { + Ok(local_ref) => Self::local(local_ref), + Err(remote_ref) => Self::remote(remote_ref, None), + } + } + + pub fn local_ref(&self) -> Option> { + match self { + PullRequest::Local(local_ref) => Some(Cow::Borrowed(local_ref)), + PullRequest::Remote(remote) => remote + .maybe_local_alias + .as_ref() + .map(|alias| Cow::Owned(alias.as_local_ref())), + } + } + + pub fn remote_ref(&self) -> Option<&DatasetRefRemote> { + match self { + PullRequest::Local(_) => None, + PullRequest::Remote(remote) => Some(&remote.remote_ref), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub trait PullItemCommon { + fn try_get_written_handle(&self) -> Option<&DatasetHandle>; + fn get_read_handles(&self) -> Vec<&DatasetHandle>; +} + +impl PullItemCommon for PullIngestItem { + fn try_get_written_handle(&self) -> Option<&DatasetHandle> { + Some(self.target.get_handle()) + } + + fn get_read_handles(&self) -> Vec<&DatasetHandle> { + vec![] + } +} + +impl PullItemCommon for PullTransformItem { + fn try_get_written_handle(&self) -> Option<&DatasetHandle> { + Some(self.target.get_handle()) + } + + fn get_read_handles(&self) -> Vec<&DatasetHandle> { + let mut read_handles = Vec::new(); + for hdl in self.plan.datasets_map.iterate_all_handles() { + if hdl != self.target.get_handle() { + read_handles.push(hdl); + } + } + read_handles + } +} + +impl PullItemCommon for PullSyncItem { + fn try_get_written_handle(&self) -> Option<&DatasetHandle> { + match &self.local_target { + PullLocalTarget::Existing(hdl) => Some(hdl), + PullLocalTarget::ToCreate(_) => None, + } + } + + fn get_read_handles(&self) -> Vec<&DatasetHandle> { + vec![] + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct PullResponse { + /// Parameters passed into the call. Empty for datasets that were pulled as + /// recursive dependencies. 
+ pub maybe_original_request: Option, + /// Local dataset handle, if resolved + pub maybe_local_ref: Option, + /// Destination reference, if resolved + pub maybe_remote_ref: Option, + /// Result of the push operation + pub result: Result, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone)] +pub struct PullOptions { + /// Pull all dataset dependencies recursively in depth-first order + pub recursive: bool, + /// Whether the datasets pulled from remotes should be permanently + /// associated with them + pub add_aliases: bool, + /// Ingest-specific options + pub ingest_options: PollingIngestOptions, + /// Sync-specific options, + pub sync_options: SyncOptions, + /// Transform-specific options, + pub transform_options: TransformOptions, +} + +impl Default for PullOptions { + fn default() -> Self { + Self { + recursive: false, + add_aliases: true, + ingest_options: PollingIngestOptions::default(), + sync_options: SyncOptions::default(), + transform_options: TransformOptions::default(), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub trait PullListener: Send + Sync { + fn get_ingest_listener(self: Arc) -> Option>; + fn get_transform_listener(self: Arc) -> Option>; + fn get_sync_listener(self: Arc) -> Option>; +} + +pub trait PullMultiListener: Send + Sync { + fn get_ingest_listener(self: Arc) -> Option>; + fn get_transform_listener(self: Arc) -> Option>; + fn get_sync_listener(self: Arc) -> Option>; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub enum PullResult { + UpToDate(PullResultUpToDate), + Updated { + old_head: Option, + new_head: Multihash, + }, +} + +#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub enum PullResultUpToDate { + PollingIngest(PollingIngestResultUpToDate), + PushIngest(PushInsgestResultUpToDate), + Transform, + Sync, +} + +#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct PollingIngestResultUpToDate { + pub uncacheable: bool, +} + +#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct PushInsgestResultUpToDate { + pub uncacheable: bool, +} + +impl From for PullResult { + fn from(other: PollingIngestResult) -> Self { + match other { + PollingIngestResult::UpToDate { uncacheable, .. } => PullResult::UpToDate( + PullResultUpToDate::PollingIngest(PollingIngestResultUpToDate { uncacheable }), + ), + PollingIngestResult::Updated { + old_head, new_head, .. + } => PullResult::Updated { + old_head: Some(old_head), + new_head, + }, + } + } +} + +impl From for PullResult { + fn from(other: TransformResult) -> Self { + match other { + TransformResult::UpToDate => PullResult::UpToDate(PullResultUpToDate::Transform), + TransformResult::Updated { old_head, new_head } => PullResult::Updated { + old_head: Some(old_head), + new_head, + }, + } + } +} + +impl From for PullResult { + fn from(other: SyncResult) -> Self { + match other { + SyncResult::UpToDate => PullResult::UpToDate(PullResultUpToDate::Sync), + SyncResult::Updated { + old_head, new_head, .. 
+ } => PullResult::Updated { old_head, new_head }, + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Errors +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum PullError { + #[error(transparent)] + NotFound( + #[from] + #[backtrace] + DatasetNotFoundError, + ), + #[error("Cannot choose between multiple pull aliases")] + AmbiguousSource, + #[error("{0}")] + InvalidOperation(String), + #[error(transparent)] + PollingIngestError( + #[from] + #[backtrace] + PollingIngestError, + ), + #[error(transparent)] + TransformError( + #[from] + #[backtrace] + TransformError, + ), + #[error(transparent)] + SyncError( + #[from] + #[backtrace] + SyncError, + ), + #[error(transparent)] + Access( + #[from] + #[backtrace] + AccessError, + ), + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/pull_service.rs b/src/domain/core/src/services/pull_service.rs deleted file mode 100644 index d18451b6dd..0000000000 --- a/src/domain/core/src/services/pull_service.rs +++ /dev/null @@ -1,335 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - -use std::sync::Arc; - -use ::serde::{Deserialize, Serialize}; -use chrono::{DateTime, Utc}; -use internal_error::InternalError; -use opendatafabric::*; -use thiserror::Error; - -use crate::auth::DatasetActionUnauthorizedError; -use crate::*; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Service -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[async_trait::async_trait] -pub trait PullService: Send + Sync { - async fn pull( - &self, - dataset_ref: &DatasetRefAny, - options: PullOptions, - listener: Option>, - ) -> Result; - - async fn pull_ext( - &self, - request: &PullRequest, - options: PullOptions, - listener: Option>, - ) -> Result; - - async fn pull_multi( - &self, - dataset_refs: Vec, - options: PullMultiOptions, - listener: Option>, - ) -> Result, InternalError>; - - async fn pull_multi_ext( - &self, - requests: Vec, - options: PullMultiOptions, - listener: Option>, - ) -> Result, InternalError>; - - /// Manually advances the watermark of a root dataset - async fn set_watermark( - &self, - dataset_ref: &DatasetRef, - watermark: DateTime, - ) -> Result; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct PullRequest { - pub local_ref: Option, - pub remote_ref: Option, -} - -impl PullRequest { - pub fn from_any_ref(dataset_ref: &DatasetRefAny, is_repo: impl Fn(&RepoName) -> bool) -> Self { - // Single-tenant workspace => treat all repo-like references as repos. 
- // Multi-tenant workspace => treat all repo-like references as accounts, use - // repo:// for repos - match dataset_ref.as_local_ref(is_repo) { - Ok(local_ref) => Self { - local_ref: Some(local_ref), - remote_ref: None, - }, - Err(remote_ref) => Self { - local_ref: None, - remote_ref: Some(remote_ref), - }, - } - } -} - -#[derive(Debug)] -pub struct PullResponse { - /// Parameters passed into the call. Empty for datasets that were pulled as - /// recursive dependencies. - pub original_request: Option, - /// Local dataset handle, if resolved - pub local_ref: Option, - /// Destination reference, if resolved - pub remote_ref: Option, - /// Result of the push operation - pub result: Result, -} - -#[derive(Debug, Clone)] -pub struct PullOptions { - /// Whether the datasets pulled from remotes should be permanently - /// associated with them - pub add_aliases: bool, - /// Ingest-specific options - pub ingest_options: PollingIngestOptions, - /// Sync-specific options, - pub sync_options: SyncOptions, - /// Run compaction of derivative dataset without saving data - /// if transformation failed due to root dataset compaction - pub reset_derivatives_on_diverged_input: bool, -} - -impl Default for PullOptions { - fn default() -> Self { - Self { - add_aliases: true, - reset_derivatives_on_diverged_input: false, - ingest_options: PollingIngestOptions::default(), - sync_options: SyncOptions::default(), - } - } -} - -#[derive(Debug, Clone)] -pub struct PullMultiOptions { - /// Pull all dataset dependencies recursively in depth-first order - pub recursive: bool, - /// Pull all known datasets - pub all: bool, - /// Whether the datasets pulled from remotes should be permanently - /// associated with them - pub add_aliases: bool, - /// Ingest-specific options - pub ingest_options: PollingIngestOptions, - /// Sync-specific options, - pub sync_options: SyncOptions, - /// Run compaction of all derivative datasets without saving data - /// if transformation fails due to root dataset compaction - pub reset_derivatives_on_diverged_input: bool, -} - -impl Default for PullMultiOptions { - fn default() -> Self { - Self { - recursive: false, - all: false, - add_aliases: true, - ingest_options: PollingIngestOptions::default(), - sync_options: SyncOptions::default(), - reset_derivatives_on_diverged_input: false, - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -pub trait PullListener: Send + Sync { - fn get_ingest_listener(self: Arc) -> Option>; - fn get_transform_listener(self: Arc) -> Option>; - fn get_sync_listener(self: Arc) -> Option>; -} - -pub trait PullMultiListener: Send + Sync { - fn get_ingest_listener(self: Arc) -> Option>; - fn get_transform_listener(self: Arc) -> Option>; - fn get_sync_listener(self: Arc) -> Option>; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] -pub enum PullResult { - UpToDate(PullResultUpToDate), - Updated { - old_head: Option, - new_head: Multihash, - }, -} - -#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] -pub enum PullResultUpToDate { - PollingIngest(PollingIngestResultUpToDate), - PushIngest(PushInsgestResultUpToDate), - Transform, - Sync, - SetWatermark, -} - -#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] -pub struct PollingIngestResultUpToDate { - pub uncacheable: bool, -} - -#[derive(Debug, Clone, Eq, PartialEq, 
Serialize, Deserialize)] -pub struct PushInsgestResultUpToDate { - pub uncacheable: bool, -} - -impl From for PullResult { - fn from(other: PollingIngestResult) -> Self { - match other { - PollingIngestResult::UpToDate { uncacheable, .. } => PullResult::UpToDate( - PullResultUpToDate::PollingIngest(PollingIngestResultUpToDate { uncacheable }), - ), - PollingIngestResult::Updated { - old_head, new_head, .. - } => PullResult::Updated { - old_head: Some(old_head), - new_head, - }, - } - } -} - -impl From for PullResult { - fn from(other: TransformResult) -> Self { - match other { - TransformResult::UpToDate => PullResult::UpToDate(PullResultUpToDate::Transform), - TransformResult::Updated { old_head, new_head } => PullResult::Updated { - old_head: Some(old_head), - new_head, - }, - } - } -} - -impl From for PullResult { - fn from(other: SyncResult) -> Self { - match other { - SyncResult::UpToDate => PullResult::UpToDate(PullResultUpToDate::Sync), - SyncResult::Updated { - old_head, new_head, .. - } => PullResult::Updated { old_head, new_head }, - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Errors -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Error)] -pub enum PullError { - #[error(transparent)] - NotFound( - #[from] - #[backtrace] - DatasetNotFoundError, - ), - #[error("Source is not specified and there is no associated pull alias")] - NoSource, - #[error("Cannot choose between multiple pull aliases")] - AmbiguousSource, - #[error("{0}")] - InvalidOperation(String), - #[error(transparent)] - PollingIngestError( - #[from] - #[backtrace] - PollingIngestError, - ), - #[error(transparent)] - TransformError( - #[from] - #[backtrace] - TransformError, - ), - #[error(transparent)] - SyncError( - #[from] - #[backtrace] - SyncError, - ), - #[error(transparent)] - Internal( - #[from] - #[backtrace] - InternalError, - ), -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Error)] -pub enum SetWatermarkError { - #[error(transparent)] - NotFound( - #[from] - #[backtrace] - DatasetNotFoundError, - ), - - #[error("Attempting to set watermark on a derivative dataset")] - IsDerivative, - - #[error("Attempting to set watermark on a remote dataset")] - IsRemote, - - #[error(transparent)] - Access( - #[from] - #[backtrace] - AccessError, - ), - - #[error(transparent)] - Internal( - #[from] - #[backtrace] - InternalError, - ), -} - -impl From for SetWatermarkError { - fn from(v: GetDatasetError) -> Self { - match v { - GetDatasetError::NotFound(e) => Self::NotFound(e), - GetDatasetError::Internal(e) => Self::Internal(e), - } - } -} - -impl From for SetWatermarkError { - fn from(v: DatasetActionUnauthorizedError) -> Self { - match v { - DatasetActionUnauthorizedError::Access(e) => Self::Access(e), - DatasetActionUnauthorizedError::Internal(e) => Self::Internal(e), - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/push_service.rs b/src/domain/core/src/services/push_request_planner.rs similarity index 67% rename from src/domain/core/src/services/push_service.rs rename to src/domain/core/src/services/push_request_planner.rs index 10b0cfa48c..e20c80e4b3 100644 --- a/src/domain/core/src/services/push_service.rs +++ 
b/src/domain/core/src/services/push_request_planner.rs @@ -7,14 +7,12 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use std::sync::Arc; - use internal_error::InternalError; use opendatafabric::*; use thiserror::Error; use super::sync_service::*; -use super::RepositoryNotFoundError; +use super::{RemoteTarget, RepositoryNotFoundError}; use crate::{DatasetNotFoundError, GetDatasetError}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -22,16 +20,35 @@ use crate::{DatasetNotFoundError, GetDatasetError}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[async_trait::async_trait] -pub trait PushService: Send + Sync { - async fn push_multi( +pub trait PushRequestPlanner: Send + Sync { + async fn collect_plan( &self, - dataset_refs: Vec, - options: PushMultiOptions, - sync_listener: Option>, - ) -> Vec; + dataset_handles: &[DatasetHandle], + push_target: Option<&DatasetPushTarget>, + ) -> (Vec, Vec); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Eq, PartialEq)] +pub struct PushItem { + pub local_handle: DatasetHandle, + pub remote_target: RemoteTarget, + pub push_target: Option, } -#[derive(Debug)] +impl PushItem { + pub fn as_response(&self, result: Result) -> PushResponse { + PushResponse { + local_handle: Some(self.local_handle.clone()), + target: self.push_target.clone(), + result: result.map_err(Into::into), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + pub struct PushResponse { /// Local dataset handle, if resolved pub local_handle: Option, @@ -52,6 +69,22 @@ impl std::fmt::Display for PushResponse { } } +impl std::fmt::Debug for PushResponse { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "PushResponse(local_handle={:?}, target={:?}, result=", + self.local_handle, self.target + )?; + match &self.result { + Ok(sync_result) => write!(f, "Ok({sync_result:?})"), + Err(e) => write!(f, "Err({e:?})"), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[derive(Debug, Clone)] pub struct PushMultiOptions { /// Push all dataset dependencies recursively in depth-first order @@ -122,3 +155,5 @@ impl From for PushError { } } } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/remote_aliases_registry.rs b/src/domain/core/src/services/remote_aliases_registry.rs index 6200032d19..c8d9a88b89 100644 --- a/src/domain/core/src/services/remote_aliases_registry.rs +++ b/src/domain/core/src/services/remote_aliases_registry.rs @@ -7,29 +7,26 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
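For illustration, a minimal self-contained sketch of the plan-then-execute flow suggested by `PushRequestPlanner::collect_plan` and `PushItem::as_response` above. All types, handles, and the URL below are simplified stand-ins, not the crate's real definitions.

#[derive(Debug)]
struct PushItem {
    local_handle: String,
    remote_url: String,
}

#[derive(Debug)]
struct PushResponse {
    local_handle: String,
    result: Result<(), String>,
}

impl PushItem {
    // Mirrors the role of `PushItem::as_response`: pair a planned item with its outcome.
    fn as_response(&self, result: Result<(), String>) -> PushResponse {
        PushResponse {
            local_handle: self.local_handle.clone(),
            result,
        }
    }
}

// Stand-in for the execution phase (the real code would invoke the sync machinery here).
fn push_one(item: &PushItem) -> Result<(), String> {
    println!("pushing {} -> {}", item.local_handle, item.remote_url);
    Ok(())
}

fn main() {
    // Phase 1: a planner resolves each local handle to a remote target up front.
    let plan = vec![PushItem {
        local_handle: "my.dataset".into(),
        remote_url: "https://example.org/my.dataset".into(),
    }];

    // Phase 2: execute the plan and fold each outcome back into a response.
    let responses: Vec<PushResponse> = plan.iter().map(|i| i.as_response(push_one(i))).collect();
    println!("{responses:?}");
}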
-use async_trait::async_trait; use internal_error::InternalError; -use opendatafabric::*; +use opendatafabric::{AccountName, DatasetHandle, DatasetName, DatasetPushTarget, RepoName}; use thiserror::Error; use crate::*; -#[async_trait] +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] pub trait RemoteAliasesRegistry: Send + Sync { async fn get_remote_aliases( &self, - dataset_ref: &DatasetRef, + dataset_handle: &DatasetHandle, ) -> Result, GetAliasesError>; } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[derive(Error, Debug)] pub enum GetAliasesError { - #[error(transparent)] - DatasetNotFound( - #[from] - #[backtrace] - DatasetNotFoundError, - ), #[error(transparent)] Internal( #[from] @@ -38,32 +35,25 @@ pub enum GetAliasesError { ), } -impl From for GetAliasesError { - fn from(v: GetDatasetError) -> Self { - match v { - GetDatasetError::NotFound(e) => Self::DatasetNotFound(e), - GetDatasetError::Internal(e) => Self::Internal(e), - } - } -} - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // RemoteAliasResolver //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[async_trait] +#[async_trait::async_trait] pub trait RemoteAliasResolver: Send + Sync { // Resolve remote push target. // Firstly try to resolve from AliasRegistry, if cannot do it // try to resolve via repository registry async fn resolve_push_target( &self, - local_dataset_handle: &DatasetHandle, + dataset_handle: &DatasetHandle, dataset_push_target_maybe: Option, ) -> Result; } -#[derive(Debug, Clone)] +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Eq, PartialEq)] pub struct RemoteTarget { pub url: url::Url, pub repo_name: Option, @@ -153,3 +143,5 @@ impl From for GetRemoteAccountError { Self::Internal(value) } } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/reset_service.rs b/src/domain/core/src/services/reset_service.rs index 51b3738d25..0ba45e205d 100644 --- a/src/domain/core/src/services/reset_service.rs +++ b/src/domain/core/src/services/reset_service.rs @@ -20,7 +20,7 @@ use crate::*; pub trait ResetService: Send + Sync { async fn reset_dataset( &self, - dataset_handle: &DatasetHandle, + target: ResolvedDataset, block_hash: Option<&Multihash>, old_head_maybe: Option<&Multihash>, ) -> Result; diff --git a/src/domain/core/src/services/sync_service.rs b/src/domain/core/src/services/sync_service.rs index ee25be4e82..d43cba9c10 100644 --- a/src/domain/core/src/services/sync_service.rs +++ b/src/domain/core/src/services/sync_service.rs @@ -12,6 +12,7 @@ use std::sync::Arc; use internal_error::{BoxedError, InternalError}; use opendatafabric::*; use thiserror::Error; +use url::Url; use crate::utils::metadata_chain_comparator::CompareChainsError; use crate::*; @@ -24,31 +25,77 @@ use crate::*; pub trait SyncService: Send + Sync { async fn sync( &self, - src: &DatasetRefAny, - dst: &DatasetRefAny, + request: SyncRequest, options: SyncOptions, listener: Option>, ) -> Result; - async fn sync_multi( - &self, - requests: Vec, - options: SyncOptions, - listener: Option>, - ) -> Vec; - /// Adds 
dataset to IPFS and returns the root CID. /// Unlike `sync` it does not do IPNS resolution and publishing. - async fn ipfs_add(&self, src: &DatasetRef) -> Result; + async fn ipfs_add(&self, src: ResolvedDataset) -> Result; } -#[derive(Debug, Clone)] +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] pub struct SyncRequest { - pub src: DatasetRefAny, - pub dst: DatasetRefAny, + pub src: SyncRef, + pub dst: SyncRef, } #[derive(Debug, Clone)] +pub enum SyncRef { + Local(ResolvedDataset), + LocalNew(DatasetAlias), + Remote(SyncRefRemote), +} + +impl SyncRef { + pub fn is_local(&self) -> bool { + match self { + Self::Local(_) | Self::LocalNew(_) => true, + Self::Remote(_) => false, + } + } + + // If remote, refers to resolved repository URL + pub fn as_internal_any_ref(&self) -> DatasetRefAny { + match self { + Self::Local(local_ref) => local_ref.get_handle().as_any_ref(), + Self::LocalNew(alias) => alias.as_any_ref(), + Self::Remote(remote_ref) => DatasetRefAny::Url(remote_ref.url.clone()), + } + } + + // If remote, returns the original unresolved ref + pub fn as_user_friendly_any_ref(&self) -> DatasetRefAny { + match self { + Self::Local(local_ref) => local_ref.get_handle().as_any_ref(), + Self::LocalNew(alias) => alias.as_any_ref(), + Self::Remote(remote_ref) => remote_ref.original_remote_ref.as_any_ref(), + } + } +} + +#[derive(Clone)] +pub struct SyncRefRemote { + pub url: Arc, + pub dataset: Arc, + pub original_remote_ref: DatasetRefRemote, +} + +impl std::fmt::Debug for SyncRefRemote { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SyncRefRemote") + .field("url", &self.url) + .field("original_remote_ref", &self.original_remote_ref) + .finish_non_exhaustive() + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Copy, Clone)] pub struct SyncOptions { /// Whether the source of data can be assumed non-malicious to skip hash sum /// and other expensive checks. 
Defaults to `true` when the source is @@ -76,6 +123,8 @@ impl Default for SyncOptions { } } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[derive(Debug, Clone, Eq, PartialEq)] pub enum SyncResult { UpToDate, @@ -86,13 +135,6 @@ pub enum SyncResult { }, } -#[derive(Debug)] -pub struct SyncResultMulti { - pub src: DatasetRefAny, - pub dst: DatasetRefAny, - pub result: Result, -} - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Listener //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -178,6 +220,8 @@ pub enum SyncError { #[error(transparent)] UnsupportedProtocol(#[from] UnsupportedProtocolError), #[error(transparent)] + UnsupportedIpfsStorageType(#[from] UnsupportedIpfsStorageTypeError), + #[error(transparent)] RepositoryNotFound( #[from] #[backtrace] @@ -210,6 +254,21 @@ pub enum SyncError { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[derive(Debug, Error)] +pub enum IpfsAddError { + #[error(transparent)] + UnsupportedIpfsStorageType(#[from] UnsupportedIpfsStorageTypeError), + + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[derive(Error, Clone, Eq, PartialEq, Debug)] #[error("Dataset {dataset_ref} not found")] pub struct DatasetNotFoundError { @@ -297,15 +356,6 @@ impl From for SyncError { } } -impl From for SyncError { - fn from(v: auth::DatasetActionUnauthorizedError) -> Self { - match v { - auth::DatasetActionUnauthorizedError::Access(e) => Self::Access(e), - auth::DatasetActionUnauthorizedError::Internal(e) => Self::Internal(e), - } - } -} - impl From for SyncError { fn from(v: GetRepoError) -> Self { match v { @@ -327,9 +377,28 @@ impl From for SyncError { impl From for SyncError { fn from(v: CompareChainsError) -> Self { match v { - CompareChainsError::Corrupted(e) => SyncError::Corrupted(e), - CompareChainsError::Access(e) => SyncError::Access(e), - CompareChainsError::Internal(e) => SyncError::Internal(e), + CompareChainsError::Corrupted(e) => Self::Corrupted(e), + CompareChainsError::Access(e) => Self::Access(e), + CompareChainsError::Internal(e) => Self::Internal(e), + } + } +} + +impl From for SyncError { + fn from(v: IpfsAddError) -> Self { + match v { + IpfsAddError::UnsupportedIpfsStorageType(e) => Self::UnsupportedIpfsStorageType(e), + IpfsAddError::Internal(e) => Self::Internal(e), } } } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Error, Debug)] +#[error("Dataset storage type '{}' is unsupported for IPFS operations", url.scheme())] +pub struct UnsupportedIpfsStorageTypeError { + pub url: Url, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/transform/mod.rs b/src/domain/core/src/services/transform/mod.rs new file mode 100644 index 0000000000..d7f598d24a --- /dev/null +++ b/src/domain/core/src/services/transform/mod.rs @@ -0,0 +1,20 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod transform_elaboration_service; +mod transform_execution_service; +mod transform_listener; +mod transform_request_planner; +mod transform_types; + +pub use transform_elaboration_service::*; +pub use transform_execution_service::*; +pub use transform_listener::*; +pub use transform_request_planner::*; +pub use transform_types::*; diff --git a/src/domain/core/src/services/transform/transform_elaboration_service.rs b/src/domain/core/src/services/transform/transform_elaboration_service.rs new file mode 100644 index 0000000000..572ff52d20 --- /dev/null +++ b/src/domain/core/src/services/transform/transform_elaboration_service.rs @@ -0,0 +1,84 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use internal_error::InternalError; +use thiserror::Error; + +use super::TransformPreliminaryPlan; +use crate::engine::TransformRequestExt; +use crate::{ + InputSchemaNotDefinedError, + InvalidInputIntervalError, + ResolvedDataset, + ResolvedDatasetsMap, + TransformListener, + TransformOptions, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait TransformElaborationService: Send + Sync { + async fn elaborate_transform( + &self, + target: ResolvedDataset, + plan: TransformPreliminaryPlan, + transform_options: TransformOptions, + maybe_listener: Option>, + ) -> Result; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub enum TransformElaboration { + Elaborated(TransformPlan), + UpToDate, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct TransformPlan { + pub request: TransformRequestExt, + pub datasets_map: ResolvedDatasetsMap, +} + +impl std::fmt::Debug for TransformPlan { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.request.fmt(f) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum TransformElaborateError { + #[error(transparent)] + InputSchemaNotDefined( + #[from] + #[backtrace] + InputSchemaNotDefinedError, + ), + #[error(transparent)] + InvalidInputInterval( + #[from] + #[backtrace] + InvalidInputIntervalError, + ), + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/transform/transform_execution_service.rs b/src/domain/core/src/services/transform/transform_execution_service.rs new file mode 100644 index 0000000000..199fd01002 --- /dev/null +++ b/src/domain/core/src/services/transform/transform_execution_service.rs @@ -0,0 +1,110 @@ +// 
Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use internal_error::InternalError; +use thiserror::Error; + +use super::TransformPlan; +use crate::engine::EngineError; +use crate::{ + CommitError, + DataNotReproducible, + EngineProvisioningError, + ResolvedDataset, + TransformListener, + TransformResult, + VerificationListener, + VerifyTransformOperation, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait TransformExecutionService: Send + Sync { + async fn execute_transform( + &self, + target: ResolvedDataset, + plan: TransformPlan, + maybe_listener: Option>, + ) -> ( + ResolvedDataset, + Result, + ); + + async fn execute_verify_transform( + &self, + target: ResolvedDataset, + verification_operation: VerifyTransformOperation, + maybe_listener: Option>, + ) -> Result<(), VerifyTransformExecuteError>; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum TransformExecuteError { + #[error(transparent)] + EngineProvisioningError( + #[from] + #[backtrace] + EngineProvisioningError, + ), + #[error(transparent)] + EngineError( + #[from] + #[backtrace] + EngineError, + ), + #[error(transparent)] + CommitError( + #[from] + #[backtrace] + CommitError, + ), + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum VerifyTransformExecuteError { + #[error(transparent)] + EngineProvisioningError( + #[from] + #[backtrace] + EngineProvisioningError, + ), + #[error(transparent)] + EngineError( + #[from] + #[backtrace] + EngineError, + ), + #[error("Data is not reproducible")] + DataNotReproducible( + #[from] + #[backtrace] + DataNotReproducible, + ), + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/transform/transform_listener.rs b/src/domain/core/src/services/transform/transform_listener.rs new file mode 100644 index 0000000000..3d411ad000 --- /dev/null +++ b/src/domain/core/src/services/transform/transform_listener.rs @@ -0,0 +1,46 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
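As a rough illustration of how the planning, elaboration, and execution services above fit together, the following self-contained sketch uses heavily simplified stand-in types; the real traits are async and carry listeners, options, and richer error types.

#[derive(Debug)]
struct TransformPreliminaryPlan; // produced by the request planner (simplified)

#[derive(Debug)]
struct TransformPlan; // produced by the elaboration step (simplified)

#[derive(Debug)]
enum TransformElaboration {
    Elaborated(TransformPlan),
    UpToDate,
}

#[derive(Debug)]
enum TransformResult {
    UpToDate,
    Updated,
}

// Stand-in for TransformElaborationService: decide whether there is anything to do.
fn elaborate(_plan: TransformPreliminaryPlan, has_new_input: bool) -> TransformElaboration {
    if has_new_input {
        TransformElaboration::Elaborated(TransformPlan)
    } else {
        TransformElaboration::UpToDate
    }
}

// Stand-in for TransformExecutionService: run the engine against an elaborated plan.
fn execute(_plan: TransformPlan) -> TransformResult {
    TransformResult::Updated
}

fn main() {
    let preliminary = TransformPreliminaryPlan;
    let result = match elaborate(preliminary, true) {
        TransformElaboration::Elaborated(plan) => execute(plan),
        TransformElaboration::UpToDate => TransformResult::UpToDate,
    };
    println!("{result:?}");
}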
+ +use std::sync::Arc; + +use opendatafabric::DatasetHandle; + +use super::{TransformElaborateError, TransformExecuteError, TransformResult}; +use crate::EngineProvisioningListener; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub trait TransformListener: Send + Sync { + fn begin(&self) {} + fn success(&self, _result: &TransformResult) {} + fn elaborate_error(&self, _error: &TransformElaborateError) {} + fn execute_error(&self, _error: &TransformExecuteError) {} + + fn get_engine_provisioning_listener( + self: Arc, + ) -> Option> { + None + } +} + +pub struct NullTransformListener; +impl TransformListener for NullTransformListener {} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub trait TransformMultiListener: Send + Sync { + fn begin_transform(&self, _dataset: &DatasetHandle) -> Option> { + None + } +} + +pub struct NullTransformMultiListener; +impl TransformMultiListener for NullTransformMultiListener {} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/transform/transform_request_planner.rs b/src/domain/core/src/services/transform/transform_request_planner.rs new file mode 100644 index 0000000000..3d9e547b1b --- /dev/null +++ b/src/domain/core/src/services/transform/transform_request_planner.rs @@ -0,0 +1,229 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
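The listener traits above follow a default-method plus null-object pattern, so implementors override only the callbacks they need. A reduced, self-contained mirror with toy method signatures (not the crate's actual ones):

trait TransformListener: Send + Sync {
    fn begin(&self) {}
    fn success(&self) {}
    fn execute_error(&self, _message: &str) {}
}

struct NullTransformListener;
impl TransformListener for NullTransformListener {} // keeps every default no-op

struct LoggingTransformListener;
impl TransformListener for LoggingTransformListener {
    fn success(&self) {
        println!("transform finished");
    }
    fn execute_error(&self, message: &str) {
        eprintln!("transform failed: {message}");
    }
}

fn notify(listener: &dyn TransformListener) {
    listener.begin(); // no-op unless overridden
    listener.success();
}

fn main() {
    notify(&NullTransformListener);
    notify(&LoggingTransformListener);
}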
+ +use chrono::{DateTime, Utc}; +use datafusion::arrow::datatypes::SchemaRef; +use internal_error::InternalError; +use opendatafabric::*; +use thiserror::Error; + +use crate::engine::TransformRequestExt; +use crate::*; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait TransformRequestPlanner: Send + Sync { + /// Returns an active transform, if any + async fn get_active_transform( + &self, + target: ResolvedDataset, + ) -> Result)>, InternalError>; + + async fn build_transform_preliminary_plan( + &self, + target: ResolvedDataset, + ) -> Result; + + async fn build_transform_verification_plan( + &self, + target: ResolvedDataset, + block_range: (Option, Option), + ) -> Result; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct TransformPreliminaryPlan { + pub preliminary_request: TransformPreliminaryRequestExt, + pub datasets_map: ResolvedDatasetsMap, +} + +impl std::fmt::Debug for TransformPreliminaryPlan { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.preliminary_request.fmt(f) + } +} + +#[derive(Debug, Clone)] +pub struct TransformPreliminaryRequestExt { + /// Randomly assigned value that identifies this specific engine operation + pub operation_id: String, + /// Identifies the output dataset + pub dataset_handle: DatasetHandle, + /// Block reference to advance upon commit + pub block_ref: BlockRef, + /// Current head (for concurrency control) + pub head: Multihash, + /// Transformation that will be applied to produce new data + pub transform: Transform, + /// System time to use for new records + pub system_time: DateTime, + /// Expected data schema (if already defined) + pub schema: Option, + /// Preceding record offset, if any + pub prev_offset: Option, + /// State of inputs + pub input_states: Vec<(TransformInput, Option)>, + /// Output dataset's vocabulary + pub vocab: DatasetVocabulary, + /// Previous checkpoint, if any + pub prev_checkpoint: Option, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct VerifyTransformStep { + pub request: TransformRequestExt, + pub expected_block: MetadataBlock, + pub expected_hash: Multihash, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct VerifyTransformOperation { + pub steps: Vec, + pub datasets_map: ResolvedDatasetsMap, +} + +impl std::fmt::Debug for VerifyTransformOperation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_list().entries(self.steps.iter()).finish() + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum TransformPlanError { + #[error(transparent)] + TransformNotDefined( + #[from] + #[backtrace] + TransformNotDefinedError, + ), + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +#[error("Dataset does not define a transform")] +pub struct TransformNotDefinedError {} + 
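The manual `Debug` implementations above either delegate to an inner request or use `debug_struct(..).finish_non_exhaustive()` to omit fields that are expensive or impossible to print. A small self-contained sketch of both patterns with toy types:

struct Request {
    operation_id: String,
}

struct Plan {
    request: Request,
    // Imagine a large, non-Debug field here (for example, a map of resolved datasets).
}

impl std::fmt::Debug for Request {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Request")
            .field("operation_id", &self.operation_id)
            .finish_non_exhaustive() // prints `Request { operation_id: "...", .. }`
    }
}

impl std::fmt::Debug for Plan {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Delegate to the inner request, as the plan types above do.
        self.request.fmt(f)
    }
}

fn main() {
    let plan = Plan {
        request: Request {
            operation_id: "op-123".into(),
        },
    };
    println!("{plan:?}");
}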
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum VerifyTransformPlanError { + #[error(transparent)] + DatasetNotFound( + #[from] + #[backtrace] + DatasetNotFoundError, + ), + #[error(transparent)] + RefNotFound( + #[from] + #[backtrace] + RefNotFoundError, + ), + #[error(transparent)] + BlockNotFound( + #[from] + #[backtrace] + BlockNotFoundError, + ), + #[error(transparent)] + BlockVersion( + #[from] + #[backtrace] + BlockVersionError, + ), + #[error(transparent)] + BlockMalformed( + #[from] + #[backtrace] + BlockMalformedError, + ), + #[error(transparent)] + InvalidInterval( + #[from] + #[backtrace] + InvalidIntervalError, + ), + #[error(transparent)] + InputSchemaNotDefined( + #[from] + #[backtrace] + InputSchemaNotDefinedError, + ), + #[error(transparent)] + InvalidInputInterval( + #[from] + #[backtrace] + InvalidInputIntervalError, + ), + #[error(transparent)] + Access( + #[from] + #[backtrace] + AccessError, + ), + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +impl From for VerifyTransformPlanError { + fn from(v: GetDatasetError) -> Self { + match v { + GetDatasetError::NotFound(e) => Self::DatasetNotFound(e), + GetDatasetError::Internal(e) => Self::Internal(e), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +impl From for VerifyTransformPlanError { + fn from(v: GetRefError) -> Self { + match v { + GetRefError::NotFound(e) => Self::RefNotFound(e), + GetRefError::Access(e) => Self::Access(e), + GetRefError::Internal(e) => Self::Internal(e), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +impl From for VerifyTransformPlanError { + fn from(v: GetBlockError) -> Self { + match v { + GetBlockError::NotFound(e) => Self::BlockNotFound(e), + GetBlockError::BlockVersion(e) => Self::BlockVersion(e), + GetBlockError::BlockMalformed(e) => Self::BlockMalformed(e), + GetBlockError::Access(e) => Self::Access(e), + GetBlockError::Internal(e) => Self::Internal(e), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/transform/transform_types.rs b/src/domain/core/src/services/transform/transform_types.rs new file mode 100644 index 0000000000..ab2461e714 --- /dev/null +++ b/src/domain/core/src/services/transform/transform_types.rs @@ -0,0 +1,110 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use internal_error::InternalError; +use opendatafabric::{DatasetHandle, DatasetID, Multihash}; +use thiserror::Error; + +use super::{ + TransformElaborateError, + TransformExecuteError, + TransformPlanError, + VerifyTransformExecuteError, + VerifyTransformPlanError, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// DTOs +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub enum TransformResult { + UpToDate, + Updated { + old_head: Multihash, + new_head: Multihash, + }, +} + +#[derive(Clone, Copy, Debug, Default)] +pub struct TransformOptions { + /// Run compaction of derivative datasets without saving data + /// if transformation fails due to root dataset compaction + pub reset_derivatives_on_diverged_input: bool, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Errors +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum TransformError { + #[error(transparent)] + Plan( + #[from] + #[backtrace] + TransformPlanError, + ), + #[error(transparent)] + Elaborate( + #[from] + #[backtrace] + TransformElaborateError, + ), + #[error(transparent)] + Execute( + #[from] + #[backtrace] + TransformExecuteError, + ), + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum VerifyTransformError { + #[error(transparent)] + Plan( + #[from] + #[backtrace] + VerifyTransformPlanError, + ), + #[error(transparent)] + Execute( + #[from] + #[backtrace] + VerifyTransformExecuteError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +#[error("Dataset {dataset_handle} has not defined a schema yet")] +pub struct InputSchemaNotDefinedError { + pub dataset_handle: DatasetHandle, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Error, Debug)] +#[error("Invalid block interval [{head}, {tail}) in input dataset '{input_dataset_id}'")] +pub struct InvalidInputIntervalError { + pub input_dataset_id: DatasetID, + pub head: Multihash, + pub tail: Multihash, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/transform_service.rs b/src/domain/core/src/services/transform_service.rs deleted file mode 100644 index 0a61f20c07..0000000000 --- a/src/domain/core/src/services/transform_service.rs +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
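`TransformError` above wraps the phase-specific errors with `#[error(transparent)]` and `#[from]`, so `?` can lift them into the umbrella enum. A reduced, compilable mirror of that shape, assuming only the `thiserror` crate (already used throughout this patch); the variant set and messages are trimmed for illustration:

use thiserror::Error;

#[derive(Debug, Error)]
#[error("transform planning failed")]
struct PlanError;

#[derive(Debug, Error)]
#[error("transform execution failed")]
struct ExecuteError;

// Umbrella enum wrapping phase-specific errors, in the style of TransformError.
#[derive(Debug, Error)]
enum TransformError {
    #[error(transparent)]
    Plan(#[from] PlanError),
    #[error(transparent)]
    Execute(#[from] ExecuteError),
}

fn plan(fail: bool) -> Result<(), PlanError> {
    if fail { Err(PlanError) } else { Ok(()) }
}

fn execute() -> Result<(), ExecuteError> {
    Ok(())
}

fn run(fail_planning: bool) -> Result<(), TransformError> {
    plan(fail_planning)?; // `#[from]` lets `?` lift PlanError into TransformError::Plan
    execute()?;
    Ok(())
}

fn main() {
    println!("{:?}", run(false)); // Ok(())
    println!("{:?}", run(true));  // Err(Plan(PlanError))
}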
- -use std::sync::Arc; - -use internal_error::InternalError; -use opendatafabric::*; -use thiserror::Error; - -use crate::engine::EngineError; -use crate::*; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Service -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[async_trait::async_trait] -pub trait TransformService: Send + Sync { - /// Returns an active transform, if any - async fn get_active_transform( - &self, - dataset_ref: &DatasetRef, - ) -> Result)>, GetDatasetError>; - - async fn transform( - &self, - dataset_ref: &DatasetRef, - transform_options: TransformOptions, - listener: Option>, - ) -> Result; - - async fn transform_multi( - &self, - dataset_refs: Vec, - transform_options: TransformOptions, - listener: Option>, - ) -> Vec<(DatasetRef, Result)>; - - async fn verify_transform( - &self, - dataset_ref: &DatasetRef, - block_range: (Option, Option), - listener: Option>, - ) -> Result<(), VerificationError>; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// DTOs -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug)] -pub enum TransformResult { - UpToDate, - Updated { - old_head: Multihash, - new_head: Multihash, - }, -} - -#[derive(Clone, Copy, Debug, Default)] -pub struct TransformOptions { - /// Run compaction of derivative datasets without saving data - /// if transformation fails due to root dataset compaction - pub reset_derivatives_on_diverged_input: bool, -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Listeners -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -pub trait TransformListener: Send + Sync { - fn begin(&self) {} - fn success(&self, _result: &TransformResult) {} - fn error(&self, _error: &TransformError) {} - - fn get_engine_provisioning_listener( - self: Arc, - ) -> Option> { - None - } -} - -pub struct NullTransformListener; -impl TransformListener for NullTransformListener {} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -pub trait TransformMultiListener: Send + Sync { - fn begin_transform(&self, _dataset: &DatasetHandle) -> Option> { - None - } -} - -pub struct NullTransformMultiListener; -impl TransformMultiListener for NullTransformMultiListener {} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Errors -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Error)] -pub enum TransformError { - #[error(transparent)] - DatasetNotFound( - #[from] - #[backtrace] - DatasetNotFoundError, - ), - #[error(transparent)] - TransformNotDefined( - #[from] - #[backtrace] - TransformNotDefinedError, - ), - #[error(transparent)] - InputSchemaNotDefined( - #[from] - #[backtrace] - InputSchemaNotDefinedError, - ), - #[error(transparent)] - EngineProvisioningError( - #[from] - #[backtrace] - EngineProvisioningError, - ), - #[error(transparent)] - EngineError( - #[from] - #[backtrace] - EngineError, - ), - #[error(transparent)] - CommitError( - #[from] - 
#[backtrace] - CommitError, - ), - #[error(transparent)] - Access( - #[from] - #[backtrace] - AccessError, - ), - #[error(transparent)] - InvalidInputInterval( - #[from] - #[backtrace] - InvalidInputIntervalError, - ), - #[error(transparent)] - Internal( - #[from] - #[backtrace] - InternalError, - ), -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, thiserror::Error)] -#[error("Dataset does not define a transform")] -pub struct TransformNotDefinedError {} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, thiserror::Error)] -#[error("Dataset {dataset_handle} has not defined a schema yet")] -pub struct InputSchemaNotDefinedError { - pub dataset_handle: DatasetHandle, -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Error, Debug)] -#[error("Invalid block interval [{head}, {tail}) in input dataset '{input_dataset_id}'")] -pub struct InvalidInputIntervalError { - pub input_dataset_id: DatasetID, - pub head: Multihash, - pub tail: Multihash, -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -impl From for TransformError { - fn from(v: GetDatasetError) -> Self { - match v { - GetDatasetError::NotFound(e) => Self::DatasetNotFound(e), - GetDatasetError::Internal(e) => Self::Internal(e), - } - } -} - -impl From for TransformError { - fn from(v: auth::DatasetActionUnauthorizedError) -> Self { - match v { - auth::DatasetActionUnauthorizedError::Access(e) => Self::Access(e), - auth::DatasetActionUnauthorizedError::Internal(e) => Self::Internal(e), - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/verification_service.rs b/src/domain/core/src/services/verification_service.rs index 69fcc7e17c..ad0609a577 100644 --- a/src/domain/core/src/services/verification_service.rs +++ b/src/domain/core/src/services/verification_service.rs @@ -24,16 +24,13 @@ use crate::*; pub trait VerificationService: Send + Sync { async fn verify( &self, - dataset_ref: &DatasetRef, - block_range: (Option, Option), - options: VerificationOptions, + request: VerificationRequest, listener: Option>, ) -> VerificationResult; async fn verify_multi( &self, - requests: Vec, - options: VerificationOptions, + requests: Vec>, listener: Option>, ) -> Vec; } @@ -42,10 +39,11 @@ pub trait VerificationService: Send + Sync { // DTOs //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[derive(Debug, Clone)] -pub struct VerificationRequest { - pub dataset_ref: DatasetRef, +#[derive(Clone, Debug)] +pub struct VerificationRequest { + pub target: TTarget, pub block_range: (Option, Option), + pub options: VerificationOptions, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -123,6 +121,7 @@ pub trait VerificationListener: Send + Sync { fn begin(&self) {} fn success(&self, _result: &VerificationResult) {} fn error(&self, _error: &VerificationError) {} + fn transform_error(&self, _error: &VerifyTransformExecuteError) {} fn begin_phase(&self, _phase: VerificationPhase) {} fn end_phase(&self, _phase: VerificationPhase) {} @@ -224,10 
+223,10 @@ pub enum VerificationError { CheckpointDoesNotMatchMetadata, ), #[error(transparent)] - Transform( + VerifyTransform( #[from] #[backtrace] - TransformError, + VerifyTransformError, ), #[error(transparent)] Access( diff --git a/src/domain/core/src/services/watermark_service.rs b/src/domain/core/src/services/watermark_service.rs new file mode 100644 index 0000000000..2e35d9310c --- /dev/null +++ b/src/domain/core/src/services/watermark_service.rs @@ -0,0 +1,95 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use chrono::{DateTime, Utc}; +use internal_error::InternalError; +use opendatafabric::Multihash; +use thiserror::Error; + +use crate::auth::DatasetActionUnauthorizedError; +use crate::{AccessError, ResolvedDataset}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait WatermarkService: Send + Sync { + /// Attempt reading watermark that is currently associated with a dataset + async fn try_get_current_watermark( + &self, + dataset: ResolvedDataset, + ) -> Result>, GetWatermarkError>; + + /// Manually advances the watermark of a root dataset + async fn set_watermark( + &self, + target: ResolvedDataset, + new_watermark: DateTime, + ) -> Result; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub enum SetWatermarkResult { + UpToDate, + Updated { + old_head: Option, + new_head: Multihash, + }, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum GetWatermarkError { + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub enum SetWatermarkError { + #[error("Attempting to set watermark on a derivative dataset")] + IsDerivative, + + #[error("Attempting to set watermark on a remote dataset")] + IsRemote, + + #[error(transparent)] + Access( + #[from] + #[backtrace] + AccessError, + ), + + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +impl From for SetWatermarkError { + fn from(v: DatasetActionUnauthorizedError) -> Self { + match v { + DatasetActionUnauthorizedError::Access(e) => Self::Access(e), + DatasetActionUnauthorizedError::Internal(e) => Self::Internal(e), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/testing/mock_dataset_repository.rs b/src/domain/core/src/testing/mock_dataset_repository.rs index 73b92e966b..8de7949b75 100644 --- a/src/domain/core/src/testing/mock_dataset_repository.rs +++ b/src/domain/core/src/testing/mock_dataset_repository.rs @@ -10,44 +10,24 @@ use std::sync::Arc; use opendatafabric::{AccountName, DatasetHandle, DatasetRef}; -use url::Url; -use crate::{ - Dataset, - 
DatasetHandleStream, - DatasetRegistry, - DatasetRepository, - GetDatasetError, - GetDatasetUrlError, -}; +use crate::{Dataset, DatasetHandleStream, DatasetRepository, GetDatasetError}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// mockall::mock! { pub DatasetRepository {} - #[async_trait::async_trait] - impl DatasetRegistry for DatasetRepository { - async fn get_dataset_url(&self, dataset_ref: &DatasetRef) -> Result; - } - #[async_trait::async_trait] impl DatasetRepository for DatasetRepository { - fn is_multi_tenant(&self) -> bool; - - async fn resolve_dataset_ref( + async fn resolve_dataset_handle_by_ref( &self, dataset_ref: &DatasetRef, ) -> Result; - fn get_all_datasets(&self) -> DatasetHandleStream<'_>; + fn all_dataset_handles(&self) -> DatasetHandleStream<'_>; - fn get_datasets_by_owner(&self, account_name: &AccountName) -> DatasetHandleStream<'_>; - - async fn find_dataset_by_ref( - &self, - dataset_ref: &DatasetRef, - ) -> Result, GetDatasetError>; + fn all_dataset_handles_by_owner(&self, account_name: &AccountName) -> DatasetHandleStream<'_>; fn get_dataset_by_handle(&self, dataset_handle: &DatasetHandle) -> Arc; } diff --git a/src/domain/core/src/use_cases/compact_dataset_use_case.rs b/src/domain/core/src/use_cases/compact_dataset_use_case.rs new file mode 100644 index 0000000000..43355b65c2 --- /dev/null +++ b/src/domain/core/src/use_cases/compact_dataset_use_case.rs @@ -0,0 +1,42 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
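The `mockall::mock!` block above generates a `MockDatasetRepository` for tests. A reduced, self-contained example of the same pattern, using a toy trait rather than the crate's `DatasetRepository` (assumes the `mockall` crate, which this repository already uses):

pub trait Repository: Send + Sync {
    fn resolve_handle(&self, dataset_name: &str) -> Option<String>;
}

mockall::mock! {
    pub Repository {}

    impl Repository for Repository {
        fn resolve_handle(&self, dataset_name: &str) -> Option<String>;
    }
}

fn main() {
    let mut repo = MockRepository::new();
    // Program the expectation: every lookup succeeds with a synthetic handle.
    repo.expect_resolve_handle()
        .returning(|name| Some(format!("handle-of-{name}")));

    assert_eq!(
        repo.resolve_handle("orders"),
        Some("handle-of-orders".to_string())
    );
    println!("mock satisfied");
}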
+ +use std::sync::Arc; + +use opendatafabric::DatasetHandle; + +use crate::{ + CompactionError, + CompactionListener, + CompactionMultiListener, + CompactionOptions, + CompactionResponse, + CompactionResult, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait CompactDatasetUseCase: Send + Sync { + async fn execute( + &self, + dataset_handle: &DatasetHandle, + options: CompactionOptions, + maybe_listener: Option>, + ) -> Result; + + async fn execute_multi( + &self, + dataset_handles: Vec, + options: CompactionOptions, + multi_listener: Option>, + ) -> Vec; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/use_cases/mod.rs b/src/domain/core/src/use_cases/mod.rs index be7db68249..27f41b70b2 100644 --- a/src/domain/core/src/use_cases/mod.rs +++ b/src/domain/core/src/use_cases/mod.rs @@ -9,14 +9,26 @@ mod append_dataset_metadata_batch_use_case; mod commit_dataset_event_use_case; +mod compact_dataset_use_case; mod create_dataset_from_snapshot_use_case; mod create_dataset_use_case; mod delete_dataset_use_case; +mod pull_dataset_use_case; +mod push_dataset_use_case; mod rename_dataset_use_case; +mod reset_dataset_use_case; +mod set_watermark_use_case; +mod verify_dataset_use_case; pub use append_dataset_metadata_batch_use_case::*; pub use commit_dataset_event_use_case::*; +pub use compact_dataset_use_case::*; pub use create_dataset_from_snapshot_use_case::*; pub use create_dataset_use_case::*; pub use delete_dataset_use_case::*; +pub use pull_dataset_use_case::*; +pub use push_dataset_use_case::*; pub use rename_dataset_use_case::*; +pub use reset_dataset_use_case::*; +pub use set_watermark_use_case::*; +pub use verify_dataset_use_case::*; diff --git a/src/domain/core/src/use_cases/pull_dataset_use_case.rs b/src/domain/core/src/use_cases/pull_dataset_use_case.rs new file mode 100644 index 0000000000..8cff9c42ee --- /dev/null +++ b/src/domain/core/src/use_cases/pull_dataset_use_case.rs @@ -0,0 +1,41 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
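The new use-case traits such as `CompactDatasetUseCase` above let callers depend on a narrow `dyn` trait instead of the underlying services. A toy, self-contained sketch of that wiring; it assumes the `async-trait` and `tokio` crates (both already used in this workspace), and the trait and impl names here are illustrative only:

use std::sync::Arc;

#[async_trait::async_trait]
trait CompactDatasetUseCase: Send + Sync {
    async fn execute(&self, dataset: &str) -> Result<String, String>;
}

struct CompactDatasetUseCaseImpl;

#[async_trait::async_trait]
impl CompactDatasetUseCase for CompactDatasetUseCaseImpl {
    async fn execute(&self, dataset: &str) -> Result<String, String> {
        Ok(format!("compacted {dataset}"))
    }
}

// A command only sees the use-case trait, never the services behind it.
async fn run_command(use_case: Arc<dyn CompactDatasetUseCase>) {
    println!("{:?}", use_case.execute("my.dataset").await);
}

#[tokio::main]
async fn main() {
    run_command(Arc::new(CompactDatasetUseCaseImpl)).await;
}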
+ +use std::sync::Arc; + +use internal_error::InternalError; + +use crate::{PullListener, PullMultiListener, PullOptions, PullRequest, PullResponse}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait PullDatasetUseCase: Send + Sync { + async fn execute( + &self, + request: PullRequest, + options: PullOptions, + listener: Option>, + ) -> Result; + + async fn execute_multi( + &self, + requests: Vec, + options: PullOptions, + listener: Option>, + ) -> Result, InternalError>; + + async fn execute_all_owned( + &self, + options: PullOptions, + listener: Option>, + ) -> Result, InternalError>; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/use_cases/push_dataset_use_case.rs b/src/domain/core/src/use_cases/push_dataset_use_case.rs new file mode 100644 index 0000000000..be62acf572 --- /dev/null +++ b/src/domain/core/src/use_cases/push_dataset_use_case.rs @@ -0,0 +1,29 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use internal_error::InternalError; +use opendatafabric::DatasetHandle; + +use crate::{PushMultiOptions, PushResponse, SyncMultiListener}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait PushDatasetUseCase: Send + Sync { + async fn execute_multi( + &self, + dataset_handles: Vec, + options: PushMultiOptions, + sync_listener: Option>, + ) -> Result, InternalError>; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/use_cases/reset_dataset_use_case.rs b/src/domain/core/src/use_cases/reset_dataset_use_case.rs new file mode 100644 index 0000000000..edf19b0b09 --- /dev/null +++ b/src/domain/core/src/use_cases/reset_dataset_use_case.rs @@ -0,0 +1,26 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
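Callers of `PullDatasetUseCase` above pass the option structs introduced earlier in this patch (for example `PullOptions`, whose `Default` impl is shown there); a caller might override a single field with struct-update syntax, as in this sketch with a simplified stand-in type:

#[derive(Debug, Clone)]
struct PullOptions {
    recursive: bool,
    add_aliases: bool,
}

impl Default for PullOptions {
    fn default() -> Self {
        Self {
            recursive: false,
            add_aliases: true,
        }
    }
}

fn main() {
    // Recursive pull, keeping every other option at its default.
    let options = PullOptions {
        recursive: true,
        ..Default::default()
    };
    println!("{options:?}");
}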
+ +use opendatafabric::{DatasetHandle, Multihash}; + +use crate::ResetError; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait ResetDatasetUseCase: Send + Sync { + async fn execute( + &self, + dataset_handle: &DatasetHandle, + maybe_new_head: Option<&Multihash>, + maybe_old_head: Option<&Multihash>, + ) -> Result; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/use_cases/set_watermark_use_case.rs b/src/domain/core/src/use_cases/set_watermark_use_case.rs new file mode 100644 index 0000000000..e5cd0cdcc3 --- /dev/null +++ b/src/domain/core/src/use_cases/set_watermark_use_case.rs @@ -0,0 +1,26 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use chrono::{DateTime, Utc}; +use opendatafabric::DatasetHandle; + +use crate::{SetWatermarkError, SetWatermarkResult}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait SetWatermarkUseCase: Send + Sync { + async fn execute( + &self, + dataset_handle: &DatasetHandle, + new_watermark: DateTime, + ) -> Result; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/use_cases/verify_dataset_use_case.rs b/src/domain/core/src/use_cases/verify_dataset_use_case.rs new file mode 100644 index 0000000000..0251f08bde --- /dev/null +++ b/src/domain/core/src/use_cases/verify_dataset_use_case.rs @@ -0,0 +1,38 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
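A caller of `SetWatermarkUseCase` above might branch on the `SetWatermarkResult` introduced by the new watermark service. A sketch with a simplified stand-in enum (the real variants carry `Multihash` heads; the strings below are placeholders):

#[derive(Debug)]
enum SetWatermarkResult {
    UpToDate,
    Updated {
        old_head: Option<String>,
        new_head: String,
    },
}

fn report(result: SetWatermarkResult) {
    match result {
        SetWatermarkResult::UpToDate => println!("Watermark is already up to date"),
        SetWatermarkResult::Updated { old_head, new_head } => {
            println!("Advanced watermark: {old_head:?} -> {new_head}");
        }
    }
}

fn main() {
    report(SetWatermarkResult::UpToDate);
    report(SetWatermarkResult::Updated {
        old_head: None,
        new_head: "new-head-hash".into(),
    });
}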
+
+use std::sync::Arc;
+
+use opendatafabric::DatasetHandle;
+
+use crate::{
+    VerificationListener,
+    VerificationMultiListener,
+    VerificationRequest,
+    VerificationResult,
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#[async_trait::async_trait]
+pub trait VerifyDatasetUseCase: Send + Sync {
+    async fn execute(
+        &self,
+        request: VerificationRequest<DatasetHandle>,
+        maybe_listener: Option<Arc<dyn VerificationListener>>,
+    ) -> VerificationResult;
+
+    async fn execute_multi(
+        &self,
+        requests: Vec<VerificationRequest<DatasetHandle>>,
+        maybe_multi_listener: Option<Arc<dyn VerificationMultiListener>>,
+    ) -> Vec<VerificationResult>;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/domain/core/src/utils/mod.rs b/src/domain/core/src/utils/mod.rs
index c728a2d5b9..9afc80cb08 100644
--- a/src/domain/core/src/utils/mod.rs
+++ b/src/domain/core/src/utils/mod.rs
@@ -10,3 +10,6 @@
 pub mod metadata_chain_comparator;
 pub mod owned_file;
 pub mod paths;
+
+mod tenancy_config;
+pub use tenancy_config::*;
diff --git a/src/domain/core/src/utils/tenancy_config.rs b/src/domain/core/src/utils/tenancy_config.rs
new file mode 100644
index 0000000000..ce9d8c2b5d
--- /dev/null
+++ b/src/domain/core/src/utils/tenancy_config.rs
@@ -0,0 +1,24 @@
+// Copyright Kamu Data, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub enum TenancyConfig {
+    SingleTenant,
+    MultiTenant,
+}
+
+impl Default for TenancyConfig {
+    fn default() -> Self {
+        Self::SingleTenant
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/domain/datasets/domain/Cargo.toml b/src/domain/datasets/domain/Cargo.toml
index bf6c86baf0..7cb737c0f5 100644
--- a/src/domain/datasets/domain/Cargo.toml
+++ b/src/domain/datasets/domain/Cargo.toml
@@ -40,6 +40,7 @@ secrecy = "0.10"
 serde = "1"
 serde_with = { version = "3", default-features = false }
 thiserror = { version = "1", default-features = false }
+tokio-stream = "0.1"
 uuid = { version = "1", default-features = false, features = ["v4"] }

 # Optional
diff --git a/src/domain/datasets/domain/src/repos/dataset_entry_repository.rs b/src/domain/datasets/domain/src/repos/dataset_entry_repository.rs
index 5234cf118f..f7ec4a80b0 100644
--- a/src/domain/datasets/domain/src/repos/dataset_entry_repository.rs
+++ b/src/domain/datasets/domain/src/repos/dataset_entry_repository.rs
@@ -7,6 +7,7 @@
 // the Business Source License, use of this software will be governed
 // by the Apache License, Version 2.0.
+use database_common::PaginationOpts; use internal_error::InternalError; use opendatafabric::{AccountID, DatasetID, DatasetName}; use thiserror::Error; @@ -18,23 +19,36 @@ use crate::DatasetEntry; #[cfg_attr(any(feature = "testing", test), mockall::automock)] #[async_trait::async_trait] pub trait DatasetEntryRepository: Send + Sync { - async fn dataset_entries_count(&self) -> Result; + async fn dataset_entries_count(&self) -> Result; + + async fn dataset_entries_count_by_owner_id( + &self, + owner_id: &AccountID, + ) -> Result; + + fn get_dataset_entries(&self, pagination: PaginationOpts) -> DatasetEntryStream<'_>; + + fn get_dataset_entries_by_owner_id( + &self, + owner_id: &AccountID, + pagination: PaginationOpts, + ) -> DatasetEntryStream<'_>; async fn get_dataset_entry( &self, dataset_id: &DatasetID, ) -> Result; - async fn get_dataset_entry_by_name( + async fn get_multiple_dataset_entries( &self, - owner_id: &AccountID, - name: &DatasetName, - ) -> Result; + dataset_ids: &[DatasetID], + ) -> Result; - async fn get_dataset_entries_by_owner_id( + async fn get_dataset_entry_by_owner_and_name( &self, owner_id: &AccountID, - ) -> Result, GetDatasetEntriesByOwnerIdError>; + name: &DatasetName, + ) -> Result; async fn save_dataset_entry( &self, @@ -55,6 +69,20 @@ pub trait DatasetEntryRepository: Send + Sync { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +pub type DatasetEntryStream<'a> = std::pin::Pin< + Box> + Send + 'a>, +>; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default, Debug, Eq, PartialEq)] +pub struct DatasetEntriesResolution { + pub resolved_entries: Vec, + pub unresolved_entries: Vec, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[derive(Error, Debug)] pub enum GetDatasetEntryError { #[error(transparent)] @@ -64,6 +92,12 @@ pub enum GetDatasetEntryError { Internal(#[from] InternalError), } +#[derive(Error, Debug)] +pub enum GetMultipleDatasetEntriesError { + #[error(transparent)] + Internal(#[from] InternalError), +} + #[derive(Error, Debug)] #[error("Dataset entry with dataset_id '{dataset_id}' not found")] pub struct DatasetEntryNotFoundError { @@ -105,14 +139,6 @@ impl DatasetEntryByNameNotFoundError { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[derive(Error, Debug)] -pub enum GetDatasetEntriesByOwnerIdError { - #[error(transparent)] - Internal(#[from] InternalError), -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - #[derive(Error, Debug)] pub enum SaveDatasetEntryError { #[error(transparent)] diff --git a/src/domain/datasets/domain/src/services/dataset_entry_service.rs b/src/domain/datasets/domain/src/services/dataset_entry_service.rs new file mode 100644 index 0000000000..fb04f2accd --- /dev/null +++ b/src/domain/datasets/domain/src/services/dataset_entry_service.rs @@ -0,0 +1,48 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+
+use database_common::PaginationOpts;
+use internal_error::InternalError;
+use opendatafabric::AccountID;
+use thiserror::Error;
+
+use crate::DatasetEntry;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#[async_trait::async_trait]
+pub trait DatasetEntryService: Sync + Send {
+    async fn list_all_entries(
+        &self,
+        pagination: PaginationOpts,
+    ) -> Result<DatasetEntryListing, ListDatasetEntriesError>;
+
+    async fn list_entries_owned_by(
+        &self,
+        owner_id: AccountID,
+        pagination: PaginationOpts,
+    ) -> Result<DatasetEntryListing, ListDatasetEntriesError>;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+pub struct DatasetEntryListing {
+    pub list: Vec<DatasetEntry>,
+    pub total_count: usize,
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#[derive(Error, Debug)]
+pub enum ListDatasetEntriesError {
+    #[error(transparent)]
+    Internal(#[from] InternalError),
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/domain/datasets/domain/src/services/mod.rs b/src/domain/datasets/domain/src/services/mod.rs
index 81bbed6345..a0b3ec926a 100644
--- a/src/domain/datasets/domain/src/services/mod.rs
+++ b/src/domain/datasets/domain/src/services/mod.rs
@@ -7,8 +7,10 @@
 // the Business Source License, use of this software will be governed
 // by the Apache License, Version 2.0.

+mod dataset_entry_service;
 mod dataset_env_var_service;
 mod dataset_key_value_service;

+pub use dataset_entry_service::*;
 pub use dataset_env_var_service::*;
 pub use dataset_key_value_service::*;
diff --git a/src/domain/datasets/services/Cargo.toml b/src/domain/datasets/services/Cargo.toml
index dfbd2bf27e..c1c36c1360 100644
--- a/src/domain/datasets/services/Cargo.toml
+++ b/src/domain/datasets/services/Cargo.toml
@@ -32,6 +32,7 @@ messaging-outbox = { workspace = true }
 opendatafabric = { workspace = true }
 time-source = { workspace = true }

+async-stream = "0.3"
 async-trait = { version = "0.1", default-features = false }
 chrono = { version = "0.4", default-features = false }
 dill = "0.9"
diff --git a/src/domain/datasets/services/src/dataset_entry_indexer.rs b/src/domain/datasets/services/src/dataset_entry_indexer.rs
index 8e53d53de6..e4bf2d9b55 100644
--- a/src/domain/datasets/services/src/dataset_entry_indexer.rs
+++ b/src/domain/datasets/services/src/dataset_entry_indexer.rs
@@ -80,7 +80,11 @@ impl DatasetEntryIndexer {
     async fn index_datasets(&self) -> Result<(), InternalError> {
         use futures::TryStreamExt;

-        let dataset_handles: Vec<_> = self.dataset_repo.get_all_datasets().try_collect().await?;
+        let dataset_handles: Vec<_> = self
+            .dataset_repo
+            .all_dataset_handles()
+            .try_collect()
+            .await?;

         let account_name_id_mapping = self.build_account_name_id_mapping(&dataset_handles).await?;

diff --git a/src/domain/datasets/services/src/dataset_entry_service.rs b/src/domain/datasets/services/src/dataset_entry_service.rs
deleted file mode 100644
index e27a7ed188..0000000000
--- a/src/domain/datasets/services/src/dataset_entry_service.rs
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright Kamu Data, Inc. and contributors. All rights reserved.
-//
-// Use of this software is governed by the Business Source License
-// included in the LICENSE file.
-// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - -use std::sync::Arc; - -use dill::{component, interface, meta, Catalog}; -use internal_error::{InternalError, ResultIntoInternal}; -use kamu_core::{ - DatasetLifecycleMessage, - DatasetLifecycleMessageCreated, - DatasetLifecycleMessageDeleted, - DatasetLifecycleMessageRenamed, - MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, -}; -use kamu_datasets::{DatasetEntry, DatasetEntryRepository}; -use messaging_outbox::{ - MessageConsumer, - MessageConsumerMeta, - MessageConsumerT, - MessageConsumptionDurability, -}; -use time_source::SystemTimeSource; - -use crate::MESSAGE_CONSUMER_KAMU_DATASET_ENTRY_SERVICE; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -pub struct DatasetEntryService { - dataset_entry_repo: Arc, - time_source: Arc, -} - -#[component(pub)] -#[interface(dyn MessageConsumer)] -#[interface(dyn MessageConsumerT)] -#[meta(MessageConsumerMeta { - consumer_name: MESSAGE_CONSUMER_KAMU_DATASET_ENTRY_SERVICE, - feeding_producers: &[ - MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, - ], - durability: MessageConsumptionDurability::Durable, -})] -impl DatasetEntryService { - pub fn new( - dataset_entry_repo: Arc, - time_source: Arc, - ) -> Self { - Self { - dataset_entry_repo, - time_source, - } - } - - async fn handle_dataset_lifecycle_created_message( - &self, - DatasetLifecycleMessageCreated { - dataset_id, - owner_account_id, - dataset_name, - .. - }: &DatasetLifecycleMessageCreated, - ) -> Result<(), InternalError> { - let entry = DatasetEntry::new( - dataset_id.clone(), - owner_account_id.clone(), - dataset_name.clone(), - self.time_source.now(), - ); - - self.dataset_entry_repo - .save_dataset_entry(&entry) - .await - .int_err() - } - - async fn handle_dataset_lifecycle_deleted_message( - &self, - DatasetLifecycleMessageDeleted { dataset_id, .. }: &DatasetLifecycleMessageDeleted, - ) -> Result<(), InternalError> { - self.dataset_entry_repo - .delete_dataset_entry(dataset_id) - .await - .int_err() - } - - async fn handle_dataset_lifecycle_renamed_message( - &self, - DatasetLifecycleMessageRenamed { - dataset_id, - new_dataset_name, - .. 
- }: &DatasetLifecycleMessageRenamed, - ) -> Result<(), InternalError> { - self.dataset_entry_repo - .update_dataset_entry_name(dataset_id, new_dataset_name) - .await - .int_err() - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -impl MessageConsumer for DatasetEntryService {} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[async_trait::async_trait] -impl MessageConsumerT for DatasetEntryService { - #[tracing::instrument( - level = "debug", - skip_all, - name = "DatasetEntryService[DatasetLifecycleMessage]" - )] - async fn consume_message( - &self, - _: &Catalog, - message: &DatasetLifecycleMessage, - ) -> Result<(), InternalError> { - tracing::debug!(received_message = ?message, "Received dataset lifecycle message"); - - match message { - DatasetLifecycleMessage::Created(message) => { - self.handle_dataset_lifecycle_created_message(message).await - } - - DatasetLifecycleMessage::Deleted(message) => { - self.handle_dataset_lifecycle_deleted_message(message).await - } - - DatasetLifecycleMessage::Renamed(message) => { - self.handle_dataset_lifecycle_renamed_message(message).await - } - - DatasetLifecycleMessage::DependenciesUpdated(_) => { - // No action required - Ok(()) - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/datasets/services/src/dataset_entry_service_impl.rs b/src/domain/datasets/services/src/dataset_entry_service_impl.rs new file mode 100644 index 0000000000..dc4c21b527 --- /dev/null +++ b/src/domain/datasets/services/src/dataset_entry_service_impl.rs @@ -0,0 +1,556 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::collections::{HashMap, HashSet}; +use std::sync::{Arc, Mutex}; + +use database_common::PaginationOpts; +use dill::{component, interface, meta, Catalog}; +use internal_error::{InternalError, ResultIntoInternal}; +use kamu_accounts::{AccountRepository, CurrentAccountSubject}; +use kamu_core::{ + DatasetHandleStream, + DatasetHandlesResolution, + DatasetLifecycleMessage, + DatasetLifecycleMessageCreated, + DatasetLifecycleMessageDeleted, + DatasetLifecycleMessageRenamed, + DatasetNotFoundError, + DatasetRegistry, + DatasetRepository, + GetDatasetError, + GetMultipleDatasetsError, + ResolvedDataset, + TenancyConfig, + MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, +}; +use kamu_datasets::*; +use messaging_outbox::{ + MessageConsumer, + MessageConsumerMeta, + MessageConsumerT, + MessageDeliveryMechanism, +}; +use opendatafabric::{ + AccountID, + AccountName, + DatasetAlias, + DatasetHandle, + DatasetID, + DatasetName, + DatasetRef, +}; +use time_source::SystemTimeSource; + +use crate::MESSAGE_CONSUMER_KAMU_DATASET_ENTRY_SERVICE; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct DatasetEntryServiceImpl { + time_source: Arc, + dataset_entry_repo: Arc, + dataset_repo: Arc, + account_repo: Arc, + current_account_subject: Arc, + tenancy_config: Arc, + accounts_cache: Arc>, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default)] +struct AccountsCache { + id2names: HashMap, + names2ids: HashMap, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn DatasetEntryService)] +#[interface(dyn DatasetRegistry)] +#[interface(dyn MessageConsumer)] +#[interface(dyn MessageConsumerT)] +#[meta(MessageConsumerMeta { + consumer_name: MESSAGE_CONSUMER_KAMU_DATASET_ENTRY_SERVICE, + feeding_producers: &[ + MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, + ], + delivery: MessageDeliveryMechanism::Immediate, +})] +impl DatasetEntryServiceImpl { + pub fn new( + time_source: Arc, + dataset_entry_repo: Arc, + dataset_repo: Arc, + account_repo: Arc, + current_account_subject: Arc, + tenancy_config: Arc, + ) -> Self { + Self { + time_source, + dataset_entry_repo, + dataset_repo, + account_repo, + current_account_subject, + tenancy_config, + accounts_cache: Default::default(), + } + } + + async fn handle_dataset_lifecycle_created_message( + &self, + DatasetLifecycleMessageCreated { + dataset_id, + owner_account_id, + dataset_name, + .. + }: &DatasetLifecycleMessageCreated, + ) -> Result<(), InternalError> { + match self.dataset_entry_repo.get_dataset_entry(dataset_id).await { + Ok(_) => return Ok(()), // idempotent handling of duplicates + Err(GetDatasetEntryError::NotFound(_)) => { /* happy case, create record */ } + Err(GetDatasetEntryError::Internal(e)) => return Err(e), + } + + let entry = DatasetEntry::new( + dataset_id.clone(), + owner_account_id.clone(), + dataset_name.clone(), + self.time_source.now(), + ); + + self.dataset_entry_repo + .save_dataset_entry(&entry) + .await + .int_err() + } + + async fn handle_dataset_lifecycle_deleted_message( + &self, + DatasetLifecycleMessageDeleted { dataset_id, .. 
}: &DatasetLifecycleMessageDeleted, + ) -> Result<(), InternalError> { + match self + .dataset_entry_repo + .delete_dataset_entry(dataset_id) + .await + { + Ok(_) | Err(DeleteEntryDatasetError::NotFound(_)) => Ok(()), + Err(DeleteEntryDatasetError::Internal(e)) => Err(e), + } + } + + async fn handle_dataset_lifecycle_renamed_message( + &self, + DatasetLifecycleMessageRenamed { + dataset_id, + new_dataset_name, + .. + }: &DatasetLifecycleMessageRenamed, + ) -> Result<(), InternalError> { + self.dataset_entry_repo + .update_dataset_entry_name(dataset_id, new_dataset_name) + .await + .int_err() + } + + async fn entries_as_handles( + &self, + entries: Vec, + ) -> Result, ListDatasetEntriesError> { + // Select which accounts haven't been processed yet + let first_seen_account_ids = { + let accounts_cache = self.accounts_cache.lock().unwrap(); + + let mut first_seen_account_ids: HashSet = HashSet::new(); + for entry in &entries { + if !accounts_cache.id2names.contains_key(&entry.owner_id) { + first_seen_account_ids.insert(entry.owner_id.clone()); + } + } + + first_seen_account_ids + }; + + // Query first seen accounts and fill the table + if !first_seen_account_ids.is_empty() { + let account_ids = first_seen_account_ids.into_iter().collect::>(); + let accounts = self + .account_repo + .get_accounts_by_ids(account_ids) + .await + .int_err()?; + + let mut accounts_cache = self.accounts_cache.lock().unwrap(); + for account in accounts { + accounts_cache + .id2names + .insert(account.id.clone(), account.account_name.clone()); + accounts_cache + .names2ids + .insert(account.account_name, account.id); + } + } + + // Convert the entries to handles + let mut handles = Vec::new(); + let accounts_cache = self.accounts_cache.lock().unwrap(); + for entry in &entries { + // By now we should now the account name + let maybe_owner_name = accounts_cache.id2names.get(&entry.owner_id); + if let Some(owner_name) = maybe_owner_name { + // Form DatasetHandle + handles.push(DatasetHandle::new( + entry.id.clone(), + self.make_alias(owner_name.clone(), entry.name.clone()), + )); + } + } + + // Return converted list + Ok(handles) + } + + async fn resolve_account_name_by_id( + &self, + account_id: &AccountID, + ) -> Result { + let maybe_cached_name = { + let accounts_cache = self.accounts_cache.lock().unwrap(); + accounts_cache.id2names.get(account_id).cloned() + }; + + if let Some(name) = maybe_cached_name { + Ok(name) + } else { + let account = self + .account_repo + .get_account_by_id(account_id) + .await + .int_err()?; + + let mut accounts_cache = self.accounts_cache.lock().unwrap(); + accounts_cache + .id2names + .insert(account_id.clone(), account.account_name.clone()); + accounts_cache + .names2ids + .insert(account.account_name.clone(), account_id.clone()); + + Ok(account.account_name) + } + } + + async fn resolve_account_id_by_maybe_name( + &self, + maybe_account_name: Option<&AccountName>, + ) -> Result { + let account_name = maybe_account_name + .unwrap_or_else(|| self.current_account_subject.account_name_or_default()); + + let maybe_cached_id = { + let accounts_cache = self.accounts_cache.lock().unwrap(); + accounts_cache.names2ids.get(account_name).cloned() + }; + + if let Some(id) = maybe_cached_id { + Ok(id) + } else { + let account = self + .account_repo + .get_account_by_name(account_name) + .await + .int_err()?; + + let mut accounts_cache = self.accounts_cache.lock().unwrap(); + accounts_cache + .id2names + .insert(account.id.clone(), account_name.clone()); + accounts_cache + .names2ids + 
.insert(account_name.clone(), account.id.clone()); + + Ok(account.id) + } + } + + fn stream_datasets<'a, Args, HInitArgs, HInitArgsFut, HListing, HListingFut>( + &'a self, + get_args_callback: HInitArgs, + next_entries_callback: HListing, + ) -> DatasetHandleStream<'a> + where + Args: Clone + Send + 'a, + HInitArgs: FnOnce() -> HInitArgsFut + Send + 'a, + HInitArgsFut: std::future::Future> + Send + 'a, + HListing: Fn(Args, PaginationOpts) -> HListingFut + Send + 'a, + HListingFut: std::future::Future> + + Send + + 'a, + { + Box::pin(async_stream::try_stream! { + // Init arguments + let args = get_args_callback().await?; + + // Tracking pagination progress + let mut offset = 0; + let limit = 100; + + loop { + // Load a page of dataset entries + let entries_page = next_entries_callback(args.clone(), PaginationOpts { limit, offset }) + .await + .int_err()?; + + // Actually read entires + let loaded_entries_count = entries_page.list.len(); + + // Convert entries to handles + let handles = self.entries_as_handles(entries_page.list).await.int_err()?; + + // Stream the entries + for hdl in handles { + yield hdl; + } + + // Next page + offset += loaded_entries_count; + if offset >= entries_page.total_count { + break; + } + } + }) + } + + fn make_alias(&self, owner_name: AccountName, dataset_name: DatasetName) -> DatasetAlias { + match *self.tenancy_config { + TenancyConfig::MultiTenant => DatasetAlias::new(Some(owner_name), dataset_name), + TenancyConfig::SingleTenant => DatasetAlias::new(None, dataset_name), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl DatasetEntryService for DatasetEntryServiceImpl { + async fn list_all_entries( + &self, + pagination: PaginationOpts, + ) -> Result { + use futures::TryStreamExt; + + let total_count = self.dataset_entry_repo.dataset_entries_count().await?; + let entries = self + .dataset_entry_repo + .get_dataset_entries(pagination) + .try_collect() + .await?; + + Ok(DatasetEntryListing { + list: entries, + total_count, + }) + } + + async fn list_entries_owned_by( + &self, + owner_id: AccountID, + pagination: PaginationOpts, + ) -> Result { + use futures::TryStreamExt; + + let total_count = self + .dataset_entry_repo + .dataset_entries_count_by_owner_id(&owner_id) + .await?; + let entries = self + .dataset_entry_repo + .get_dataset_entries_by_owner_id(&owner_id, pagination) + .try_collect() + .await?; + + Ok(DatasetEntryListing { + list: entries, + total_count, + }) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl DatasetRegistry for DatasetEntryServiceImpl { + #[tracing::instrument(level = "debug", skip_all)] + fn all_dataset_handles<'a>(&'a self) -> DatasetHandleStream<'a> { + #[derive(Clone)] + struct NoArgs {} + + self.stream_datasets( + || async { Ok(NoArgs {}) }, + |_, pagination| self.list_all_entries(pagination), + ) + } + + #[tracing::instrument(level = "debug", skip_all, fields(%owner_name))] + fn all_dataset_handles_by_owner(&self, owner_name: &AccountName) -> DatasetHandleStream<'_> { + #[derive(Clone)] + struct OwnerArgs { + owner_id: AccountID, + } + + let owner_name = owner_name.clone(); + + self.stream_datasets( + move || async move { + let owner_id = self + .resolve_account_id_by_maybe_name(Some(&owner_name)) + .await?; + Ok(OwnerArgs { owner_id }) + }, + |args, pagination| 
self.list_entries_owned_by(args.owner_id, pagination), + ) + } + + #[tracing::instrument(level = "debug", skip_all, fields(%dataset_ref))] + async fn resolve_dataset_handle_by_ref( + &self, + dataset_ref: &DatasetRef, + ) -> Result { + match dataset_ref { + DatasetRef::Handle(h) => Ok(h.clone()), + DatasetRef::Alias(alias) => { + let owner_id = self + .resolve_account_id_by_maybe_name(alias.account_name.as_ref()) + .await?; + match self + .dataset_entry_repo + .get_dataset_entry_by_owner_and_name(&owner_id, &alias.dataset_name) + .await + { + Ok(entry) => Ok(DatasetHandle::new(entry.id.clone(), alias.clone())), + Err(GetDatasetEntryByNameError::NotFound(_)) => { + Err(GetDatasetError::NotFound(DatasetNotFoundError { + dataset_ref: dataset_ref.clone(), + })) + } + Err(GetDatasetEntryByNameError::Internal(e)) => { + Err(GetDatasetError::Internal(e)) + } + } + } + DatasetRef::ID(id) => match self.dataset_entry_repo.get_dataset_entry(id).await { + Ok(entry) => { + let owner_name = self.resolve_account_name_by_id(&entry.owner_id).await?; + Ok(DatasetHandle::new( + entry.id.clone(), + self.make_alias(owner_name, entry.name.clone()), + )) + } + Err(GetDatasetEntryError::NotFound(_)) => { + Err(GetDatasetError::NotFound(DatasetNotFoundError { + dataset_ref: dataset_ref.clone(), + })) + } + Err(GetDatasetEntryError::Internal(e)) => Err(GetDatasetError::Internal(e)), + }, + } + } + + #[tracing::instrument(level = "debug", skip_all, fields(?dataset_ids))] + async fn resolve_multiple_dataset_handles_by_ids( + &self, + dataset_ids: Vec, + ) -> Result { + let entries_resolution = self + .dataset_entry_repo + .get_multiple_dataset_entries(&dataset_ids) + .await + .map_err(|e| match e { + GetMultipleDatasetEntriesError::Internal(e) => { + GetMultipleDatasetsError::Internal(e) + } + })?; + + let resolved_handles = self + .entries_as_handles(entries_resolution.resolved_entries) + .await + .map_err(|e| match e { + ListDatasetEntriesError::Internal(e) => GetMultipleDatasetsError::Internal(e), + })?; + + let unresolved_datasets = entries_resolution + .unresolved_entries + .into_iter() + .map(|id| { + ( + id.clone(), + GetDatasetError::NotFound(DatasetNotFoundError { + dataset_ref: id.into_local_ref(), + }), + ) + }) + .collect(); + + Ok(DatasetHandlesResolution { + resolved_handles, + unresolved_datasets, + }) + } + + // Note: in future we will be resolving storage repository, + // but for now we have just a single one + fn get_dataset_by_handle(&self, dataset_handle: &DatasetHandle) -> ResolvedDataset { + let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); + ResolvedDataset::new(dataset, dataset_handle.clone()) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +impl MessageConsumer for DatasetEntryServiceImpl {} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl MessageConsumerT for DatasetEntryServiceImpl { + #[tracing::instrument( + level = "debug", + skip_all, + name = "DatasetEntryService[DatasetLifecycleMessage]" + )] + async fn consume_message( + &self, + _: &Catalog, + message: &DatasetLifecycleMessage, + ) -> Result<(), InternalError> { + tracing::debug!(received_message = ?message, "Received dataset lifecycle message"); + + match message { + DatasetLifecycleMessage::Created(message) => { + self.handle_dataset_lifecycle_created_message(message).await + } + + 
DatasetLifecycleMessage::Deleted(message) => { + self.handle_dataset_lifecycle_deleted_message(message).await + } + + DatasetLifecycleMessage::Renamed(message) => { + self.handle_dataset_lifecycle_renamed_message(message).await + } + + DatasetLifecycleMessage::DependenciesUpdated(_) => { + // No action required + Ok(()) + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/datasets/services/src/lib.rs b/src/domain/datasets/services/src/lib.rs index be0a922914..2664e16064 100644 --- a/src/domain/datasets/services/src/lib.rs +++ b/src/domain/datasets/services/src/lib.rs @@ -13,7 +13,7 @@ pub use kamu_datasets as domain; mod dataset_entry_indexer; -mod dataset_entry_service; +mod dataset_entry_service_impl; mod dataset_env_var_service_impl; mod dataset_env_var_service_null; mod dataset_key_value_service_impl; @@ -22,7 +22,7 @@ mod jobs; mod messages; pub use dataset_entry_indexer::*; -pub use dataset_entry_service::*; +pub use dataset_entry_service_impl::*; pub use dataset_env_var_service_impl::*; pub use dataset_env_var_service_null::*; pub use dataset_key_value_service_impl::*; diff --git a/src/domain/datasets/services/tests/tests/test_dataset_entry_service.rs b/src/domain/datasets/services/tests/tests/test_dataset_entry_service.rs index 3e3b665bea..8e751cb3f3 100644 --- a/src/domain/datasets/services/tests/tests/test_dataset_entry_service.rs +++ b/src/domain/datasets/services/tests/tests/test_dataset_entry_service.rs @@ -20,10 +20,17 @@ use kamu_core::{ DatasetLifecycleMessage, DatasetRepository, DatasetVisibility, + TenancyConfig, MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, }; -use kamu_datasets::{DatasetEntry, DatasetEntryRepository, MockDatasetEntryRepository}; -use kamu_datasets_services::{DatasetEntryIndexer, DatasetEntryService}; +use kamu_datasets::{ + DatasetEntry, + DatasetEntryNotFoundError, + DatasetEntryRepository, + GetDatasetEntryError, + MockDatasetEntryRepository, +}; +use kamu_datasets_services::{DatasetEntryIndexer, DatasetEntryServiceImpl}; use messaging_outbox::{register_message_dispatcher, Outbox, OutboxExt, OutboxImmediateImpl}; use mockall::predicate::eq; use opendatafabric::{AccountID, AccountName, DatasetAlias, DatasetHandle, DatasetID, DatasetName}; @@ -39,6 +46,10 @@ async fn test_correctly_handles_outbox_messages() { let new_dataset_name = DatasetName::new_unchecked("new-name"); let mut mock_dataset_entry_repository = MockDatasetEntryRepository::new(); + DatasetEntryServiceHarness::add_get_dataset_entry_expectation( + &mut mock_dataset_entry_repository, + dataset_id.clone(), + ); DatasetEntryServiceHarness::add_save_dataset_entry_expectation( &mut mock_dataset_entry_repository, dataset_id.clone(), @@ -198,7 +209,7 @@ impl DatasetEntryServiceHarness { let catalog = { let mut b = CatalogBuilder::new(); - b.add::(); + b.add::(); b.add::(); b.add_value(mock_dataset_entry_repository); @@ -227,6 +238,8 @@ impl DatasetEntryServiceHarness { b.add_value(CurrentAccountSubject::new_test()); + b.add_value(TenancyConfig::SingleTenant); + register_message_dispatcher::( &mut b, MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, @@ -338,6 +351,21 @@ impl DatasetEntryServiceHarness { .returning(|_| Ok(())); } + fn add_get_dataset_entry_expectation( + mock_dataset_entry_repository: &mut MockDatasetEntryRepository, + dataset_id: DatasetID, + ) { + mock_dataset_entry_repository + .expect_get_dataset_entry() + .with(eq(dataset_id.clone())) + .times(1) + .returning(move |_| { + 
Err(GetDatasetEntryError::NotFound(DatasetEntryNotFoundError { + dataset_id: dataset_id.clone(), + })) + }); + } + fn add_save_dataset_entry_expectation( mock_dataset_entry_repository: &mut MockDatasetEntryRepository, dataset_id: DatasetID, @@ -374,11 +402,10 @@ impl DatasetEntryServiceHarness { dataset_handles: Vec, ) { mock_dataset_repository - .expect_get_all_datasets() + .expect_all_dataset_handles() .times(1) .returning(move || { let stream = futures::stream::iter(dataset_handles.clone().into_iter().map(Ok)); - Box::pin(stream) }); } diff --git a/src/domain/flow-system/domain/src/entities/flow/flow_outcome.rs b/src/domain/flow-system/domain/src/entities/flow/flow_outcome.rs index d29d9b6c5d..0601a692f8 100644 --- a/src/domain/flow-system/domain/src/entities/flow/flow_outcome.rs +++ b/src/domain/flow-system/domain/src/entities/flow/flow_outcome.rs @@ -121,9 +121,7 @@ impl From for FlowResult { ts::TaskResult::UpdateDatasetResult(task_update_result) => { match task_update_result.pull_result { PullResult::UpToDate(up_to_date_result) => match up_to_date_result { - PullResultUpToDate::Sync - | PullResultUpToDate::Transform - | PullResultUpToDate::SetWatermark => Self::Empty, + PullResultUpToDate::Sync | PullResultUpToDate::Transform => Self::Empty, PullResultUpToDate::PollingIngest(result) => Self::DatasetUpdate( FlowResultDatasetUpdate::UpToDate(FlowResultDatasetUpdateUpToDate { uncacheable: result.uncacheable, diff --git a/src/domain/flow-system/services/src/flow/flow_executor_impl.rs b/src/domain/flow-system/services/src/flow/flow_executor_impl.rs index f71b651cdf..347da3b741 100644 --- a/src/domain/flow-system/services/src/flow/flow_executor_impl.rs +++ b/src/domain/flow-system/services/src/flow/flow_executor_impl.rs @@ -25,7 +25,7 @@ use messaging_outbox::{ MessageConsumer, MessageConsumerMeta, MessageConsumerT, - MessageConsumptionDurability, + MessageDeliveryMechanism, Outbox, OutboxExt, }; @@ -65,7 +65,7 @@ pub struct FlowExecutorImpl { MESSAGE_PRODUCER_KAMU_TASK_EXECUTOR, MESSAGE_PRODUCER_KAMU_FLOW_CONFIGURATION_SERVICE ], - durability: MessageConsumptionDurability::Durable, + delivery: MessageDeliveryMechanism::Transactional, })] #[interface(dyn InitOnStartup)] #[meta(InitOnStartupMeta { @@ -376,7 +376,7 @@ impl FlowExecutorImpl { { fetch_uncacheable = ingest_rule.fetch_uncacheable; } - Ok(LogicalPlan::UpdateDataset(UpdateDataset { + Ok(LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: flow_key.dataset_id.clone(), fetch_uncacheable, })) @@ -396,18 +396,20 @@ impl FlowExecutorImpl { matches!(compaction_rule, CompactionRule::MetadataOnly(_)); }; - Ok(LogicalPlan::HardCompactionDataset(HardCompactionDataset { - dataset_id: flow_key.dataset_id.clone(), - max_slice_size, - max_slice_records, - keep_metadata_only, - })) + Ok(LogicalPlan::HardCompactDataset( + LogicalPlanHardCompactDataset { + dataset_id: flow_key.dataset_id.clone(), + max_slice_size, + max_slice_records, + keep_metadata_only, + }, + )) } DatasetFlowType::Reset => { if let Some(config_rule) = maybe_config_snapshot && let FlowConfigurationSnapshot::Reset(reset_rule) = config_rule { - return Ok(LogicalPlan::Reset(ResetDataset { + return Ok(LogicalPlan::ResetDataset(LogicalPlanResetDataset { dataset_id: flow_key.dataset_id.clone(), new_head_hash: reset_rule.new_head_hash.clone(), old_head_hash: reset_rule.old_head_hash.clone(), @@ -420,7 +422,7 @@ impl FlowExecutorImpl { FlowKey::System(flow_key) => { match flow_key.flow_type { // TODO: replace on correct logical plan - SystemFlowType::GC => 
Ok(LogicalPlan::Probe(Probe { + SystemFlowType::GC => Ok(LogicalPlan::Probe(LogicalPlanProbe { dataset_id: None, busy_time: Some(std::time::Duration::from_secs(20)), end_with_outcome: Some(TaskOutcome::Success(TaskResult::Empty)), diff --git a/src/domain/flow-system/services/src/flow_configuration/flow_configuration_service_impl.rs b/src/domain/flow-system/services/src/flow_configuration/flow_configuration_service_impl.rs index ee09d82873..134f1fd03e 100644 --- a/src/domain/flow-system/services/src/flow_configuration/flow_configuration_service_impl.rs +++ b/src/domain/flow-system/services/src/flow_configuration/flow_configuration_service_impl.rs @@ -18,7 +18,7 @@ use messaging_outbox::{ MessageConsumer, MessageConsumerMeta, MessageConsumerT, - MessageConsumptionDurability, + MessageDeliveryMechanism, Outbox, OutboxExt, }; @@ -47,7 +47,7 @@ pub struct FlowConfigurationServiceImpl { #[meta(MessageConsumerMeta { consumer_name: MESSAGE_CONSUMER_KAMU_FLOW_CONFIGURATION_SERVICE, feeding_producers: &[MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE], - durability: MessageConsumptionDurability::Durable, + delivery: MessageDeliveryMechanism::Transactional, })] impl FlowConfigurationServiceImpl { pub fn new( diff --git a/src/domain/flow-system/services/tests/tests/test_flow_configuration_service_impl.rs b/src/domain/flow-system/services/tests/tests/test_flow_configuration_service_impl.rs index 4b2a332f55..068aac9f46 100644 --- a/src/domain/flow-system/services/tests/tests/test_flow_configuration_service_impl.rs +++ b/src/domain/flow-system/services/tests/tests/test_flow_configuration_service_impl.rs @@ -450,13 +450,11 @@ impl FlowConfigurationHarness { .add::() .add::() .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add_value(CurrentAccountSubject::new_test()) .add::() .add::() diff --git a/src/domain/flow-system/services/tests/tests/test_flow_executor_impl.rs b/src/domain/flow-system/services/tests/tests/test_flow_executor_impl.rs index ad2eb0a1fc..d4b5ec746d 100644 --- a/src/domain/flow-system/services/tests/tests/test_flow_executor_impl.rs +++ b/src/domain/flow-system/services/tests/tests/test_flow_executor_impl.rs @@ -69,7 +69,7 @@ async fn test_read_initial_config_and_queue_without_waiting() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -83,7 +83,7 @@ async fn test_read_initial_config_and_queue_without_waiting() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(90), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -244,7 +244,7 @@ async fn test_read_initial_config_shouldnt_queue_in_recovery_case() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(110), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - 
expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -333,7 +333,7 @@ async fn test_cron_config() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::seconds(6), finish_in_with: Some((Duration::seconds(1), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -463,7 +463,7 @@ async fn test_manual_trigger() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -477,7 +477,7 @@ async fn test_manual_trigger() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(60), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -491,7 +491,7 @@ async fn test_manual_trigger() { dataset_id: Some(bar_id.clone()), run_since_start: Duration::milliseconds(100), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -674,7 +674,7 @@ async fn test_ingest_trigger_with_ingest_config() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: true }), @@ -688,7 +688,7 @@ async fn test_ingest_trigger_with_ingest_config() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(60), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: true }), @@ -702,7 +702,7 @@ async fn test_ingest_trigger_with_ingest_config() { dataset_id: Some(bar_id.clone()), run_since_start: Duration::milliseconds(100), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -875,7 +875,7 @@ async fn test_manual_trigger_compaction() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(20), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: 
LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_id.clone(), max_slice_size: None, max_slice_records: None, @@ -890,7 +890,7 @@ async fn test_manual_trigger_compaction() { dataset_id: Some(bar_id.clone()), run_since_start: Duration::milliseconds(60), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: bar_id.clone(), max_slice_size: None, max_slice_records: None, @@ -1036,7 +1036,7 @@ async fn test_manual_trigger_reset() { dataset_id: Some(create_dataset_result.dataset_handle.id.clone()), run_since_start: Duration::milliseconds(20), finish_in_with: Some((Duration::milliseconds(90), TaskOutcome::Success(TaskResult::ResetDatasetResult(TaskResetDatasetResult { new_head: Multihash::from_digest_sha3_256(b"new-head") })))), - expected_logical_plan: LogicalPlan::Reset(ResetDataset { + expected_logical_plan: LogicalPlan::ResetDataset(LogicalPlanResetDataset { dataset_id: create_dataset_result.dataset_handle.id.clone(), // By deafult should reset to seed block new_head_hash: Some(dataset_blocks[1].0.clone()), @@ -1178,7 +1178,7 @@ async fn test_reset_trigger_keep_metadata_compaction_for_derivatives() { dataset_id: Some(create_foo_result.dataset_handle.id.clone()), run_since_start: Duration::milliseconds(20), finish_in_with: Some((Duration::milliseconds(70), TaskOutcome::Success(TaskResult::ResetDatasetResult(TaskResetDatasetResult { new_head: Multihash::from_digest_sha3_256(b"new-head") })))), - expected_logical_plan: LogicalPlan::Reset(ResetDataset { + expected_logical_plan: LogicalPlan::ResetDataset(LogicalPlanResetDataset { dataset_id: create_foo_result.dataset_handle.id.clone(), new_head_hash: Some(dataset_blocks[1].0.clone()), old_head_hash: Some(dataset_blocks[0].0.clone()), @@ -1206,7 +1206,7 @@ async fn test_reset_trigger_keep_metadata_compaction_for_derivatives() { } )) )), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_baz_id.clone(), max_slice_size: None, max_slice_records: None, @@ -1234,7 +1234,7 @@ async fn test_reset_trigger_keep_metadata_compaction_for_derivatives() { } )) )), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_bar_id.clone(), max_slice_size: None, max_slice_records: None, @@ -1369,7 +1369,7 @@ async fn test_manual_trigger_compaction_with_config() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(30), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_id.clone(), max_slice_size: Some(max_slice_size), max_slice_records: Some(max_slice_records), @@ -1509,7 +1509,7 @@ async fn test_full_hard_compaction_trigger_keep_metadata_compaction_for_derivati })) ) ), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: 
LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_id.clone(), max_slice_size: Some(max_slice_size), max_slice_records: Some(max_slice_records), @@ -1537,7 +1537,7 @@ async fn test_full_hard_compaction_trigger_keep_metadata_compaction_for_derivati } )) )), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_baz_id.clone(), max_slice_size: None, max_slice_records: None, @@ -1565,7 +1565,7 @@ async fn test_full_hard_compaction_trigger_keep_metadata_compaction_for_derivati } )) )), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_bar_id.clone(), max_slice_size: None, max_slice_records: None, @@ -1737,7 +1737,7 @@ async fn test_manual_trigger_keep_metadata_only_with_recursive_compaction() { })) ) ), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_id.clone(), max_slice_size: None, max_slice_records: None, @@ -1765,7 +1765,7 @@ async fn test_manual_trigger_keep_metadata_only_with_recursive_compaction() { } )) )), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_bar_id.clone(), max_slice_size: None, max_slice_records: None, @@ -1793,7 +1793,7 @@ async fn test_manual_trigger_keep_metadata_only_with_recursive_compaction() { } )) )), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_bar_baz_id.clone(), max_slice_size: None, max_slice_records: None, @@ -1967,7 +1967,7 @@ async fn test_manual_trigger_keep_metadata_only_without_recursive_compaction() { })) ) ), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_id.clone(), max_slice_size: None, max_slice_records: None, @@ -2017,7 +2017,7 @@ async fn test_manual_trigger_keep_metadata_only_compaction_multiple_accounts() { let petya_account_name = AccountName::new_unchecked("petya"); let harness = FlowHarness::with_overrides(FlowHarnessOverrides { - is_multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, custom_account_names: vec![wasya_account_name.clone(), petya_account_name.clone()], ..Default::default() }) @@ -2089,7 +2089,7 @@ async fn test_manual_trigger_keep_metadata_only_compaction_multiple_accounts() { old_num_blocks: 5, new_num_blocks: 4, }})))), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_id.clone(), max_slice_size: None, max_slice_records: None, @@ -2120,7 +2120,7 @@ async fn test_manual_trigger_keep_metadata_only_compaction_multiple_accounts() { new_num_blocks: 4, }})))), // Make sure we will take config from root dataset - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_bar_id.clone(), max_slice_size: 
None, max_slice_records: None, @@ -2252,7 +2252,7 @@ async fn test_dataset_flow_configuration_paused_resumed_modified() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -2266,7 +2266,7 @@ async fn test_dataset_flow_configuration_paused_resumed_modified() { dataset_id: Some(bar_id.clone()), run_since_start: Duration::milliseconds(20), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -2484,7 +2484,7 @@ async fn test_respect_last_success_time_when_schedule_resumes() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -2498,7 +2498,7 @@ async fn test_respect_last_success_time_when_schedule_resumes() { dataset_id: Some(bar_id.clone()), run_since_start: Duration::milliseconds(20), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -2705,7 +2705,7 @@ async fn test_dataset_deleted() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -2719,7 +2719,7 @@ async fn test_dataset_deleted() { dataset_id: Some(bar_id.clone()), run_since_start: Duration::milliseconds(20), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -2894,7 +2894,7 @@ async fn test_task_completions_trigger_next_loop_on_success() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -2908,7 +2908,7 @@ async fn test_task_completions_trigger_next_loop_on_success() { dataset_id: Some(bar_id.clone()), run_since_start: Duration::milliseconds(20), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Failed(TaskError::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + 
expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -2922,7 +2922,7 @@ async fn test_task_completions_trigger_next_loop_on_success() { dataset_id: Some(baz_id.clone()), run_since_start: Duration::milliseconds(30), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Cancelled)), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: baz_id.clone(), fetch_uncacheable: false }), @@ -3119,7 +3119,7 @@ async fn test_derived_dataset_triggered_initially_and_after_input_change() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -3139,7 +3139,7 @@ async fn test_derived_dataset_triggered_initially_and_after_input_change() { new_head: Multihash::from_digest_sha3_256(b"new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -3159,7 +3159,7 @@ async fn test_derived_dataset_triggered_initially_and_after_input_change() { new_head: Multihash::from_digest_sha3_256(b"newest-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -3173,7 +3173,7 @@ async fn test_derived_dataset_triggered_initially_and_after_input_change() { dataset_id: Some(bar_id.clone()), run_since_start: Duration::milliseconds(130), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -3371,7 +3371,7 @@ async fn test_throttling_manual_triggers() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(40), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -3532,7 +3532,7 @@ async fn test_throttling_derived_dataset_with_2_parents() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -3551,7 +3551,7 @@ async fn test_throttling_derived_dataset_with_2_parents() { new_head: Multihash::from_digest_sha3_256(b"fbar-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -3565,7 +3565,7 @@ async fn test_throttling_derived_dataset_with_2_parents() { dataset_id: Some(baz_id.clone()), 
run_since_start: Duration::milliseconds(30), finish_in_with: Some((Duration::milliseconds(20), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: baz_id.clone(), fetch_uncacheable: false }), @@ -3584,7 +3584,7 @@ async fn test_throttling_derived_dataset_with_2_parents() { new_head: Multihash::from_digest_sha3_256(b"foo-newest-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -3598,7 +3598,7 @@ async fn test_throttling_derived_dataset_with_2_parents() { dataset_id: Some(baz_id.clone()), run_since_start: Duration::milliseconds(160), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: baz_id.clone(), fetch_uncacheable: false }), @@ -3617,7 +3617,7 @@ async fn test_throttling_derived_dataset_with_2_parents() { new_head: Multihash::from_digest_sha3_256(b"bar-newest-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -3999,7 +3999,7 @@ async fn test_batching_condition_records_reached() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -4018,7 +4018,7 @@ async fn test_batching_condition_records_reached() { new_head: Multihash::from_digest_sha3_256(b"bar-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -4037,7 +4037,7 @@ async fn test_batching_condition_records_reached() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice-2"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -4056,7 +4056,7 @@ async fn test_batching_condition_records_reached() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice-3"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -4075,7 +4075,7 @@ async fn test_batching_condition_records_reached() { new_head: Multihash::from_digest_sha3_256(b"bar-new-slice-2"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -4322,7 +4322,7 @@ async fn test_batching_condition_timeout() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: 
LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -4341,7 +4341,7 @@ async fn test_batching_condition_timeout() { new_head: Multihash::from_digest_sha3_256(b"bar-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -4360,7 +4360,7 @@ async fn test_batching_condition_timeout() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice-2"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -4381,7 +4381,7 @@ async fn test_batching_condition_timeout() { new_head: Multihash::from_digest_sha3_256(b"bar-new-slice-2"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -4596,7 +4596,7 @@ async fn test_batching_condition_watermark() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -4615,7 +4615,7 @@ async fn test_batching_condition_watermark() { new_head: Multihash::from_digest_sha3_256(b"bar-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -4634,7 +4634,7 @@ async fn test_batching_condition_watermark() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice-2"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -4655,7 +4655,7 @@ async fn test_batching_condition_watermark() { new_head: Multihash::from_digest_sha3_256(b"bar-new-slice-2"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -4940,7 +4940,7 @@ async fn test_batching_condition_with_2_inputs() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -4959,7 +4959,7 @@ async fn test_batching_condition_with_2_inputs() { new_head: Multihash::from_digest_sha3_256(b"bar-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -4978,7 +4978,7 @@ async fn test_batching_condition_with_2_inputs() { new_head: Multihash::from_digest_sha3_256(b"baz-new-slice"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: baz_id.clone(), fetch_uncacheable: 
false }), @@ -4997,7 +4997,7 @@ async fn test_batching_condition_with_2_inputs() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice-2"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -5016,7 +5016,7 @@ async fn test_batching_condition_with_2_inputs() { new_head: Multihash::from_digest_sha3_256(b"bar-new-slice-2"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: bar_id.clone(), fetch_uncacheable: false }), @@ -5035,7 +5035,7 @@ async fn test_batching_condition_with_2_inputs() { new_head: Multihash::from_digest_sha3_256(b"foo-new-slice-2"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -5054,7 +5054,7 @@ async fn test_batching_condition_with_2_inputs() { new_head: Multihash::from_digest_sha3_256(b"baz-new-slice-2"), }, })))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: baz_id.clone(), fetch_uncacheable: false }), @@ -5340,7 +5340,7 @@ async fn test_list_all_flow_initiators() { let harness = FlowHarness::with_overrides(FlowHarnessOverrides { custom_account_names: vec![foo_account_name.clone(), bar_account_name.clone()], - is_multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, ..Default::default() }) .await; @@ -5399,7 +5399,7 @@ async fn test_list_all_flow_initiators() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(20), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_id.clone(), max_slice_size: None, max_slice_records: None, @@ -5414,7 +5414,7 @@ async fn test_list_all_flow_initiators() { dataset_id: Some(bar_id.clone()), run_since_start: Duration::milliseconds(60), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: bar_id.clone(), max_slice_size: None, max_slice_records: None, @@ -5490,7 +5490,7 @@ async fn test_list_all_datasets_with_flow() { let harness = FlowHarness::with_overrides(FlowHarnessOverrides { custom_account_names: vec![foo_account_name.clone(), bar_account_name.clone()], - is_multi_tenant: true, + tenancy_config: TenancyConfig::MultiTenant, ..Default::default() }) .await; @@ -5559,7 +5559,7 @@ async fn test_list_all_datasets_with_flow() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(20), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: foo_id.clone(), max_slice_size: None, max_slice_records: None, @@ -5574,7 +5574,7 @@ async fn 
test_list_all_datasets_with_flow() { dataset_id: Some(bar_id.clone()), run_since_start: Duration::milliseconds(60), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::HardCompactionDataset(HardCompactionDataset { + expected_logical_plan: LogicalPlan::HardCompactDataset(LogicalPlanHardCompactDataset { dataset_id: bar_id.clone(), max_slice_size: None, max_slice_records: None, @@ -5707,7 +5707,7 @@ async fn test_abort_flow_before_scheduling_tasks() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -5803,7 +5803,7 @@ async fn test_abort_flow_after_scheduling_still_waiting_for_executor() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(10), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -5904,7 +5904,7 @@ async fn test_abort_flow_after_task_running_has_started() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(100), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -5994,7 +5994,7 @@ async fn test_abort_flow_after_task_finishes() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(10), finish_in_with: Some((Duration::milliseconds(20), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), @@ -6008,7 +6008,7 @@ async fn test_abort_flow_after_task_finishes() { dataset_id: Some(foo_id.clone()), run_since_start: Duration::milliseconds(90), finish_in_with: Some((Duration::milliseconds(20), TaskOutcome::Success(TaskResult::Empty))), - expected_logical_plan: LogicalPlan::UpdateDataset(UpdateDataset { + expected_logical_plan: LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset { dataset_id: foo_id.clone(), fetch_uncacheable: false }), diff --git a/src/domain/flow-system/services/tests/tests/utils/flow_config_test_listener.rs b/src/domain/flow-system/services/tests/tests/utils/flow_config_test_listener.rs index a725ad1d9c..14222e2d95 100644 --- a/src/domain/flow-system/services/tests/tests/utils/flow_config_test_listener.rs +++ b/src/domain/flow-system/services/tests/tests/utils/flow_config_test_listener.rs @@ -17,7 +17,7 @@ use messaging_outbox::{ MessageConsumer, MessageConsumerMeta, MessageConsumerT, - MessageConsumptionDurability, + MessageDeliveryMechanism, }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -33,7 +33,7 @@ pub(crate) struct FlowConfigTestListener { #[meta(MessageConsumerMeta { consumer_name: "FlowConfigTestListener", feeding_producers: 
&[MESSAGE_PRODUCER_KAMU_FLOW_CONFIGURATION_SERVICE], - durability: MessageConsumptionDurability::BestEffort, + delivery: MessageDeliveryMechanism::Immediate, })] impl FlowConfigTestListener { pub fn new() -> Self { diff --git a/src/domain/flow-system/services/tests/tests/utils/flow_harness_shared.rs b/src/domain/flow-system/services/tests/tests/utils/flow_harness_shared.rs index 925c521c3d..f70eaadf70 100644 --- a/src/domain/flow-system/services/tests/tests/utils/flow_harness_shared.rs +++ b/src/domain/flow-system/services/tests/tests/utils/flow_harness_shared.rs @@ -76,7 +76,7 @@ pub(crate) struct FlowHarnessOverrides { pub mandatory_throttling_period: Option, pub mock_dataset_changes: Option, pub custom_account_names: Vec, - pub is_multi_tenant: bool, + pub tenancy_config: TenancyConfig, } impl FlowHarness { @@ -150,13 +150,11 @@ impl FlowHarness { .add::() .add_value(fake_system_time_source.clone()) .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(overrides.is_multi_tenant), - ) + .add_value(overrides.tenancy_config) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add_value(mock_dataset_changes) .bind::() .add_value(CurrentAccountSubject::new_test()) diff --git a/src/domain/flow-system/services/tests/tests/utils/flow_system_test_listener.rs b/src/domain/flow-system/services/tests/tests/utils/flow_system_test_listener.rs index 5e5518a235..50be32a163 100644 --- a/src/domain/flow-system/services/tests/tests/utils/flow_system_test_listener.rs +++ b/src/domain/flow-system/services/tests/tests/utils/flow_system_test_listener.rs @@ -23,7 +23,7 @@ use messaging_outbox::{ MessageConsumer, MessageConsumerMeta, MessageConsumerT, - MessageConsumptionDurability, + MessageDeliveryMechanism, }; use opendatafabric::DatasetID; use time_source::FakeSystemTimeSource; @@ -52,7 +52,7 @@ struct FlowSystemTestListenerState { #[meta(MessageConsumerMeta { consumer_name: "FlowSystemTestListener", feeding_producers: &[MESSAGE_PRODUCER_KAMU_FLOW_EXECUTOR, MESSAGE_PRODUCER_KAMU_FLOW_PROGRESS_SERVICE], - durability: MessageConsumptionDurability::BestEffort, + delivery: MessageDeliveryMechanism::Immediate, })] impl FlowSystemTestListener { pub(crate) fn new( diff --git a/src/domain/flow-system/services/tests/tests/utils/task_driver.rs b/src/domain/flow-system/services/tests/tests/utils/task_driver.rs index 0a0bae2c2f..e648a21923 100644 --- a/src/domain/flow-system/services/tests/tests/utils/task_driver.rs +++ b/src/domain/flow-system/services/tests/tests/utils/task_driver.rs @@ -112,7 +112,7 @@ impl TaskDriver { assert_eq!(&ud.dataset_id, self.args.dataset_id.as_ref().unwrap()); } LogicalPlan::Probe(_) => assert!(self.args.dataset_id.is_none()), - LogicalPlan::HardCompactionDataset(_) | LogicalPlan::Reset(_) => (), + LogicalPlan::HardCompactDataset(_) | LogicalPlan::ResetDataset(_) => (), } } } diff --git a/src/domain/task-system/domain/Cargo.toml b/src/domain/task-system/domain/Cargo.toml index e8ac01fdd3..db4b883eff 100644 --- a/src/domain/task-system/domain/Cargo.toml +++ b/src/domain/task-system/domain/Cargo.toml @@ -25,6 +25,7 @@ doctest = false database-common = { workspace = true } enum-variants = { workspace = true } event-sourcing = { workspace = true } +internal-error = { workspace = true } messaging-outbox = { workspace = true } opendatafabric = { workspace = true } diff --git a/src/domain/task-system/domain/src/entities/logical_plan.rs 
b/src/domain/task-system/domain/src/entities/logical_plan.rs index 8915b63dfd..d9200c114f 100644 --- a/src/domain/task-system/domain/src/entities/logical_plan.rs +++ b/src/domain/task-system/domain/src/entities/logical_plan.rs @@ -20,13 +20,13 @@ use crate::TaskOutcome; pub enum LogicalPlan { /// Perform an update on a dataset like update from polling source or a /// derivative transform - UpdateDataset(UpdateDataset), + UpdateDataset(LogicalPlanUpdateDataset), /// A task that can be used for testing the scheduling system - Probe(Probe), + Probe(LogicalPlanProbe), /// Perform a dataset hard compaction - HardCompactionDataset(HardCompactionDataset), + HardCompactDataset(LogicalPlanHardCompactDataset), /// Perform a dataset resetting - Reset(ResetDataset), + ResetDataset(LogicalPlanResetDataset), } impl LogicalPlan { @@ -35,10 +35,8 @@ impl LogicalPlan { match self { LogicalPlan::UpdateDataset(upd) => Some(&upd.dataset_id), LogicalPlan::Probe(p) => p.dataset_id.as_ref(), - LogicalPlan::HardCompactionDataset(hard_compaction) => { - Some(&hard_compaction.dataset_id) - } - LogicalPlan::Reset(reset) => Some(&reset.dataset_id), + LogicalPlan::HardCompactDataset(hard_compaction) => Some(&hard_compaction.dataset_id), + LogicalPlan::ResetDataset(reset) => Some(&reset.dataset_id), } } } @@ -48,7 +46,7 @@ impl LogicalPlan { /// Perform an update on a dataset like update from polling source or a /// derivative transform #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct UpdateDataset { +pub struct LogicalPlanUpdateDataset { /// ID of the dataset to update pub dataset_id: DatasetID, pub fetch_uncacheable: bool, @@ -58,7 +56,7 @@ pub struct UpdateDataset { /// A task that can be used for testing the scheduling system #[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)] -pub struct Probe { +pub struct LogicalPlanProbe { /// ID of the dataset this task should be associated with pub dataset_id: Option, pub busy_time: Option, @@ -69,7 +67,7 @@ pub struct Probe { /// A task to perform a hard compaction of dataset #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct HardCompactionDataset { +pub struct LogicalPlanHardCompactDataset { pub dataset_id: DatasetID, pub max_slice_size: Option, pub max_slice_records: Option, @@ -80,7 +78,7 @@ pub struct HardCompactionDataset { /// A task to perform the resetting of a dataset #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ResetDataset { +pub struct LogicalPlanResetDataset { pub dataset_id: DatasetID, pub new_head_hash: Option, pub old_head_hash: Option, @@ -91,5 +89,11 @@ pub struct ResetDataset { // TODO: Replace with derive macro impl_enum_with_variants!(LogicalPlan); -impl_enum_variant!(LogicalPlan::UpdateDataset(UpdateDataset)); -impl_enum_variant!(LogicalPlan::Probe(Probe)); +impl_enum_variant!(LogicalPlan::UpdateDataset(LogicalPlanUpdateDataset)); +impl_enum_variant!(LogicalPlan::Probe(LogicalPlanProbe)); +impl_enum_variant!(LogicalPlan::ResetDataset(LogicalPlanResetDataset)); +impl_enum_variant!(LogicalPlan::HardCompactDataset( + LogicalPlanHardCompactDataset +)); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/task-system/domain/src/services/mod.rs b/src/domain/task-system/domain/src/services/mod.rs index 0129d8a8ca..58ed9cb87b 100644 --- a/src/domain/task-system/domain/src/services/mod.rs +++ b/src/domain/task-system/domain/src/services/mod.rs @@ -7,10 +7,12 @@ // the 
Business Source License, use of this software will be governed // by the Apache License, Version 2.0. +mod task_definition_planner; mod task_executor; -mod task_logical_plan_runner; +mod task_runner; mod task_scheduler; +pub use task_definition_planner::*; pub use task_executor::*; -pub use task_logical_plan_runner::*; +pub use task_runner::*; pub use task_scheduler::*; diff --git a/src/domain/task-system/domain/src/services/task_definition_planner.rs b/src/domain/task-system/domain/src/services/task_definition_planner.rs new file mode 100644 index 0000000000..6354a5333d --- /dev/null +++ b/src/domain/task-system/domain/src/services/task_definition_planner.rs @@ -0,0 +1,68 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use internal_error::InternalError; +use kamu_core::{CompactionOptions, PullOptions, PullPlanIterationJob, ResolvedDataset}; +use opendatafabric::Multihash; + +use crate::{LogicalPlan, LogicalPlanProbe}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait TaskDefinitionPlanner: Send + Sync { + async fn prepare_task_definition( + &self, + logical_plan: &LogicalPlan, + ) -> Result; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub enum TaskDefinition { + Probe(TaskDefinitionProbe), + Update(TaskDefinitionUpdate), + Reset(TaskDefinitionReset), + HardCompact(TaskDefinitionHardCompact), +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct TaskDefinitionProbe { + pub probe: LogicalPlanProbe, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct TaskDefinitionUpdate { + pub pull_options: PullOptions, + pub pull_job: PullPlanIterationJob, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct TaskDefinitionReset { + pub target: ResolvedDataset, + pub new_head_hash: Option, + pub old_head_hash: Option, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct TaskDefinitionHardCompact { + pub target: ResolvedDataset, + pub compaction_options: CompactionOptions, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/task-system/domain/src/services/task_logical_plan_runner.rs b/src/domain/task-system/domain/src/services/task_runner.rs similarity index 77% rename from src/domain/task-system/domain/src/services/task_logical_plan_runner.rs rename to src/domain/task-system/domain/src/services/task_runner.rs index 3a39935020..f42b59605d 100644 --- a/src/domain/task-system/domain/src/services/task_logical_plan_runner.rs +++ b/src/domain/task-system/domain/src/services/task_runner.rs @@ -9,13 +9,15 @@ use event_sourcing::InternalError; -use crate::{LogicalPlan, 
TaskOutcome}; +use super::TaskDefinition; +use crate::TaskOutcome; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[async_trait::async_trait] -pub trait TaskLogicalPlanRunner: Send + Sync { - async fn run_plan(&self, logical_plan: &LogicalPlan) -> Result; +pub trait TaskRunner: Send + Sync { + async fn run_task(&self, task_definition: TaskDefinition) + -> Result; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/task-system/services/Cargo.toml b/src/domain/task-system/services/Cargo.toml index 61e4079866..abbae33233 100644 --- a/src/domain/task-system/services/Cargo.toml +++ b/src/domain/task-system/services/Cargo.toml @@ -43,8 +43,13 @@ tokio = { version = "1", default-features = false } tracing = { version = "0.1", default-features = false } [dev-dependencies] +kamu = { workspace = true } +kamu-accounts = { workspace = true } +kamu-datasets-services = { workspace = true } +kamu-datasets-inmem = { workspace = true } kamu-task-system-inmem = { workspace = true } chrono = { version = "0.4", default-features = false } mockall = "0.13" +tempfile = "3" test-log = { version = "0.2", features = ["trace"] } diff --git a/src/domain/task-system/services/src/dependencies.rs b/src/domain/task-system/services/src/dependencies.rs index e39492404d..7edf1c3545 100644 --- a/src/domain/task-system/services/src/dependencies.rs +++ b/src/domain/task-system/services/src/dependencies.rs @@ -16,7 +16,8 @@ use crate::*; pub fn register_dependencies(catalog_builder: &mut CatalogBuilder) { catalog_builder.add::(); catalog_builder.add::(); - catalog_builder.add::(); + catalog_builder.add::(); + catalog_builder.add::(); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/task-system/services/src/lib.rs b/src/domain/task-system/services/src/lib.rs index 6f22e3a9fc..2b53b3bcc4 100644 --- a/src/domain/task-system/services/src/lib.rs +++ b/src/domain/task-system/services/src/lib.rs @@ -11,11 +11,13 @@ pub use kamu_task_system as domain; mod dependencies; +mod task_definition_planner_impl; mod task_executor_impl; -mod task_logical_plan_runner_impl; +mod task_runner_impl; mod task_scheduler_impl; pub use dependencies::*; +pub use task_definition_planner_impl::*; pub use task_executor_impl::*; -pub use task_logical_plan_runner_impl::*; +pub use task_runner_impl::*; pub use task_scheduler_impl::*; diff --git a/src/domain/task-system/services/src/task_definition_planner_impl.rs b/src/domain/task-system/services/src/task_definition_planner_impl.rs new file mode 100644 index 0000000000..641fa6f970 --- /dev/null +++ b/src/domain/task-system/services/src/task_definition_planner_impl.rs @@ -0,0 +1,172 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::collections::HashMap; +use std::sync::Arc; + +use dill::*; +use internal_error::InternalError; +use kamu_core::*; +use kamu_datasets::{DatasetEnvVar, DatasetEnvVarService}; +use kamu_task_system::*; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct TaskDefinitionPlannerImpl { + dataset_registry: Arc, + dataset_env_vars_svc: Arc, + pull_request_planner: Arc, + tenancy_config: Arc, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn TaskDefinitionPlanner)] +impl TaskDefinitionPlannerImpl { + pub fn new( + dataset_registry: Arc, + dataset_env_vars_svc: Arc, + pull_request_planner: Arc, + tenancy_config: Arc, + ) -> Self { + Self { + dataset_registry, + dataset_env_vars_svc, + pull_request_planner, + tenancy_config, + } + } + + #[tracing::instrument(level = "debug", skip_all, fields(?probe_plan))] + async fn plan_probe( + &self, + probe_plan: &LogicalPlanProbe, + ) -> Result { + Ok(TaskDefinition::Probe(TaskDefinitionProbe { + probe: probe_plan.clone(), + })) + } + + #[tracing::instrument(level = "debug", skip_all, fields(?args))] + async fn plan_update( + &self, + args: &LogicalPlanUpdateDataset, + ) -> Result { + let dataset_env_vars = self + .dataset_env_vars_svc + .get_all_dataset_env_vars_by_dataset_id(&args.dataset_id, None) + .await + .map(|listing| listing.list) + .int_err()?; + + let pull_options = PullOptions { + ingest_options: PollingIngestOptions { + dataset_env_vars: dataset_env_vars + .into_iter() + .map(|dataset_env_var| (dataset_env_var.key.clone(), dataset_env_var)) + .collect::>(), + fetch_uncacheable: args.fetch_uncacheable, + ..Default::default() + }, + ..Default::default() + }; + + let plan_res = self + .pull_request_planner + .build_pull_plan( + PullRequest::local(args.dataset_id.as_local_ref()), + &pull_options, + *self.tenancy_config, + ) + .await; + + match plan_res { + Ok(pull_job) => Ok(TaskDefinition::Update(TaskDefinitionUpdate { + pull_options, + pull_job, + })), + Err(e) => { + assert!(e.result.is_err()); + tracing::error!( + args = ?args, + error = ?e, + "Update failed", + ); + Err("Update task planning failed".int_err()) + } + } + } + + #[tracing::instrument(level = "debug", skip_all, fields(?args))] + async fn plan_reset( + &self, + args: &LogicalPlanResetDataset, + ) -> Result { + let target = self + .dataset_registry + .get_dataset_by_ref(&args.dataset_id.as_local_ref()) + .await + .int_err()?; + + Ok(TaskDefinition::Reset(TaskDefinitionReset { + target, + new_head_hash: args.new_head_hash.clone(), + old_head_hash: args.old_head_hash.clone(), + })) + } + + #[tracing::instrument(level = "debug", skip_all, fields(?args))] + async fn plan_hard_compaction( + &self, + args: &LogicalPlanHardCompactDataset, + ) -> Result { + let target = self + .dataset_registry + .get_dataset_by_ref(&args.dataset_id.as_local_ref()) + .await + .int_err()?; + + let compaction_options = CompactionOptions { + max_slice_size: args.max_slice_size, + max_slice_records: args.max_slice_records, + keep_metadata_only: args.keep_metadata_only, + }; + + Ok(TaskDefinition::HardCompact(TaskDefinitionHardCompact { + target, + compaction_options, + })) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl TaskDefinitionPlanner for TaskDefinitionPlannerImpl { + async fn 
prepare_task_definition( + &self, + logical_plan: &LogicalPlan, + ) -> Result { + tracing::debug!(?logical_plan, "Preparing task definition"); + + let task_definition = match logical_plan { + LogicalPlan::Probe(probe) => self.plan_probe(probe).await?, + LogicalPlan::UpdateDataset(upd) => self.plan_update(upd).await?, + LogicalPlan::ResetDataset(reset) => self.plan_reset(reset).await?, + LogicalPlan::HardCompactDataset(compaction) => { + self.plan_hard_compaction(compaction).await? + } + }; + + Ok(task_definition) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/task-system/services/src/task_executor_impl.rs b/src/domain/task-system/services/src/task_executor_impl.rs index 2a8a3dea24..3350e2b9d5 100644 --- a/src/domain/task-system/services/src/task_executor_impl.rs +++ b/src/domain/task-system/services/src/task_executor_impl.rs @@ -9,7 +9,7 @@ use std::sync::Arc; -use database_common::PaginationOpts; +use database_common::{DatabaseTransactionRunner, PaginationOpts}; use database_common_macros::{transactional_method1, transactional_method2}; use dill::*; use init_on_startup::{InitOnStartup, InitOnStartupMeta}; @@ -22,7 +22,7 @@ use tracing::Instrument as _; pub struct TaskExecutorImpl { catalog: Catalog, - task_logical_plan_runner: Arc, + task_runner: Arc, time_source: Arc, } @@ -40,12 +40,12 @@ pub struct TaskExecutorImpl { impl TaskExecutorImpl { pub fn new( catalog: Catalog, - task_logical_plan_runner: Arc, + task_runner: Arc, time_source: Arc, ) -> Self { Self { catalog, - task_logical_plan_runner, + task_runner, time_source, } } @@ -143,14 +143,22 @@ impl TaskExecutorImpl { tracing::debug!( task_id = %task.task_id, logical_plan = ?task.logical_plan, - "Running task", + "Preparing task to run", ); - // Run task via logical plan - let task_run_result = self - .task_logical_plan_runner - .run_plan(&task.logical_plan) - .await; + // Prepare task definition (requires transaction) + let task_definition = DatabaseTransactionRunner::new(self.catalog.clone()) + .transactional_with( + |task_definition_planner: Arc| async move { + task_definition_planner + .prepare_task_definition(&task.logical_plan) + .await + }, + ) + .await?; + + // Run task via definition + let task_run_result = self.task_runner.run_task(task_definition).await; // Deal with errors: we should not interrupt the main loop if task fails let task_outcome = match task_run_result { diff --git a/src/domain/task-system/services/src/task_logical_plan_runner_impl.rs b/src/domain/task-system/services/src/task_logical_plan_runner_impl.rs deleted file mode 100644 index 4fad2235ea..0000000000 --- a/src/domain/task-system/services/src/task_logical_plan_runner_impl.rs +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0.
- -use std::collections::HashMap; -use std::sync::Arc; - -use database_common_macros::transactional_method1; -use dill::*; -use internal_error::InternalError; -use kamu_core::{ - CompactionOptions, - CompactionService, - DatasetRepository, - PollingIngestOptions, - PullError, - PullOptions, - PullService, - ResetError, - ResetService, - TransformError, -}; -use kamu_datasets::{DatasetEnvVar, DatasetEnvVarService}; -use kamu_task_system::*; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -pub struct TaskLogicalPlanRunnerImpl { - catalog: Catalog, -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[component(pub)] -#[interface(dyn TaskLogicalPlanRunner)] -impl TaskLogicalPlanRunnerImpl { - pub fn new(catalog: Catalog) -> Self { - Self { catalog } - } - - async fn run_probe(&self, probe_plan: &Probe) -> Result { - if let Some(busy_time) = &probe_plan.busy_time { - tokio::time::sleep(*busy_time).await; - } - Ok(probe_plan - .end_with_outcome - .clone() - .unwrap_or(TaskOutcome::Success(TaskResult::Empty))) - } - - async fn run_update(&self, args: &UpdateDataset) -> Result { - let dataset_env_vars = self.query_dataset_env_vars(args).await?; - let dataset_env_vars_hash_map = dataset_env_vars - .into_iter() - .map(|dataset_env_var| (dataset_env_var.key.clone(), dataset_env_var)) - .collect::>(); - - let pull_options = PullOptions { - ingest_options: PollingIngestOptions { - dataset_env_vars: dataset_env_vars_hash_map, - fetch_uncacheable: args.fetch_uncacheable, - ..Default::default() - }, - ..Default::default() - }; - - let pull_svc = self.catalog.get_one::().int_err()?; - let maybe_pull_result = pull_svc - .pull(&args.dataset_id.as_any_ref(), pull_options, None) - .await; - - match maybe_pull_result { - Ok(pull_result) => Ok(TaskOutcome::Success(TaskResult::UpdateDatasetResult( - TaskUpdateDatasetResult { pull_result }, - ))), - Err(err) => match err { - PullError::TransformError(TransformError::InvalidInputInterval(e)) => { - Ok(TaskOutcome::Failed(TaskError::UpdateDatasetError( - UpdateDatasetTaskError::InputDatasetCompacted(InputDatasetCompactedError { - dataset_id: e.input_dataset_id, - }), - ))) - } - err => { - tracing::error!( - args = ?args, - error = ?err, - error_msg = %err, - "Update failed", - ); - - Ok(TaskOutcome::Failed(TaskError::Empty)) - } - }, - } - } - - #[transactional_method1(dataset_env_vars_svc: Arc)] - async fn query_dataset_env_vars( - &self, - args: &UpdateDataset, - ) -> Result, InternalError> { - dataset_env_vars_svc - .get_all_dataset_env_vars_by_dataset_id(&args.dataset_id, None) - .await - .map(|listing| listing.list) - .int_err() - } - - async fn run_reset(&self, args: &ResetDataset) -> Result { - let reset_svc = self.catalog.get_one::().int_err()?; - let dataset_repo = self.catalog.get_one::().int_err()?; - let dataset_handle = dataset_repo - .resolve_dataset_ref(&args.dataset_id.as_local_ref()) - .await - .int_err()?; - - let reset_result_maybe = reset_svc - .reset_dataset( - &dataset_handle, - args.new_head_hash.as_ref(), - args.old_head_hash.as_ref(), - ) - .await; - match reset_result_maybe { - Ok(new_head) => Ok(TaskOutcome::Success(TaskResult::ResetDatasetResult( - TaskResetDatasetResult { new_head }, - ))), - Err(err) => match err { - ResetError::BlockNotFound(_) => Ok(TaskOutcome::Failed( - TaskError::ResetDatasetError(ResetDatasetTaskError::ResetHeadNotFound), - )), - err => { - tracing::error!( - 
args = ?args, - error = ?err, - error_msg = %err, - "Reset failed", - ); - - Ok(TaskOutcome::Failed(TaskError::Empty)) - } - }, - } - } - - async fn run_hard_compaction( - &self, - args: &HardCompactionDataset, - ) -> Result { - let compaction_svc = self.catalog.get_one::().int_err()?; - let dataset_repo = self.catalog.get_one::().int_err()?; - let dataset_handle = dataset_repo - .resolve_dataset_ref(&args.dataset_id.as_local_ref()) - .await - .int_err()?; - - let compaction_result = compaction_svc - .compact_dataset( - &dataset_handle, - CompactionOptions { - max_slice_size: args.max_slice_size, - max_slice_records: args.max_slice_records, - keep_metadata_only: args.keep_metadata_only, - }, - None, - ) - .await; - - match compaction_result { - Ok(result) => Ok(TaskOutcome::Success(TaskResult::CompactionDatasetResult( - result.into(), - ))), - Err(err) => { - tracing::error!( - args = ?args, - error = ?err, - error_msg = %err, - "Hard compaction failed", - ); - - Ok(TaskOutcome::Failed(TaskError::Empty)) - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[async_trait::async_trait] -impl TaskLogicalPlanRunner for TaskLogicalPlanRunnerImpl { - #[tracing::instrument(level = "debug", skip_all)] - async fn run_plan(&self, logical_plan: &LogicalPlan) -> Result { - tracing::debug!(?logical_plan, "Running task plan"); - - let task_outcome = match logical_plan { - LogicalPlan::UpdateDataset(upd) => self.run_update(upd).await?, - LogicalPlan::Probe(probe) => self.run_probe(probe).await?, - LogicalPlan::Reset(reset) => self.run_reset(reset).await?, - LogicalPlan::HardCompactionDataset(compaction) => { - self.run_hard_compaction(compaction).await? - } - }; - - Ok(task_outcome) - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/task-system/services/src/task_runner_impl.rs b/src/domain/task-system/services/src/task_runner_impl.rs new file mode 100644 index 0000000000..8afd2e15e0 --- /dev/null +++ b/src/domain/task-system/services/src/task_runner_impl.rs @@ -0,0 +1,236 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::sync::Arc; + +use dill::*; +use internal_error::InternalError; +use kamu_core::*; +use kamu_task_system::*; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct TaskRunnerImpl { + polling_ingest_service: Arc, + transform_elaboration_service: Arc, + transform_execution_service: Arc, + reset_service: Arc, + compaction_service: Arc, +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn TaskRunner)] +impl TaskRunnerImpl { + pub fn new( + polling_ingest_service: Arc, + transform_elaboration_service: Arc, + transform_execution_service: Arc, + reset_service: Arc, + compaction_service: Arc, + ) -> Self { + Self { + polling_ingest_service, + transform_elaboration_service, + transform_execution_service, + reset_service, + compaction_service, + } + } + + #[tracing::instrument(level = "debug", skip_all, fields(?task_probe))] + async fn run_probe( + &self, + task_probe: TaskDefinitionProbe, + ) -> Result { + if let Some(busy_time) = task_probe.probe.busy_time { + tokio::time::sleep(busy_time).await; + } + Ok(task_probe + .probe + .end_with_outcome + .clone() + .unwrap_or(TaskOutcome::Success(TaskResult::Empty))) + } + + #[tracing::instrument(level = "debug", skip_all, fields(?task_update))] + async fn run_update( + &self, + task_update: TaskDefinitionUpdate, + ) -> Result { + match task_update.pull_job { + PullPlanIterationJob::Ingest(ingest_item) => { + self.run_ingest_update(ingest_item, task_update.pull_options.ingest_options) + .await + } + PullPlanIterationJob::Transform(transform_item) => { + self.run_transform_update(transform_item).await + } + PullPlanIterationJob::Sync(_) => { + unreachable!("No Sync jobs possible from update requests"); + } + } + } + + async fn run_ingest_update( + &self, + ingest_item: PullIngestItem, + ingest_options: PollingIngestOptions, + ) -> Result { + let ingest_response = self + .polling_ingest_service + .ingest(ingest_item.target, ingest_options, None) + .await; + match ingest_response { + Ok(ingest_result) => Ok(TaskOutcome::Success(TaskResult::UpdateDatasetResult( + TaskUpdateDatasetResult { + pull_result: ingest_result.into(), + }, + ))), + Err(_) => Ok(TaskOutcome::Failed(TaskError::Empty)), + } + } + + async fn run_transform_update( + &self, + transform_item: PullTransformItem, + ) -> Result { + let transform_elaboration = match self + .transform_elaboration_service + .elaborate_transform( + transform_item.target.clone(), + transform_item.plan, + TransformOptions::default(), + None, + ) + .await + { + Ok(request) => Ok(request), + // Special case: input dataset compacted + Err(TransformElaborateError::InvalidInputInterval(e)) => { + return Ok(TaskOutcome::Failed(TaskError::UpdateDatasetError( + UpdateDatasetTaskError::InputDatasetCompacted(InputDatasetCompactedError { + dataset_id: e.input_dataset_id, + }), + ))); + } + Err(e) => { + tracing::error!(error = ?e, "Update failed"); + Err("Transform request elaboration failed".int_err()) + } + }?; + + match transform_elaboration { + TransformElaboration::Elaborated(transform_plan) => { + let (_, execution_result) = self + .transform_execution_service + .execute_transform(transform_item.target, transform_plan, None) + .await; + + match execution_result { + Ok(transform_result) => Ok(TaskOutcome::Success( + TaskResult::UpdateDatasetResult(TaskUpdateDatasetResult { + pull_result: transform_result.into(), + }), + 
)), + Err(e) => { + tracing::error!(error = ?e, "Transform execution failed"); + Ok(TaskOutcome::Failed(TaskError::Empty)) + } + } + } + TransformElaboration::UpToDate => Ok(TaskOutcome::Success(TaskResult::Empty)), + } + } + + #[tracing::instrument(level = "debug", skip_all, fields(?task_reset))] + async fn run_reset( + &self, + task_reset: TaskDefinitionReset, + ) -> Result { + let reset_result_maybe = self + .reset_service + .reset_dataset( + task_reset.target, + task_reset.new_head_hash.as_ref(), + task_reset.old_head_hash.as_ref(), + ) + .await; + match reset_result_maybe { + Ok(new_head) => Ok(TaskOutcome::Success(TaskResult::ResetDatasetResult( + TaskResetDatasetResult { new_head }, + ))), + Err(err) => match err { + ResetError::BlockNotFound(_) => Ok(TaskOutcome::Failed( + TaskError::ResetDatasetError(ResetDatasetTaskError::ResetHeadNotFound), + )), + err => { + tracing::error!( + error = ?err, + error_msg = %err, + "Reset failed", + ); + + Ok(TaskOutcome::Failed(TaskError::Empty)) + } + }, + } + } + + #[tracing::instrument(level = "debug", skip_all, fields(?task_compact))] + async fn run_hard_compaction( + &self, + task_compact: TaskDefinitionHardCompact, + ) -> Result { + let compaction_result = self + .compaction_service + .compact_dataset(task_compact.target, task_compact.compaction_options, None) + .await; + + match compaction_result { + Ok(result) => Ok(TaskOutcome::Success(TaskResult::CompactionDatasetResult( + result.into(), + ))), + Err(err) => { + tracing::error!( + error = ?err, + error_msg = %err, + "Hard compaction failed", + ); + + Ok(TaskOutcome::Failed(TaskError::Empty)) + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl TaskRunner for TaskRunnerImpl { + #[tracing::instrument(level = "debug", skip_all)] + async fn run_task( + &self, + task_definition: TaskDefinition, + ) -> Result { + tracing::debug!(?task_definition, "Running task"); + + let task_outcome = match task_definition { + TaskDefinition::Probe(td_probe) => self.run_probe(td_probe).await?, + TaskDefinition::Update(td_update) => self.run_update(td_update).await?, + TaskDefinition::Reset(td_reset) => self.run_reset(td_reset).await?, + TaskDefinition::HardCompact(td_compact) => self.run_hard_compaction(td_compact).await?, + }; + + Ok(task_outcome) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/task-system/services/tests/tests/test_task_aggregate.rs b/src/domain/task-system/services/tests/tests/test_task_aggregate.rs index 57600f2b69..428980285c 100644 --- a/src/domain/task-system/services/tests/tests/test_task_aggregate.rs +++ b/src/domain/task-system/services/tests/tests/test_task_aggregate.rs @@ -24,7 +24,7 @@ async fn test_task_agg_create_new() { let mut task = Task::new( Utc::now(), event_store.new_task_id().await.unwrap(), - Probe::default().into(), + LogicalPlanProbe::default().into(), Some(metadata.clone()), ); @@ -38,7 +38,10 @@ async fn test_task_agg_create_new() { let task = Task::load(task.task_id, &event_store).await.unwrap(); assert_eq!(task.status(), TaskStatus::Queued); - assert_eq!(task.logical_plan, LogicalPlan::Probe(Probe::default())); + assert_eq!( + task.logical_plan, + LogicalPlan::Probe(LogicalPlanProbe::default()) + ); assert_eq!(task.metadata, metadata); } @@ -49,7 +52,12 @@ async fn test_task_save_load_update() { let event_store = 
InMemoryTaskEventStore::new(); let task_id = event_store.new_task_id().await.unwrap(); - let mut task = Task::new(Utc::now(), task_id, Probe::default().into(), None); + let mut task = Task::new( + Utc::now(), + task_id, + LogicalPlanProbe::default().into(), + None, + ); task.save(&event_store).await.unwrap(); task.run(Utc::now()).unwrap(); @@ -97,7 +105,7 @@ async fn test_task_agg_illegal_transition() { let mut task = Task::new( Utc::now(), event_store.new_task_id().await.unwrap(), - Probe::default().into(), + LogicalPlanProbe::default().into(), None, ); task.finish(Utc::now(), TaskOutcome::Cancelled).unwrap(); @@ -114,7 +122,7 @@ async fn test_task_requeue() { let mut task = Task::new( Utc::now(), event_store.new_task_id().await.unwrap(), - Probe::default().into(), + LogicalPlanProbe::default().into(), None, ); task.run(Utc::now()).unwrap(); diff --git a/src/domain/task-system/services/tests/tests/test_task_executor_impl.rs b/src/domain/task-system/services/tests/tests/test_task_executor_impl.rs index afecb59075..8c7fea6188 100644 --- a/src/domain/task-system/services/tests/tests/test_task_executor_impl.rs +++ b/src/domain/task-system/services/tests/tests/test_task_executor_impl.rs @@ -11,19 +11,40 @@ use std::assert_matches::assert_matches; use std::sync::Arc; use database_common::NoOpDatabasePlugin; -use dill::{Catalog, CatalogBuilder}; +use dill::{Catalog, CatalogBuilder, Component}; +use kamu::utils::ipfs_wrapper::IpfsClient; +use kamu::{ + DatasetFactoryImpl, + DatasetRegistryRepoBridge, + DatasetRepositoryLocalFs, + DatasetRepositoryWriter, + IpfsGateway, + PullRequestPlannerImpl, + RemoteAliasesRegistryImpl, + RemoteReposDir, + RemoteRepositoryRegistryImpl, + SyncRequestBuilder, + TransformRequestPlannerImpl, +}; +use kamu_accounts::CurrentAccountSubject; +use kamu_core::auth::DummyOdfServerAccessTokenResolver; +use kamu_core::{DatasetRepository, TenancyConfig}; +use kamu_datasets::DatasetEnvVarsConfig; +use kamu_datasets_inmem::InMemoryDatasetEnvVarRepository; +use kamu_datasets_services::DatasetEnvVarServiceImpl; use kamu_task_system::*; use kamu_task_system_inmem::InMemoryTaskEventStore; use kamu_task_system_services::*; use messaging_outbox::{MockOutbox, Outbox}; use mockall::predicate::{eq, function}; +use tempfile::TempDir; use time_source::SystemTimeSourceDefault; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] async fn test_pre_run_requeues_running_tasks() { - let harness = TaskExecutorHarness::new(MockOutbox::new(), MockTaskLogicalPlanRunner::new()); + let harness = TaskExecutorHarness::new(MockOutbox::new(), MockTaskRunner::new()); // Schedule 3 tasks let task_id_1 = harness.schedule_probe_task().await; @@ -67,15 +88,15 @@ async fn test_run_single_task() { TaskExecutorHarness::add_outbox_task_expectations(&mut mock_outbox, TaskID::new(0)); // Expect logical plan runner to run probe - let mut mock_plan_runner = MockTaskLogicalPlanRunner::new(); + let mut mock_task_runner = MockTaskRunner::new(); TaskExecutorHarness::add_run_probe_plan_expectations( - &mut mock_plan_runner, - Probe::default(), + &mut mock_task_runner, + LogicalPlanProbe::default(), 1, ); // Schedule the only task - let harness = TaskExecutorHarness::new(mock_outbox, mock_plan_runner); + let harness = TaskExecutorHarness::new(mock_outbox, mock_task_runner); let task_id = harness.schedule_probe_task().await; let task = harness.get_task(task_id).await; assert_eq!(task.status(), TaskStatus::Queued); @@ -98,15 
+119,15 @@ async fn test_run_two_of_three_tasks() { TaskExecutorHarness::add_outbox_task_expectations(&mut mock_outbox, TaskID::new(1)); // Expect logical plan runner to run probe twice - let mut mock_plan_runner = MockTaskLogicalPlanRunner::new(); + let mut mock_task_runner = MockTaskRunner::new(); TaskExecutorHarness::add_run_probe_plan_expectations( - &mut mock_plan_runner, - Probe::default(), + &mut mock_task_runner, + LogicalPlanProbe::default(), 2, ); // Schedule 3 tasks - let harness = TaskExecutorHarness::new(mock_outbox, mock_plan_runner); + let harness = TaskExecutorHarness::new(mock_outbox, mock_task_runner); let task_id_1 = harness.schedule_probe_task().await; let task_id_2 = harness.schedule_probe_task().await; let task_id_3 = harness.schedule_probe_task().await; @@ -135,22 +156,51 @@ async fn test_run_two_of_three_tasks() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct TaskExecutorHarness { + _tempdir: TempDir, catalog: Catalog, task_executor: Arc, task_scheduler: Arc, } impl TaskExecutorHarness { - pub fn new(mock_outbox: MockOutbox, mock_plan_runner: MockTaskLogicalPlanRunner) -> Self { + pub fn new(mock_outbox: MockOutbox, mock_task_runner: MockTaskRunner) -> Self { + let tempdir = tempfile::tempdir().unwrap(); + + let datasets_dir = tempdir.path().join("datasets"); + std::fs::create_dir(&datasets_dir).unwrap(); + + let repos_dir = tempdir.path().join("repos"); + std::fs::create_dir(&repos_dir).unwrap(); + let mut b = CatalogBuilder::new(); b.add::() .add::() .add::() - .add_value(mock_plan_runner) - .bind::() + .add::() + .add_value(mock_task_runner) + .bind::() .add_value(mock_outbox) .bind::() - .add::(); + .add::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add_value(RemoteReposDir::new(repos_dir)) + .add::() + .add_value(IpfsGateway::default()) + .add_value(IpfsClient::default()) + .add::() + .add::() + .add::() + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) + .bind::() + .bind::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_value(TenancyConfig::SingleTenant) + .add_value(DatasetEnvVarsConfig::sample()); NoOpDatabasePlugin::init_database_components(&mut b); @@ -160,6 +210,7 @@ impl TaskExecutorHarness { let task_scheduler = catalog.get_one().unwrap(); Self { + _tempdir: tempdir, catalog, task_executor, task_scheduler, @@ -168,7 +219,13 @@ impl TaskExecutorHarness { async fn schedule_probe_task(&self) -> TaskID { self.task_scheduler - .create_task(Probe { ..Probe::default() }.into(), None) + .create_task( + LogicalPlanProbe { + ..LogicalPlanProbe::default() + } + .into(), + None, + ) .await .unwrap() .task_id @@ -221,13 +278,19 @@ impl TaskExecutorHarness { } fn add_run_probe_plan_expectations( - mock_plan_runner: &mut MockTaskLogicalPlanRunner, - probe: Probe, + mock_task_runner: &mut MockTaskRunner, + probe: LogicalPlanProbe, times: usize, ) { - mock_plan_runner - .expect_run_plan() - .with(eq(LogicalPlan::Probe(probe))) + mock_task_runner + .expect_run_task() + .withf(move |td| { + matches!( + td, + TaskDefinition::Probe(TaskDefinitionProbe { probe: probe_ }) + if probe_ == &probe + ) + }) .times(times) .returning(|_| Ok(TaskOutcome::Success(TaskResult::Empty))); } @@ -236,11 +299,11 @@ impl TaskExecutorHarness { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// mockall::mock! 
{ - pub TaskLogicalPlanRunner {} + pub TaskRunner {} #[async_trait::async_trait] - impl TaskLogicalPlanRunner for TaskLogicalPlanRunner { - async fn run_plan(&self, logical_plan: &LogicalPlan) -> Result; + impl TaskRunner for TaskRunner { + async fn run_task(&self, task_definition: TaskDefinition) -> Result; } } diff --git a/src/domain/task-system/services/tests/tests/test_task_scheduler_impl.rs b/src/domain/task-system/services/tests/tests/test_task_scheduler_impl.rs index 57bd28fad5..e1894f6299 100644 --- a/src/domain/task-system/services/tests/tests/test_task_scheduler_impl.rs +++ b/src/domain/task-system/services/tests/tests/test_task_scheduler_impl.rs @@ -10,7 +10,14 @@ use std::assert_matches::assert_matches; use std::sync::Arc; -use kamu_task_system::{LogicalPlan, Probe, TaskMetadata, TaskScheduler, TaskState, TaskStatus}; +use kamu_task_system::{ + LogicalPlan, + LogicalPlanProbe, + TaskMetadata, + TaskScheduler, + TaskState, + TaskStatus, +}; use kamu_task_system_inmem::InMemoryTaskEventStore; use kamu_task_system_services::TaskSchedulerImpl; use time_source::SystemTimeSourceStub; @@ -21,7 +28,10 @@ use time_source::SystemTimeSourceStub; async fn test_creates_task() { let task_sched = create_task_scheduler(); - let logical_plan_expected: LogicalPlan = Probe { ..Probe::default() }.into(); + let logical_plan_expected: LogicalPlan = LogicalPlanProbe { + ..LogicalPlanProbe::default() + } + .into(); let metadata_expected = TaskMetadata::from(vec![("foo", "x"), ("bar", "y")]); @@ -55,13 +65,25 @@ async fn test_queues_tasks() { assert!(maybe_task_0.is_none()); let task_id_1 = task_sched - .create_task(Probe { ..Probe::default() }.into(), None) + .create_task( + LogicalPlanProbe { + ..LogicalPlanProbe::default() + } + .into(), + None, + ) .await .unwrap() .task_id; let task_id_2 = task_sched - .create_task(Probe { ..Probe::default() }.into(), None) + .create_task( + LogicalPlanProbe { + ..LogicalPlanProbe::default() + } + .into(), + None, + ) .await .unwrap() .task_id; @@ -83,13 +105,25 @@ async fn test_task_taken_task_is_running() { let task_sched = create_task_scheduler(); let task_id_1 = task_sched - .create_task(Probe { ..Probe::default() }.into(), None) + .create_task( + LogicalPlanProbe { + ..LogicalPlanProbe::default() + } + .into(), + None, + ) .await .unwrap() .task_id; let task_id_2 = task_sched - .create_task(Probe { ..Probe::default() }.into(), None) + .create_task( + LogicalPlanProbe { + ..LogicalPlanProbe::default() + } + .into(), + None, + ) .await .unwrap() .task_id; @@ -114,13 +148,25 @@ async fn test_task_cancellation() { let task_sched = create_task_scheduler(); let task_id_1 = task_sched - .create_task(Probe { ..Probe::default() }.into(), None) + .create_task( + LogicalPlanProbe { + ..LogicalPlanProbe::default() + } + .into(), + None, + ) .await .unwrap() .task_id; let task_id_2 = task_sched - .create_task(Probe { ..Probe::default() }.into(), None) + .create_task( + LogicalPlanProbe { + ..LogicalPlanProbe::default() + } + .into(), + None, + ) .await .unwrap() .task_id; diff --git a/src/e2e/app/cli/repo-tests/src/commands/test_compact_command.rs b/src/e2e/app/cli/repo-tests/src/commands/test_compact_command.rs index 7b5292a5f9..ce4261cf89 100644 --- a/src/e2e/app/cli/repo-tests/src/commands/test_compact_command.rs +++ b/src/e2e/app/cli/repo-tests/src/commands/test_compact_command.rs @@ -145,7 +145,7 @@ pub async fn test_compact_verify(kamu: KamuCliPuppet) { ], None, Some([ - "verify with dataset_ref: player-scores", + "verify with target_alias: player-scores", "1 
dataset(s) were compacted", ]), ) diff --git a/src/infra/core/Cargo.toml b/src/infra/core/Cargo.toml index 830898ec64..d9424e41c1 100644 --- a/src/infra/core/Cargo.toml +++ b/src/infra/core/Cargo.toml @@ -154,6 +154,7 @@ libc = "0.2" # For getting uid:gid [dev-dependencies] database-common = { workspace = true } +kamu = { workspace = true, features = ["testing"] } kamu-accounts-inmem = { workspace = true } kamu-accounts-services = { workspace = true } kamu-data-utils = { workspace = true, features = ["testing"] } @@ -164,9 +165,11 @@ datafusion = { version = "42", default-features = false, features = [ "parquet", ] } filetime = "0.2" +fs_extra = "1.3" indoc = "2" mockall = { version = "0.13", default-features = false } nanoid = "0.4.0" +oop = "0.0.2" pretty_assertions = { version = "1" } test-group = { version = "1" } test-log = { version = "0.2", features = ["trace"] } diff --git a/src/infra/core/src/compaction_service_impl.rs b/src/infra/core/src/compaction_service_impl.rs index 8addd053f5..cd76b2a93d 100644 --- a/src/infra/core/src/compaction_service_impl.rs +++ b/src/infra/core/src/compaction_service_impl.rs @@ -18,7 +18,6 @@ use dill::{component, interface}; use domain::{ CompactionError, CompactionListener, - CompactionMultiListener, CompactionOptions, CompactionPhase, CompactionResult, @@ -33,10 +32,7 @@ use internal_error::ResultIntoInternal; use kamu_core::*; use opendatafabric::{ Checkpoint, - DatasetHandle, DatasetKind, - DatasetRef, - DatasetRefPattern, DatasetVocabulary, MetadataEvent, Multihash, @@ -48,12 +44,11 @@ use random_names::get_random_name; use time_source::SystemTimeSource; use url::Url; -use crate::utils::datasets_filtering::filter_datasets_by_local_pattern; use crate::*; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + pub struct CompactionServiceImpl { - dataset_repo: Arc, - dataset_authorizer: Arc, object_store_registry: Arc, time_source: Arc, run_info_dir: Arc, @@ -99,19 +94,17 @@ struct ChainFilesInfo { data_slice_batches: Vec, } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[component(pub)] #[interface(dyn CompactionService)] impl CompactionServiceImpl { pub fn new( - dataset_authorizer: Arc, - dataset_repo: Arc, object_store_registry: Arc, time_source: Arc, run_info_dir: Arc, ) -> Self { Self { - dataset_repo, - dataset_authorizer, object_store_registry, time_source, run_info_dir, @@ -120,7 +113,7 @@ impl CompactionServiceImpl { async fn gather_chain_info( &self, - dataset: Arc, + target: &ResolvedDataset, max_slice_size: u64, max_slice_records: u64, keep_metadata_only: bool, @@ -137,10 +130,10 @@ impl CompactionServiceImpl { //////////////////////////////////////////////////////////////////////////////// - let chain = dataset.as_metadata_chain(); + let chain = target.as_metadata_chain(); let head = chain.resolve_ref(&BlockRef::Head).await?; let mut block_stream = chain.iter_blocks_interval(&head, None, false); - let object_data_repo = dataset.as_data_repo(); + let object_data_repo = target.as_data_repo(); while let Some((block_hash, block)) = block_stream.try_next().await? 
{ old_num_blocks += 1; @@ -315,10 +308,10 @@ impl CompactionServiceImpl { async fn commit_new_blocks( &self, - dataset: Arc, + target: &ResolvedDataset, chain_files_info: &ChainFilesInfo, ) -> Result<(Vec, Multihash, usize), CompactionError> { - let chain = dataset.as_metadata_chain(); + let chain = target.as_metadata_chain(); let mut current_head = chain_files_info.old_head.clone(); let mut old_data_slices: Vec = vec![]; // set it to 1 to include seed block @@ -329,7 +322,7 @@ impl CompactionServiceImpl { DataSliceBatch::SingleBlock(block_hash) => { let block = chain.get_block(block_hash).await.int_err()?; - let commit_result = dataset + let commit_result = target .commit_event( block.event, CommitOpts { @@ -366,7 +359,7 @@ impl CompactionServiceImpl { .clone() .map(|r| CheckpointRef::Existed(r.physical_hash)); - let commit_result = dataset + let commit_result = target .commit_add_data( add_data_params, Some(OwnedFile::new( @@ -394,10 +387,10 @@ impl CompactionServiceImpl { Ok((old_data_slices, current_head, new_num_blocks)) } - #[tracing::instrument(level = "info", skip_all)] + #[tracing::instrument(level = "debug", skip_all)] async fn compact_dataset_impl( &self, - dataset: Arc, + target: ResolvedDataset, max_slice_size: u64, max_slice_records: u64, keep_metadata_only: bool, @@ -408,7 +401,7 @@ impl CompactionServiceImpl { listener.begin_phase(CompactionPhase::GatherChainInfo); let mut chain_files_info = self .gather_chain_info( - dataset.clone(), + &target, max_slice_size, max_slice_records, keep_metadata_only, @@ -430,11 +423,10 @@ impl CompactionServiceImpl { .await?; listener.begin_phase(CompactionPhase::CommitNewBlocks); - let (_old_data_slices, new_head, new_num_blocks) = self - .commit_new_blocks(dataset.clone(), &chain_files_info) - .await?; + let (_old_data_slices, new_head, new_num_blocks) = + self.commit_new_blocks(&target, &chain_files_info).await?; - dataset + target .as_metadata_chain() .set_ref( &BlockRef::Head, @@ -459,22 +451,18 @@ impl CompactionServiceImpl { } } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[async_trait::async_trait] impl CompactionService for CompactionServiceImpl { - #[tracing::instrument(level = "info", skip_all)] + #[tracing::instrument(level = "info", skip_all, fields(target=?target.get_handle(), ?options))] async fn compact_dataset( &self, - dataset_handle: &DatasetHandle, + target: ResolvedDataset, options: CompactionOptions, maybe_listener: Option>, ) -> Result { - self.dataset_authorizer - .check_action_allowed(dataset_handle, domain::auth::DatasetAction::Write) - .await?; - - let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); - - let dataset_kind = dataset + let dataset_kind = target .get_summary(GetSummaryOpts::default()) .await .int_err()? 
@@ -483,7 +471,7 @@ impl CompactionService for CompactionServiceImpl { if !options.keep_metadata_only && dataset_kind != DatasetKind::Root { return Err(CompactionError::InvalidDatasetKind( InvalidDatasetKindError { - dataset_name: dataset_handle.alias.dataset_name.clone(), + dataset_alias: target.get_alias().clone(), }, )); } @@ -497,7 +485,7 @@ impl CompactionService for CompactionServiceImpl { match self .compact_dataset_impl( - dataset, + target, max_slice_size, max_slice_records, options.keep_metadata_only, @@ -515,44 +503,6 @@ impl CompactionService for CompactionServiceImpl { } } } - - async fn compact_multi( - &self, - dataset_refs: Vec, - options: CompactionOptions, - multi_listener: Option>, - ) -> Vec { - let filtered_dataset_results = filter_datasets_by_local_pattern( - self.dataset_repo.as_ref(), - dataset_refs - .into_iter() - .map(DatasetRefPattern::Ref) - .collect(), - ) - .try_collect() - .await; - - let dataset_handles: Vec<_> = if let Ok(matched_datasets) = filtered_dataset_results { - matched_datasets - } else { - return vec![]; - }; - - let listener = multi_listener.unwrap_or(Arc::new(NullCompactionMultiListener {})); - - let mut result = vec![]; - for dataset_handle in &dataset_handles { - result.push(CompactionResponse { - dataset_ref: dataset_handle.as_local_ref(), - result: self - .compact_dataset( - dataset_handle, - options.clone(), - listener.begin_compact(dataset_handle), - ) - .await, - }); - } - result - } } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/dataset_changes_service_impl.rs b/src/infra/core/src/dataset_changes_service_impl.rs index aba602796c..972972fbdb 100644 --- a/src/infra/core/src/dataset_changes_service_impl.rs +++ b/src/infra/core/src/dataset_changes_service_impl.rs @@ -14,14 +14,15 @@ use futures::TryStreamExt; use internal_error::{InternalError, ResultIntoInternal}; use kamu_core::{ BlockRef, - Dataset, DatasetChangesService, DatasetIntervalIncrement, - DatasetRepository, + DatasetRegistry, + DatasetRegistryExt, GetDatasetError, GetIncrementError, GetRefError, MetadataChainExt, + ResolvedDataset, SearchSingleDataBlockVisitor, }; use opendatafabric::{DataSlice, DatasetID, MetadataEvent, Multihash}; @@ -29,7 +30,7 @@ use opendatafabric::{DataSlice, DatasetID, MetadataEvent, Multihash}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct DatasetChangesServiceImpl { - dataset_repo: Arc, + dataset_registry: Arc, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -37,16 +38,16 @@ pub struct DatasetChangesServiceImpl { #[component(pub)] #[interface(dyn DatasetChangesService)] impl DatasetChangesServiceImpl { - pub fn new(dataset_repo: Arc) -> Self { - Self { dataset_repo } + pub fn new(dataset_registry: Arc) -> Self { + Self { dataset_registry } } async fn resolve_dataset_by_id( &self, dataset_id: &DatasetID, - ) -> Result, GetIncrementError> { - self.dataset_repo - .find_dataset_by_ref(&dataset_id.as_local_ref()) + ) -> Result { + self.dataset_registry + .get_dataset_by_ref(&dataset_id.as_local_ref()) .await .map_err(|e| match e { GetDatasetError::NotFound(e) => GetIncrementError::DatasetNotFound(e), @@ -56,9 +57,9 @@ impl DatasetChangesServiceImpl { async fn resolve_dataset_head( &self, - dataset: &dyn Dataset, + resolved_dataset: &ResolvedDataset, ) -> Result { - dataset + 
resolved_dataset .as_metadata_chain() .as_reference_repo() .get(&BlockRef::Head) @@ -73,7 +74,7 @@ impl DatasetChangesServiceImpl { // TODO: PERF: Avoid multiple passes over metadata chain async fn make_increment_from_interval( &self, - dataset: Arc, + resolved_dataset: &ResolvedDataset, old_head: Option<&Multihash>, new_head: &Multihash, ) -> Result { @@ -86,7 +87,7 @@ impl DatasetChangesServiceImpl { let mut latest_watermark = None; // Scan blocks (from new head to old head) - let mut block_stream = dataset + let mut block_stream = resolved_dataset .as_metadata_chain() .iter_blocks_interval(new_head, old_head, false); @@ -147,7 +148,7 @@ impl DatasetChangesServiceImpl { // Did we have any head before? if let Some(old_head) = &old_head { // Yes, so try locating the previous watermark containing node - let previous_nearest_watermark = dataset + let previous_nearest_watermark = resolved_dataset .as_metadata_chain() .accept_one_by_hash(old_head, SearchSingleDataBlockVisitor::next()) .await @@ -196,10 +197,10 @@ impl DatasetChangesService for DatasetChangesServiceImpl { old_head: Option<&'a Multihash>, new_head: &'a Multihash, ) -> Result { - let dataset = self.resolve_dataset_by_id(dataset_id).await?; + let resolved_dataset = self.resolve_dataset_by_id(dataset_id).await?; let increment = self - .make_increment_from_interval(dataset, old_head, new_head) + .make_increment_from_interval(&resolved_dataset, old_head, new_head) .await .map_err(GetIncrementError::Internal)?; @@ -211,11 +212,11 @@ impl DatasetChangesService for DatasetChangesServiceImpl { dataset_id: &'a DatasetID, old_head: Option<&'a Multihash>, ) -> Result { - let dataset = self.resolve_dataset_by_id(dataset_id).await?; - let current_head = self.resolve_dataset_head(dataset.as_ref()).await?; + let resolved_dataset = self.resolve_dataset_by_id(dataset_id).await?; + let current_head = self.resolve_dataset_head(&resolved_dataset).await?; let increment = self - .make_increment_from_interval(dataset, old_head, &current_head) + .make_increment_from_interval(&resolved_dataset, old_head, &current_head) .await .map_err(GetIncrementError::Internal)?; diff --git a/src/infra/core/src/dataset_ownership_service_inmem.rs b/src/infra/core/src/dataset_ownership_service_inmem.rs index 8d8de06f0c..be6a805c6e 100644 --- a/src/infra/core/src/dataset_ownership_service_inmem.rs +++ b/src/infra/core/src/dataset_ownership_service_inmem.rs @@ -19,7 +19,7 @@ use messaging_outbox::{ MessageConsumer, MessageConsumerMeta, MessageConsumerT, - MessageConsumptionDurability, + MessageDeliveryMechanism, }; use opendatafabric::{AccountID, AccountName, DatasetID}; @@ -45,7 +45,7 @@ struct State { #[meta(MessageConsumerMeta { consumer_name: MESSAGE_CONSUMER_KAMU_CORE_DATASET_OWNERSHIP_SERVICE, feeding_producers: &[MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE], - durability: MessageConsumptionDurability::BestEffort, + delivery: MessageDeliveryMechanism::Immediate, })] #[scope(Singleton)] impl DatasetOwnershipServiceInMemory { @@ -203,7 +203,7 @@ impl MessageConsumerT for DatasetOwnershipServiceInMemo pub struct DatasetOwnershipServiceInMemoryStateInitializer { current_account_subject: Arc, - dataset_repo: Arc, + dataset_registry: Arc, authentication_service: Arc, dataset_ownership_service: Arc, } @@ -218,13 +218,13 @@ pub struct DatasetOwnershipServiceInMemoryStateInitializer { impl DatasetOwnershipServiceInMemoryStateInitializer { pub fn new( current_account_subject: Arc, - dataset_repo: Arc, + dataset_registry: Arc, authentication_service: Arc, dataset_ownership_service: Arc, )
-> Self { Self { current_account_subject, - dataset_repo, + dataset_registry, authentication_service, dataset_ownership_service, } @@ -254,7 +254,7 @@ impl InitOnStartup for DatasetOwnershipServiceInMemoryStateInitializer { let mut account_ids_by_name: HashMap = HashMap::new(); - let mut datasets_stream = self.dataset_repo.get_all_datasets(); + let mut datasets_stream = self.dataset_registry.all_dataset_handles(); while let Some(Ok(dataset_handle)) = datasets_stream.next().await { let account_name = match dataset_handle.alias.account_name { Some(account_name) => account_name, diff --git a/src/infra/core/src/dataset_registry_repo_bridge.rs b/src/infra/core/src/dataset_registry_repo_bridge.rs new file mode 100644 index 0000000000..c0527041b2 --- /dev/null +++ b/src/infra/core/src/dataset_registry_repo_bridge.rs @@ -0,0 +1,81 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use dill::*; +use kamu_core::{ + DatasetHandleStream, + DatasetHandlesResolution, + DatasetRegistry, + DatasetRepository, + GetDatasetError, + GetMultipleDatasetsError, + ResolvedDataset, +}; +use opendatafabric::{AccountName, DatasetHandle, DatasetID, DatasetRef}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct DatasetRegistryRepoBridge { + dataset_repo: Arc, +} + +#[component(pub)] +#[interface(dyn DatasetRegistry)] +impl DatasetRegistryRepoBridge { + pub fn new(dataset_repo: Arc) -> Self { + Self { dataset_repo } + } +} + +#[async_trait::async_trait] +impl DatasetRegistry for DatasetRegistryRepoBridge { + fn all_dataset_handles(&self) -> DatasetHandleStream<'_> { + self.dataset_repo.all_dataset_handles() + } + + fn all_dataset_handles_by_owner(&self, owner_name: &AccountName) -> DatasetHandleStream<'_> { + self.dataset_repo.all_dataset_handles_by_owner(owner_name) + } + + async fn resolve_dataset_handle_by_ref( + &self, + dataset_ref: &DatasetRef, + ) -> Result { + self.dataset_repo + .resolve_dataset_handle_by_ref(dataset_ref) + .await + } + + async fn resolve_multiple_dataset_handles_by_ids( + &self, + dataset_ids: Vec, + ) -> Result { + let mut res: DatasetHandlesResolution = Default::default(); + + for dataset_id in dataset_ids { + let dataset_ref = dataset_id.as_local_ref(); + let resolve_res = self.resolve_dataset_handle_by_ref(&dataset_ref).await; + match resolve_res { + Ok(hdl) => res.resolved_handles.push(hdl), + Err(e) => res.unresolved_datasets.push((dataset_id, e)), + } + } + + Ok(res) + } + + fn get_dataset_by_handle(&self, dataset_handle: &DatasetHandle) -> ResolvedDataset { + let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); + ResolvedDataset::new(dataset, dataset_handle.clone()) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/dependency_graph_repository_inmem.rs b/src/infra/core/src/dependency_graph_repository_inmem.rs index bdf410093e..d0011968e0 100644 --- a/src/infra/core/src/dependency_graph_repository_inmem.rs +++ b/src/infra/core/src/dependency_graph_repository_inmem.rs @@ -33,7 +33,7 @@ impl DependencyGraphRepository for 
DependencyGraphRepositoryInMemory { use tokio_stream::StreamExt; Box::pin(async_stream::try_stream! { - let mut datasets_stream = self.dataset_repo.get_all_datasets(); + let mut datasets_stream = self.dataset_repo.all_dataset_handles(); while let Some(Ok(dataset_handle)) = datasets_stream.next().await { let span = tracing::debug_span!("Scanning dataset dependencies", dataset = %dataset_handle); diff --git a/src/infra/core/src/dependency_graph_service_inmem.rs b/src/infra/core/src/dependency_graph_service_inmem.rs index 89fb439758..6651a671e7 100644 --- a/src/infra/core/src/dependency_graph_service_inmem.rs +++ b/src/infra/core/src/dependency_graph_service_inmem.rs @@ -17,7 +17,7 @@ use messaging_outbox::{ MessageConsumer, MessageConsumerMeta, MessageConsumerT, - MessageConsumptionDurability, + MessageDeliveryMechanism, }; use opendatafabric::DatasetID; use petgraph::stable_graph::{NodeIndex, StableDiGraph}; @@ -75,7 +75,7 @@ impl State { #[meta(MessageConsumerMeta { consumer_name: MESSAGE_CONSUMER_KAMU_CORE_DEPENDENCY_GRAPH_SERVICE, feeding_producers: &[MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE], - durability: MessageConsumptionDurability::BestEffort, + delivery: MessageDeliveryMechanism::Immediate, })] #[scope(Singleton)] impl DependencyGraphServiceInMemory { diff --git a/src/infra/core/src/engine/engine_datafusion_inproc.rs b/src/infra/core/src/engine/engine_datafusion_inproc.rs index bd3968d0f3..90d57ca16f 100644 --- a/src/infra/core/src/engine/engine_datafusion_inproc.rs +++ b/src/infra/core/src/engine/engine_datafusion_inproc.rs @@ -12,6 +12,7 @@ use std::sync::Arc; use datafusion::prelude::*; use internal_error::*; use kamu_core::engine::*; +use kamu_core::ResolvedDatasetsMap; use opendatafabric::*; /// An in-process engine using Apache Arrow Datafusion framework. @@ -114,6 +115,7 @@ impl Engine for EngineDatafusionInproc { async fn execute_transform( &self, _request: TransformRequestExt, + _datasets_map: &ResolvedDatasetsMap, ) -> Result { unimplemented!( "Derivative transformations must be executed by a versioned out-of-process engine" diff --git a/src/infra/core/src/engine/engine_io_strategy.rs b/src/infra/core/src/engine/engine_io_strategy.rs index 840795b723..6d0ed67029 100644 --- a/src/infra/core/src/engine/engine_io_strategy.rs +++ b/src/infra/core/src/engine/engine_io_strategy.rs @@ -8,7 +8,6 @@ // by the Apache License, Version 2.0. use std::path::{Path, PathBuf}; -use std::sync::Arc; use container_runtime::*; use datafusion::arrow::datatypes::SchemaRef; @@ -25,8 +24,8 @@ use crate::ObjectRepositoryLocalFSSha3; pub trait EngineIoStrategy: Send + Sync { async fn materialize_request( &self, - dataset: &dyn Dataset, request: TransformRequestExt, + datasets_map: &ResolvedDatasetsMap, operation_dir: &Path, ) -> Result; } @@ -46,15 +45,9 @@ pub struct MaterializedEngineRequest { /// This IO strategy materializes all inputs as local file system files and pass /// them to the engines via mounted volumes. 
-pub struct EngineIoStrategyLocalVolume { - dataset_repo: Arc, -} +pub struct EngineIoStrategyLocalVolume {} impl EngineIoStrategyLocalVolume { - pub fn new(dataset_repo: Arc) -> Self { - Self { dataset_repo } - } - async fn materialize_object( &self, repo: &dyn ObjectRepository, @@ -92,8 +85,8 @@ impl EngineIoStrategy for EngineIoStrategyLocalVolume { #[tracing::instrument(skip_all)] async fn materialize_request( &self, - dataset: &dyn Dataset, request: TransformRequestExt, + datasets_map: &ResolvedDatasetsMap, operation_dir: &Path, ) -> Result { let host_in_dir = operation_dir.join("in"); @@ -110,9 +103,11 @@ impl EngineIoStrategy for EngineIoStrategyLocalVolume { let mut volumes = vec![(host_out_dir, container_out_dir, VolumeAccess::ReadWrite).into()]; + let target = datasets_map.get_by_handle(&request.dataset_handle); + let prev_checkpoint_path = self .maybe_materialize_object( - dataset.as_checkpoint_repo(), + target.as_checkpoint_repo(), request.prev_checkpoint.as_ref(), &container_in_dir, &mut volumes, @@ -121,15 +116,13 @@ impl EngineIoStrategy for EngineIoStrategyLocalVolume { let mut query_inputs = Vec::new(); for input in request.inputs { - let input_dataset = self - .dataset_repo - .get_dataset_by_handle(&input.dataset_handle); + let input_resolved = datasets_map.get_by_handle(&input.dataset_handle); let mut data_paths = Vec::new(); for hash in input.data_slices { let container_path = self .materialize_object( - input_dataset.as_data_repo(), + input_resolved.as_data_repo(), &hash, &container_in_dir, &mut volumes, @@ -199,15 +192,9 @@ impl EngineIoStrategy for EngineIoStrategyLocalVolume { /// This IO strategy is used for engines that cannot work directly with remote /// storage. It will download the input data and checkpoint locally and mount it /// as files. 
-pub struct EngineIoStrategyRemoteProxy { - dataset_repo: Arc, -} +pub struct EngineIoStrategyRemoteProxy {} impl EngineIoStrategyRemoteProxy { - pub fn new(dataset_repo: Arc) -> Self { - Self { dataset_repo } - } - async fn materialize_object( &self, repo: &dyn ObjectRepository, @@ -262,8 +249,8 @@ impl EngineIoStrategy for EngineIoStrategyRemoteProxy { #[tracing::instrument(skip_all)] async fn materialize_request( &self, - dataset: &dyn Dataset, request: TransformRequestExt, + datasets_map: &ResolvedDatasetsMap, operation_dir: &Path, ) -> Result { // TODO: PERF: Parallel data transfer @@ -281,9 +268,11 @@ impl EngineIoStrategy for EngineIoStrategyRemoteProxy { let mut volumes = vec![(host_out_dir, container_out_dir, VolumeAccess::ReadWrite).into()]; + let target = datasets_map.get_by_handle(&request.dataset_handle); + let prev_checkpoint_path = self .maybe_materialize_object( - dataset.as_checkpoint_repo(), + target.as_checkpoint_repo(), request.prev_checkpoint.as_ref(), &host_in_dir, &container_in_dir, @@ -293,15 +282,13 @@ impl EngineIoStrategy for EngineIoStrategyRemoteProxy { let mut query_inputs = Vec::new(); for input in request.inputs { - let input_dataset = self - .dataset_repo - .get_dataset_by_handle(&input.dataset_handle); + let input_resolved = datasets_map.get_by_handle(&input.dataset_handle); let mut data_paths = Vec::new(); for hash in input.data_slices { let container_path = self .materialize_object( - input_dataset.as_data_repo(), + input_resolved.as_data_repo(), &hash, &host_in_dir, &container_in_dir, diff --git a/src/infra/core/src/engine/engine_odf.rs b/src/infra/core/src/engine/engine_odf.rs index f311ebb917..1adda51539 100644 --- a/src/infra/core/src/engine/engine_odf.rs +++ b/src/infra/core/src/engine/engine_odf.rs @@ -19,7 +19,7 @@ use kamu_core::engine::*; use kamu_core::*; use odf::engine::{EngineGrpcClient, ExecuteRawQueryError, ExecuteTransformError}; use odf::TransformResponseSuccess; -use opendatafabric as odf; +use opendatafabric::{self as odf}; use super::engine_container::{EngineContainer, LogsConfig}; use super::engine_io_strategy::*; @@ -30,7 +30,6 @@ pub struct ODFEngine { engine_config: ODFEngineConfig, image: String, run_info_dir: Arc, - dataset_repo: Arc, } impl ODFEngine { @@ -39,34 +38,24 @@ impl ODFEngine { engine_config: ODFEngineConfig, image: &str, run_info_dir: Arc, - dataset_repo: Arc, ) -> Self { Self { container_runtime, engine_config, image: image.to_owned(), run_info_dir, - dataset_repo, } } // TODO: Currently we are always proxying remote inputs, but in future we should // have a capabilities mechanism for engines to declare that they can work // with some remote storages directly without us needing to proxy data. - fn get_io_strategy(&self, request: &TransformRequestExt) -> Arc { - let dataset = self - .dataset_repo - .get_dataset_by_handle(&request.dataset_handle); - - match dataset.as_data_repo().protocol() { - ObjectRepositoryProtocol::LocalFs { .. } => { - Arc::new(EngineIoStrategyLocalVolume::new(self.dataset_repo.clone())) - } + fn get_io_strategy(&self, target_dataset: &dyn Dataset) -> Arc { + match target_dataset.as_data_repo().protocol() { + ObjectRepositoryProtocol::LocalFs { .. 
} => Arc::new(EngineIoStrategyLocalVolume {}), ObjectRepositoryProtocol::Memory | ObjectRepositoryProtocol::Http - | ObjectRepositoryProtocol::S3 => { - Arc::new(EngineIoStrategyRemoteProxy::new(self.dataset_repo.clone())) - } + | ObjectRepositoryProtocol::S3 => Arc::new(EngineIoStrategyRemoteProxy {}), } } @@ -355,11 +344,8 @@ impl Engine for ODFEngine { async fn execute_transform( &self, request: TransformRequestExt, + datasets_map: &ResolvedDatasetsMap, ) -> Result { - let dataset = self - .dataset_repo - .get_dataset_by_handle(&request.dataset_handle); - let operation_id = request.operation_id.clone(); let operation_dir = self .run_info_dir @@ -368,10 +354,11 @@ impl Engine for ODFEngine { std::fs::create_dir(&operation_dir).int_err()?; std::fs::create_dir(&logs_dir).int_err()?; - let io_strategy = self.get_io_strategy(&request); + let target = datasets_map.get_by_handle(&request.dataset_handle); + let io_strategy = self.get_io_strategy(target.as_ref()); let materialized_request = io_strategy - .materialize_request(dataset.as_ref(), request, &operation_dir) + .materialize_request(request, datasets_map, &operation_dir) .await .int_err()?; diff --git a/src/infra/core/src/engine/engine_provisioner_local.rs b/src/infra/core/src/engine/engine_provisioner_local.rs index 1e8aaae3b3..079aa1f64b 100644 --- a/src/infra/core/src/engine/engine_provisioner_local.rs +++ b/src/infra/core/src/engine/engine_provisioner_local.rs @@ -49,7 +49,6 @@ impl EngineProvisionerLocal { pub fn new( config: EngineProvisionerLocalConfig, container_runtime: Arc, - dataset_repo: Arc, run_info_dir: Arc, ) -> Self { let engine_config = ODFEngineConfig { @@ -63,28 +62,24 @@ impl EngineProvisionerLocal { engine_config.clone(), &config.spark_image, run_info_dir.clone(), - dataset_repo.clone(), )), flink_engine: Arc::new(ODFEngine::new( container_runtime.clone(), engine_config.clone(), &config.flink_image, run_info_dir.clone(), - dataset_repo.clone(), )), datafusion_engine: Arc::new(ODFEngine::new( container_runtime.clone(), engine_config.clone(), &config.datafusion_image, run_info_dir.clone(), - dataset_repo.clone(), )), risingwave_engine: Arc::new(ODFEngine::new( container_runtime.clone(), engine_config.clone(), &config.risingwave_image, run_info_dir.clone(), - dataset_repo.clone(), )), container_runtime, inner: Arc::new(Inner { @@ -306,8 +301,9 @@ impl Engine for EngineHandle { async fn execute_transform( &self, request: TransformRequestExt, + datasets_map: &ResolvedDatasetsMap, ) -> Result { - self.engine.execute_transform(request).await + self.engine.execute_transform(request, datasets_map).await } } diff --git a/src/infra/core/src/ingest/polling_ingest_service_impl.rs b/src/infra/core/src/ingest/polling_ingest_service_impl.rs index 795dff8656..c8db4de931 100644 --- a/src/infra/core/src/ingest/polling_ingest_service_impl.rs +++ b/src/infra/core/src/ingest/polling_ingest_service_impl.rs @@ -27,8 +27,6 @@ use super::*; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct PollingIngestServiceImpl { - dataset_repo: Arc, - dataset_action_authorizer: Arc, fetch_service: Arc, engine_provisioner: Arc, object_store_registry: Arc, @@ -44,8 +42,6 @@ pub struct PollingIngestServiceImpl { #[dill::interface(dyn PollingIngestService)] impl PollingIngestServiceImpl { pub fn new( - dataset_repo: Arc, - dataset_action_authorizer: Arc, fetch_service: Arc, engine_provisioner: Arc, object_store_registry: Arc, @@ -55,8 +51,6 @@ impl PollingIngestServiceImpl { 
time_source: Arc, ) -> Self { Self { - dataset_repo, - dataset_action_authorizer, fetch_service, engine_provisioner, object_store_registry, @@ -69,24 +63,15 @@ impl PollingIngestServiceImpl { async fn do_ingest( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, options: PollingIngestOptions, get_listener: impl FnOnce(&DatasetHandle) -> Option>, ) -> Result { - let dataset_handle = self.dataset_repo.resolve_dataset_ref(dataset_ref).await?; - - self.dataset_action_authorizer - .check_action_allowed(&dataset_handle, auth::DatasetAction::Write) - .await?; - - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); - - let listener = - get_listener(&dataset_handle).unwrap_or_else(|| Arc::new(NullPollingIngestListener)); + let listener = get_listener(target.get_handle()) + .unwrap_or_else(|| Arc::new(NullPollingIngestListener)); self.ingest_loop(IngestLoopArgs { - dataset_handle, - dataset, + target, options, listener, }) @@ -97,7 +82,7 @@ impl PollingIngestServiceImpl { level = "info", skip_all, fields( - dataset_handle = %args.dataset_handle, + dataset_handle = %args.target.get_handle(), ) )] async fn ingest_loop( @@ -105,7 +90,7 @@ impl PollingIngestServiceImpl { args: IngestLoopArgs, ) -> Result { let ctx = ingest_common::new_session_context(self.object_store_registry.clone()); - let mut data_writer = DataWriterDataFusion::builder(args.dataset.clone(), ctx.clone()) + let mut data_writer = DataWriterDataFusion::builder((*args.target).clone(), ctx.clone()) .with_metadata_state_scanned(None) .await .int_err()? @@ -140,7 +125,7 @@ impl PollingIngestServiceImpl { // TODO: Avoid excessive cloning let iteration_args = IngestIterationArgs { - dataset_handle: args.dataset_handle.clone(), + dataset_handle: args.target.get_handle().clone(), iteration, operation_id, operation_dir, @@ -609,15 +594,13 @@ impl PollingIngestServiceImpl { #[async_trait::async_trait] impl PollingIngestService for PollingIngestServiceImpl { - #[tracing::instrument(level = "info", skip_all, fields(%dataset_ref))] + #[tracing::instrument(level = "info", skip_all, fields(target=%target.get_handle()))] async fn get_active_polling_source( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, ) -> Result)>, GetDatasetError> { - let dataset = self.dataset_repo.find_dataset_by_ref(dataset_ref).await?; - // TODO: Support source evolution - Ok(dataset + Ok(target .as_metadata_chain() .accept_one(SearchSetPollingSourceVisitor::new()) .await @@ -625,45 +608,14 @@ impl PollingIngestService for PollingIngestServiceImpl { .into_hashed_block()) } - #[tracing::instrument(level = "info", skip_all, fields(%dataset_ref))] + #[tracing::instrument(level = "info", skip_all, fields(target=%target.get_handle()))] async fn ingest( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, options: PollingIngestOptions, maybe_listener: Option>, ) -> Result { - self.do_ingest(dataset_ref, options, |_| maybe_listener) - .await - } - - #[tracing::instrument(level = "info", skip_all)] - async fn ingest_multi( - &self, - dataset_refs: Vec, - options: PollingIngestOptions, - maybe_multi_listener: Option>, - ) -> Vec { - let multi_listener = - maybe_multi_listener.unwrap_or_else(|| Arc::new(NullPollingIngestMultiListener)); - - let futures: Vec<_> = dataset_refs - .iter() - .map(|dataset_ref| { - self.do_ingest(dataset_ref, options.clone(), |hdl| { - multi_listener.begin_ingest(hdl) - }) - }) - .collect(); - - let results = futures::future::join_all(futures).await; - dataset_refs - .into_iter() - .zip(results) - 
.map(|(dataset_ref, result)| PollingIngestResponse { - dataset_ref, - result, - }) - .collect() + self.do_ingest(target, options, |_| maybe_listener).await } } @@ -681,8 +633,7 @@ pub(crate) struct PrepStepResult { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct IngestLoopArgs { - dataset_handle: DatasetHandle, - dataset: Arc, + target: ResolvedDataset, options: PollingIngestOptions, listener: Arc, } diff --git a/src/infra/core/src/ingest/push_ingest_service_impl.rs b/src/infra/core/src/ingest/push_ingest_service_impl.rs index 93d22a590f..e8bfd99823 100644 --- a/src/infra/core/src/ingest/push_ingest_service_impl.rs +++ b/src/infra/core/src/ingest/push_ingest_service_impl.rs @@ -27,8 +27,6 @@ use super::ingest_common; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct PushIngestServiceImpl { - dataset_repo: Arc, - dataset_action_authorizer: Arc, object_store_registry: Arc, data_format_registry: Arc, time_source: Arc, @@ -42,8 +40,6 @@ pub struct PushIngestServiceImpl { #[dill::interface(dyn PushIngestService)] impl PushIngestServiceImpl { pub fn new( - dataset_repo: Arc, - dataset_action_authorizer: Arc, object_store_registry: Arc, data_format_registry: Arc, time_source: Arc, @@ -51,8 +47,6 @@ impl PushIngestServiceImpl { run_info_dir: Arc, ) -> Self { Self { - dataset_repo, - dataset_action_authorizer, object_store_registry, data_format_registry, time_source, @@ -63,20 +57,12 @@ impl PushIngestServiceImpl { async fn do_ingest( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, source_name: Option<&str>, source: DataSource, opts: PushIngestOpts, listener: Arc, ) -> Result { - let dataset_handle = self.dataset_repo.resolve_dataset_ref(dataset_ref).await?; - - self.dataset_action_authorizer - .check_action_allowed(&dataset_handle, auth::DatasetAction::Write) - .await?; - - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); - let operation_id = get_random_name(None, 10); let operation_dir = self.run_info_dir.join(format!("ingest-{operation_id}")); std::fs::create_dir_all(&operation_dir).int_err()?; @@ -85,19 +71,19 @@ impl PushIngestServiceImpl { ingest_common::new_session_context(self.object_store_registry.clone()); let mut data_writer = self - .make_data_writer(dataset.clone(), source_name, ctx.clone()) + .make_data_writer((*target).clone(), source_name, ctx.clone()) .await?; let push_source = match (data_writer.source_event(), opts.auto_create_push_source) { // No push source, and it's allowed to create (None, true) => { let add_push_source_event = self - .auto_create_push_source(dataset.clone(), "auto", &opts) + .auto_create_push_source((*target).clone(), "auto", &opts) .await?; // Update data writer, as we've modified the dataset data_writer = self - .make_data_writer(dataset.clone(), source_name, ctx.clone()) + .make_data_writer((*target).clone(), source_name, ctx.clone()) .await?; Ok(add_push_source_event) } @@ -432,16 +418,15 @@ impl PushIngestServiceImpl { #[async_trait::async_trait] impl PushIngestService for PushIngestServiceImpl { - #[tracing::instrument(level = "info", skip_all, fields(%dataset_ref))] + #[tracing::instrument(level = "info", skip_all, fields(target=%target.get_handle()))] async fn get_active_push_sources( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, ) -> Result)>, GetDatasetError> { use futures::TryStreamExt; // TODO: Support source disabling and evolution - 
let dataset = self.dataset_repo.find_dataset_by_ref(dataset_ref).await?; - let stream = dataset + let stream = target .as_metadata_chain() .iter_blocks() .filter_map_ok(|(h, b)| b.into_typed().map(|b| (h, b))); @@ -449,10 +434,10 @@ impl PushIngestService for PushIngestServiceImpl { Ok(stream.try_collect().await.int_err()?) } - #[tracing::instrument(level = "info", skip_all, fields(%dataset_ref))] + #[tracing::instrument(level = "info", skip_all, fields(target=%target.get_handle()))] async fn ingest_from_url( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, source_name: Option<&str>, url: url::Url, opts: PushIngestOpts, @@ -462,20 +447,14 @@ impl PushIngestService for PushIngestServiceImpl { tracing::info!(%url, ?opts, "Ingesting from url"); - self.do_ingest( - dataset_ref, - source_name, - DataSource::Url(url), - opts, - listener, - ) - .await + self.do_ingest(target, source_name, DataSource::Url(url), opts, listener) + .await } - #[tracing::instrument(level = "info", skip_all, fields(%dataset_ref))] + #[tracing::instrument(level = "info", skip_all, fields(target=%target.get_handle()))] async fn ingest_from_file_stream( &self, - dataset_ref: &DatasetRef, + target: ResolvedDataset, source_name: Option<&str>, data: Box, opts: PushIngestOpts, @@ -486,7 +465,7 @@ impl PushIngestService for PushIngestServiceImpl { tracing::info!(?opts, "Ingesting from file stream"); self.do_ingest( - dataset_ref, + target, source_name, DataSource::Stream(data), opts, diff --git a/src/infra/core/src/lib.rs b/src/infra/core/src/lib.rs index afe7dfa8d2..2358a937e8 100644 --- a/src/infra/core/src/lib.rs +++ b/src/infra/core/src/lib.rs @@ -14,6 +14,7 @@ #![feature(error_generic_member_access)] #![feature(trait_upcasting)] #![feature(let_chains)] +#![feature(iter_collect_into)] // Re-exports pub use kamu_core as domain; @@ -24,6 +25,7 @@ mod query; mod repos; #[cfg(any(feature = "testing", test))] pub mod testing; +mod transform; mod use_cases; pub mod utils; @@ -32,11 +34,12 @@ mod dataset_changes_service_impl; mod dataset_config; mod dataset_layout; mod dataset_ownership_service_inmem; +mod dataset_registry_repo_bridge; mod dependency_graph_repository_inmem; mod dependency_graph_service_inmem; mod provenance_service_impl; -mod pull_service_impl; -mod push_service_impl; +mod pull_request_planner_impl; +mod push_request_planner_impl; mod query_service_impl; mod remote_alias_resolver_impl; mod remote_aliases_registry_impl; @@ -44,22 +47,24 @@ mod remote_repository_registry_impl; mod reset_service_impl; mod resource_loader_impl; mod search_service_impl; +mod sync_request_builder; mod sync_service_impl; -mod transform_service_impl; mod verification_service_impl; +mod watermark_service_impl; pub use compaction_service_impl::*; pub use dataset_changes_service_impl::*; pub use dataset_config::*; pub use dataset_layout::*; pub use dataset_ownership_service_inmem::*; +pub use dataset_registry_repo_bridge::*; pub use dependency_graph_repository_inmem::*; pub use dependency_graph_service_inmem::*; pub use engine::*; pub use ingest::*; pub use provenance_service_impl::*; -pub use pull_service_impl::*; -pub use push_service_impl::*; +pub use pull_request_planner_impl::*; +pub use push_request_planner_impl::*; pub use query_service_impl::*; pub use remote_alias_resolver_impl::*; pub use remote_aliases_registry_impl::*; @@ -68,7 +73,9 @@ pub use repos::*; pub use reset_service_impl::*; pub use resource_loader_impl::*; pub use search_service_impl::*; +pub use sync_request_builder::*; pub use sync_service_impl::*; -pub 
use transform_service_impl::*; +pub use transform::*; pub use use_cases::*; pub use verification_service_impl::*; +pub use watermark_service_impl::*; diff --git a/src/infra/core/src/provenance_service_impl.rs b/src/infra/core/src/provenance_service_impl.rs index e4f3417fdd..e1963dd0d6 100644 --- a/src/infra/core/src/provenance_service_impl.rs +++ b/src/infra/core/src/provenance_service_impl.rs @@ -20,7 +20,7 @@ use opendatafabric::*; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct ProvenanceServiceImpl { - dataset_repo: Arc, + dataset_registry: Arc, dataset_action_authorizer: Arc, } @@ -28,11 +28,11 @@ pub struct ProvenanceServiceImpl { #[interface(dyn ProvenanceService)] impl ProvenanceServiceImpl { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, dataset_action_authorizer: Arc, ) -> Self { Self { - dataset_repo, + dataset_registry, dataset_action_authorizer, } } @@ -47,53 +47,41 @@ impl ProvenanceServiceImpl { .check_action_allowed(dataset_handle, auth::DatasetAction::Read) .await?; - if let Some(dataset) = self - .dataset_repo - .try_get_dataset(&dataset_handle.as_local_ref()) - .await? - { - let summary = dataset - .get_summary(GetSummaryOpts::default()) - .await - .int_err()?; - - let mut resolved_inputs = Vec::new(); - for input_id in &summary.dependencies { - let handle = self - .dataset_repo - .resolve_dataset_ref(&input_id.as_local_ref()) - .await?; - - resolved_inputs.push(ResolvedTransformInput { - // TODO: This likely needs to be changed into query alias - name: handle.alias.dataset_name.clone(), - handle, - }); - } + let resolved_dataset = self.dataset_registry.get_dataset_by_handle(dataset_handle); + + let summary = resolved_dataset + .get_summary(GetSummaryOpts::default()) + .await + .int_err()?; + + let mut resolved_inputs = Vec::new(); + for input_id in &summary.dependencies { + let handle = self + .dataset_registry + .resolve_dataset_handle_by_ref(&input_id.as_local_ref()) + .await?; + + resolved_inputs.push(ResolvedTransformInput { + // TODO: This likely needs to be changed into query alias + name: handle.alias.dataset_name.clone(), + handle, + }); + } - let dataset_info = NodeInfo::Local { - id: summary.id.clone(), - alias: dataset_handle.alias.clone(), - kind: summary.kind, - dependencies: &resolved_inputs, - }; - - if visitor.enter(&dataset_info) { - for input in &resolved_inputs { - self.visit_upstream_dependencies_rec(&input.handle, visitor) - .await?; - } + let dataset_info = NodeInfo::Local { + id: summary.id.clone(), + alias: dataset_handle.alias.clone(), + kind: summary.kind, + dependencies: &resolved_inputs, + }; - visitor.exit(&dataset_info); + if visitor.enter(&dataset_info) { + for input in &resolved_inputs { + self.visit_upstream_dependencies_rec(&input.handle, visitor) + .await?; } - } else { - // Remote dataset - let dataset_info = NodeInfo::Remote { - id: dataset_handle.id.clone(), - alias: dataset_handle.alias.clone(), - }; - - visitor.enter(&dataset_info); + + visitor.exit(&dataset_info); } Ok(()) @@ -108,7 +96,10 @@ impl ProvenanceService for ProvenanceServiceImpl { visitor: &mut dyn LineageVisitor, _options: LineageOptions, ) -> Result<(), GetLineageError> { - let hdl = self.dataset_repo.resolve_dataset_ref(dataset_ref).await?; + let hdl = self + .dataset_registry + .resolve_dataset_handle_by_ref(dataset_ref) + .await?; self.visit_upstream_dependencies_rec(&hdl, visitor).await } } diff --git a/src/infra/core/src/pull_request_planner_impl.rs 
b/src/infra/core/src/pull_request_planner_impl.rs new file mode 100644 index 0000000000..815f04e6de --- /dev/null +++ b/src/infra/core/src/pull_request_planner_impl.rs @@ -0,0 +1,759 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::cmp::Ordering; +use std::collections::HashMap; +use std::sync::Arc; + +use dill::*; +use internal_error::{InternalError, ResultIntoInternal}; +use kamu_accounts::CurrentAccountSubject; +use kamu_core::*; +use opendatafabric::*; +use url::Url; + +use crate::SyncRequestBuilder; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct PullRequestPlannerImpl { + dataset_registry: Arc, + remote_alias_registry: Arc, + transform_request_planner: Arc, + sync_request_builder: Arc, + current_account_subject: Arc, +} + +#[component(pub)] +#[interface(dyn PullRequestPlanner)] +impl PullRequestPlannerImpl { + pub fn new( + dataset_registry: Arc, + remote_alias_registry: Arc, + transform_request_planner: Arc, + sync_request_builder: Arc, + current_account_subject: Arc, + ) -> Self { + Self { + dataset_registry, + remote_alias_registry, + transform_request_planner, + sync_request_builder, + current_account_subject, + } + } + + // This function descends down the dependency tree of datasets (starting with + // provided references) assigning depth index to every dataset in the + // graph(s). Datasets that share the same depth level are independent and + // can be pulled in parallel. 
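+ // For example: root datasets A and B resolve to depth 0, a derivative C(A, B) + // resolves to depth 1, and a derivative D(C) to depth 2, so A and B form one + // parallel iteration while C and D are processed in later iterations.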
+ #[tracing::instrument(level = "debug", skip_all, fields(?requests, ?options))] + async fn collect_pull_graph( + &self, + requests: &[PullRequest], + options: &PullOptions, + tenancy_config: TenancyConfig, + ) -> (Vec, Vec) { + let mut errors = Vec::new(); + + let mut depth_first_traversal = PullGraphDepthFirstTraversal::new( + self.dataset_registry.clone(), + self.remote_alias_registry.clone(), + self.current_account_subject.clone(), + options, + tenancy_config, + ); + + for pr in requests { + match depth_first_traversal + .traverse_pull_graph( + pr, /* pull request */ + true, /* referenced_explicitly */ + true, /* scan dependent */ + ) + .await + { + Ok(_) => {} + Err(e) => errors.push(PullResponse { + maybe_original_request: Some(pr.clone()), + maybe_local_ref: None, + maybe_remote_ref: None, + result: Err(e), + }), + } + } + + let visited = depth_first_traversal.visited; + let mut ordered = Vec::with_capacity(visited.len()); + ordered.extend(visited.into_values()); + ordered.sort(); + (ordered, errors) + } + + #[tracing::instrument(level = "debug", skip_all, fields(?request, ?options))] + async fn build_single_node_pull_graph( + &self, + request: &PullRequest, + options: &PullOptions, + tenancy_config: TenancyConfig, + ) -> (Vec, Vec) { + let mut depth_first_traversal = PullGraphDepthFirstTraversal::new( + self.dataset_registry.clone(), + self.remote_alias_registry.clone(), + self.current_account_subject.clone(), + options, + tenancy_config, + ); + + if let Err(e) = depth_first_traversal + .traverse_pull_graph( + request, /* pull request */ + true, /* referenced_explicitly */ + false, /* scan dependent */ + ) + .await + { + let error_response = PullResponse { + maybe_original_request: Some(request.clone()), + maybe_local_ref: None, + maybe_remote_ref: None, + result: Err(e), + }; + return (vec![], vec![error_response]); + } + + let visited = depth_first_traversal.visited; + assert_eq!(visited.len(), 1); + let the_item = visited.into_values().next().unwrap(); + (vec![the_item], vec![]) + } + + fn slice_by_depth(&self, mut plan: Vec) -> (i32, Vec, Vec) { + let first_depth = plan[0].depth; + let count = plan.iter().take_while(|pi| pi.depth == first_depth).count(); + let rest = plan.split_off(count); + (first_depth, plan, rest) + } + + #[tracing::instrument(level = "debug", skip_all, fields(?pi))] + fn build_ingest_item(&self, pi: PullItem) -> PullIngestItem { + assert!(pi.maybe_remote_ref.is_none()); + + let hdl = match pi.local_target { + PullLocalTarget::Existing(local_handle) => local_handle, + PullLocalTarget::ToCreate(_) => { + unreachable!("Ingest flows expect to work with existing local targets") + } + }; + + PullIngestItem { + depth: pi.depth, + target: self.dataset_registry.get_dataset_by_handle(&hdl), + maybe_original_request: pi.maybe_original_request, + } + } + + #[tracing::instrument(level = "debug", skip_all, fields(?pi))] + async fn build_transform_item(&self, pi: PullItem) -> Result { + assert!(pi.maybe_remote_ref.is_none()); + + let hdl = match pi.local_target { + PullLocalTarget::Existing(local_handle) => local_handle, + PullLocalTarget::ToCreate(_) => { + unreachable!("Transform flows expect to work with existing local targets") + } + }; + + let target = self.dataset_registry.get_dataset_by_handle(&hdl); + + match self + .transform_request_planner + .build_transform_preliminary_plan(target.clone()) + .await + { + Ok(plan) => Ok(PullTransformItem { + depth: pi.depth, + target, + maybe_original_request: pi.maybe_original_request, + plan, + }), + Err(e) => 
Err(PullResponse { + maybe_original_request: pi.maybe_original_request, + maybe_local_ref: Some(hdl.as_local_ref()), + maybe_remote_ref: None, + result: Err(PullError::TransformError(TransformError::Plan(e))), + }), + } + } + + #[tracing::instrument(level = "debug", skip_all, fields(?pi, ?sync_options))] + async fn build_sync_item( + &self, + pi: PullItem, + sync_options: SyncOptions, + ) -> Result { + assert!(pi.maybe_remote_ref.is_some()); + + let remote_ref = pi.maybe_remote_ref.unwrap(); + + match self + .sync_request_builder + .build_sync_request( + remote_ref.as_any_ref(), + pi.local_target.as_any_ref(), + sync_options.create_if_not_exists, + ) + .await + { + Ok(sync_request) => Ok(PullSyncItem { + depth: pi.depth, + local_target: pi.local_target, + remote_ref, + maybe_original_request: pi.maybe_original_request, + sync_request: Box::new(sync_request), + }), + Err(e) => Err(PullResponse { + maybe_original_request: pi.maybe_original_request, + maybe_local_ref: Some(pi.local_target.as_local_ref()), + maybe_remote_ref: Some(remote_ref), + result: Err(PullError::SyncError(e)), + }), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl PullRequestPlanner for PullRequestPlannerImpl { + #[tracing::instrument(level = "debug", skip_all, fields(?request, ?options))] + async fn build_pull_plan( + &self, + request: PullRequest, + options: &PullOptions, + tenancy_config: TenancyConfig, + ) -> Result { + assert!(!options.recursive); + + let (mut plan, mut errors) = self + .build_pull_multi_plan(&[request], options, tenancy_config) + .await; + assert!(plan.len() == 1 && errors.is_empty() || plan.is_empty() && errors.len() == 1); + if plan.is_empty() { + let the_error = errors.remove(0); + Err(the_error) + } else { + let mut the_iteration = plan.remove(0).jobs; + assert_eq!(the_iteration.len(), 1); + let the_job = the_iteration.remove(0); + Ok(the_job) + } + } + + #[tracing::instrument(level = "debug", skip_all, fields(?requests, ?options))] + async fn build_pull_multi_plan( + &self, + requests: &[PullRequest], + options: &PullOptions, + tenancy_config: TenancyConfig, + ) -> (Vec, Vec) { + // If there is just 1 dataset, and no recursion set, do a simplified procedure. + // Otherwise, do a hierarchical scan trying to find relations + let (mut plan, errors) = if requests.len() == 1 && !options.recursive { + self.build_single_node_pull_graph(&requests[0], options, tenancy_config) + .await + } else { + self.collect_pull_graph(requests, options, tenancy_config) + .await + }; + + tracing::info!( + num_iterations = plan.len(), + num_errors = errors.len(), + ?plan, + "Resolved pull graph" + ); + if !errors.is_empty() { + return (vec![], errors); + } + + if !options.recursive { + // Leave only datasets explicitly mentioned, preserving the depth order + plan.retain(|pi| pi.maybe_original_request.is_some()); + } + + tracing::info!(num_items = plan.len(), ?plan, "Retained pull graph"); + + let mut iterations = Vec::new(); + let mut errors = Vec::new(); + + let mut rest = plan; + while !rest.is_empty() { + let (depth, batch, tail) = self.slice_by_depth(rest); + rest = tail; + + tracing::debug!( + depth, + num_items = batch.len(), + ?batch, + "Detailing pull graph iteration" + ); + + let mut jobs = Vec::new(); + for item in batch { + // Ingest? 
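+ // (depth 0 without a remote ref can only be an existing local root dataset, + // so it is refreshed via polling ingest)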
+ if depth == 0 && item.maybe_remote_ref.is_none() { + let pii = self.build_ingest_item(item); + tracing::debug!(depth, ?pii, "Added ingest item to pull plan"); + jobs.push(PullPlanIterationJob::Ingest(pii)); + + // Sync? + } else if depth == 0 && item.maybe_remote_ref.is_some() { + match self.build_sync_item(item, options.sync_options).await { + Ok(psi) => { + tracing::debug!(depth, ?psi, "Added sync item to pull plan"); + jobs.push(PullPlanIterationJob::Sync(psi)); + } + Err(sync_error) => { + errors.push(sync_error); + } + } + } + // Transform otherwise + else { + match self.build_transform_item(item).await { + Ok(pti) => { + tracing::debug!(depth, ?pti, "Added transform item to pull plan"); + jobs.push(PullPlanIterationJob::Transform(pti)); + } + Err(transform_error) => { + errors.push(transform_error); + } + } + } + } + + if !jobs.is_empty() { + iterations.push(PullPlanIteration { depth, jobs }); + } + } + + (iterations, errors) + } + + #[tracing::instrument(level = "debug", skip_all, fields(?options))] + async fn build_pull_plan_all_owner_datasets( + &self, + options: &PullOptions, + tenancy_config: TenancyConfig, + ) -> Result<(Vec, Vec), InternalError> { + use futures::TryStreamExt; + let requests: Vec<_> = self + .dataset_registry + .all_dataset_handles_by_owner(self.current_account_subject.account_name()) + .map_ok(|hdl| PullRequest::local(hdl.as_local_ref())) + .try_collect() + .await?; + + Ok(self + .build_pull_multi_plan(&requests, options, tenancy_config) + .await) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +struct PullGraphDepthFirstTraversal<'a> { + dataset_registry: Arc, + remote_alias_registry: Arc, + current_account_subject: Arc, + options: &'a PullOptions, + tenancy_config: TenancyConfig, + visited: HashMap, +} + +impl<'a> PullGraphDepthFirstTraversal<'a> { + fn new( + dataset_registry: Arc, + remote_alias_registry: Arc, + current_account_subject: Arc, + options: &'a PullOptions, + tenancy_config: TenancyConfig, + ) -> Self { + Self { + dataset_registry, + remote_alias_registry, + current_account_subject, + options, + tenancy_config, + visited: HashMap::new(), + } + } + + #[async_recursion::async_recursion] + async fn traverse_pull_graph( + &mut self, + request: &PullRequest, + referenced_explicitly: bool, + traverse_dependencies: bool, + ) -> Result { + tracing::debug!(?request, %referenced_explicitly, %traverse_dependencies, "Entering pull graph node"); + + // Resolve local dataset handle, if dataset exists + let maybe_local_handle = self.try_resolve_local_handle(request).await?; + + // If dataset is not found, and auto-create is disabled, it's en error + if maybe_local_handle.is_none() && !self.options.sync_options.create_if_not_exists { + return Err(PullError::InvalidOperation( + "Dataset does not exist and auto-create is switched off".to_owned(), + )); + } + + // Resolve the name of a local dataset if it exists + // or a name to create dataset with if syncing from remote and creation is + // allowed + let local_alias = self.form_local_alias(maybe_local_handle.as_ref(), request)?; + + // Already visited? 
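+ // (the local alias serves as the memoization key: a node reached again via + // another dependency path only records the explicit request and reuses its depth)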
+ if let Some(pi) = self.visited.get_mut(&local_alias) { + tracing::debug!("Already visited - continuing"); + if referenced_explicitly { + pi.maybe_original_request = Some(request.clone()); + } + return Ok(pi.depth); + } + + // Resolve remote alias, if any + let maybe_remote_ref = self + .resolve_remote_ref(request, maybe_local_handle.as_ref()) + .await?; + + let mut pull_item = if maybe_remote_ref.is_some() { + // Datasets synced from remotes are depth 0 + let local_target = if let Some(local_handle) = maybe_local_handle { + PullLocalTarget::existing(local_handle) + } else { + PullLocalTarget::to_create(local_alias.clone()) + }; + PullItem { + maybe_original_request: None, // May be set below + depth: 0, + local_target, + maybe_remote_ref, + } + } else { + // Pulling an existing local root or derivative dataset + let local_handle = maybe_local_handle.unwrap(); + + // Read summary + let summary = self + .dataset_registry + .get_dataset_by_handle(&local_handle) + .get_summary(GetSummaryOpts::default()) + .await + .int_err()?; + + // Plan up-stream dependencies first + let max_dep_depth = if traverse_dependencies { + self.traverse_upstream_datasets(summary).await? + } else { + // Without scanning upstreams, decide on depth based on Root/Derived kind. + // The exact depth is not important, as long as we keep `depth=>0` for derived + // datasets. + match summary.kind { + DatasetKind::Root => -1, + DatasetKind::Derivative => 0, + } + }; + + // Plan the current dataset as last + PullItem { + maybe_original_request: None, // May be set below + depth: max_dep_depth + 1, + local_target: PullLocalTarget::existing(local_handle), + maybe_remote_ref: None, + } + }; + + if referenced_explicitly { + pull_item.maybe_original_request = Some(request.clone()); + } + + tracing::debug!(?pull_item, "Resolved pull graph node"); + + let depth = pull_item.depth; + self.visited.insert(local_alias.clone(), pull_item); + Ok(depth) + } + + async fn try_resolve_local_handle( + &self, + request: &PullRequest, + ) -> Result, PullError> { + let maybe_local_handle = match request { + PullRequest::Local(local_ref) => { + match self + .dataset_registry + .try_resolve_dataset_handle_by_ref(local_ref) + .await? + { + Some(hdl) => Some(hdl), + None => { + return Err(PullError::NotFound(DatasetNotFoundError { + dataset_ref: local_ref.clone(), + })) + } + } + } + PullRequest::Remote(remote) => { + let maybe_local_handle = if let Some(local_alias) = &remote.maybe_local_alias { + self.dataset_registry + .try_resolve_dataset_handle_by_ref(&local_alias.as_local_ref()) + .await? + } else { + None + }; + if maybe_local_handle.is_none() { + self.try_inverse_lookup_dataset_by_pull_alias(&remote.remote_ref) + .await? + } else { + maybe_local_handle + } + } + }; + + Ok(maybe_local_handle) + } + + // TODO: avoid traversing all datasets for every alias + async fn try_inverse_lookup_dataset_by_pull_alias( + &self, + remote_ref: &DatasetRefRemote, + ) -> Result, InternalError> { + // Do a quick check when remote and local names match + if let Some(remote_name) = remote_ref.dataset_name() { + if let Some(local_handle) = self + .dataset_registry + .try_resolve_dataset_handle_by_ref( + &DatasetAlias::new(None, remote_name.clone()).as_local_ref(), + ) + .await? + { + if self + .remote_alias_registry + .get_remote_aliases(&local_handle) + .await + .int_err()? 
+ .contains(remote_ref, RemoteAliasKind::Pull) + { + return Ok(Some(local_handle)); + } + } + } + + // No luck - now have to search through aliases (of current user) + if let CurrentAccountSubject::Logged(l) = self.current_account_subject.as_ref() { + use tokio_stream::StreamExt; + let mut datasets = self + .dataset_registry + .all_dataset_handles_by_owner(&l.account_name); + while let Some(dataset_handle) = datasets.next().await { + let dataset_handle = dataset_handle?; + if self + .remote_alias_registry + .get_remote_aliases(&dataset_handle) + .await + .int_err()? + .contains(remote_ref, RemoteAliasKind::Pull) + { + return Ok(Some(dataset_handle)); + } + } + } + + Ok(None) + } + + fn form_local_alias( + &self, + maybe_local_handle: Option<&DatasetHandle>, + request: &PullRequest, + ) -> Result { + let local_alias = if let Some(hdl) = maybe_local_handle { + // Target exists + hdl.alias.clone() + } else { + match request { + PullRequest::Local(local_ref) => { + // Target does not exist but was provided + if let Some(alias) = local_ref.alias() { + alias.clone() + } else { + return Err(PullError::NotFound(DatasetNotFoundError { + dataset_ref: local_ref.clone(), + })); + } + } + PullRequest::Remote(remote) => { + if let Some(local_alias) = &remote.maybe_local_alias { + local_alias.clone() + } else { + self.infer_alias_from_remote_ref(&remote.remote_ref)? + } + } + } + }; + + Ok(local_alias) + } + + fn infer_alias_from_remote_ref( + &self, + remote_ref: &DatasetRefRemote, + ) -> Result { + Ok(match &remote_ref { + DatasetRefRemote::ID(_, _) => { + unimplemented!("Pulling from remote by ID is not supported") + } + + DatasetRefRemote::Alias(alias) + | DatasetRefRemote::Handle(DatasetHandleRemote { alias, .. }) => { + DatasetAlias::new(None, alias.dataset_name.clone()) + } + + DatasetRefRemote::Url(url) => DatasetAlias::new( + if self.tenancy_config == TenancyConfig::MultiTenant { + Some(self.current_account_subject.account_name().clone()) + } else { + None + }, + self.infer_local_name_from_url(url)?, + ), + }) + } + + fn infer_local_name_from_url(&self, url: &Url) -> Result { + // Try to use last path segment for a name (ignoring the trailing slash) + if let Some(path) = url.path_segments() { + if let Some(last_segment) = path.rev().find(|s| !s.is_empty()) { + if let Ok(name) = DatasetName::try_from(last_segment) { + return Ok(name); + } + } + } + // Fall back to using domain name + if let Some(url::Host::Domain(host)) = url.host() { + if let Ok(name) = DatasetName::try_from(host) { + return Ok(name); + } + } + Err(PullError::InvalidOperation( + "Unable to infer local name from remote URL, please specify the destination explicitly" + .to_owned(), + )) + } + + async fn resolve_remote_ref( + &self, + request: &PullRequest, + maybe_local_handle: Option<&DatasetHandle>, + ) -> Result, PullError> { + let remote_ref = if let PullRequest::Remote(remote) = request { + Ok(Some(remote.remote_ref.clone())) + } else if let Some(hdl) = &maybe_local_handle { + self.resolve_pull_alias(hdl).await + } else { + Ok(None) + }?; + + Ok(remote_ref) + } + + async fn resolve_pull_alias( + &self, + hdl: &DatasetHandle, + ) -> Result, PullError> { + let remote_aliases = match self.remote_alias_registry.get_remote_aliases(hdl).await { + Ok(v) => Ok(v), + Err(e) => match e { + GetAliasesError::Internal(e) => Err(PullError::Internal(e)), + }, + }?; + + let mut pull_aliases: Vec<_> = remote_aliases.get_by_kind(RemoteAliasKind::Pull).collect(); + + match pull_aliases.len() { + 0 => Ok(None), + 1 => 
Ok(Some(pull_aliases.remove(0).clone())), + _ => Err(PullError::AmbiguousSource), + } + } + + // TODO: consider using data from dependency graph + async fn traverse_upstream_datasets( + &mut self, + summary: DatasetSummary, + ) -> Result { + // TODO: EVO: Should be accounting for historical dependencies, not only current + // ones? + let mut max_dep_depth = -1; + + for dependency_id in summary.dependencies { + tracing::debug!(%dependency_id, "Descending into dependency"); + + let depth = self + .traverse_pull_graph( + &PullRequest::local(dependency_id.as_local_ref()), + false, + true, + ) + .await?; + max_dep_depth = std::cmp::max(max_dep_depth, depth); + } + + Ok(max_dep_depth) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +struct PullItem { + depth: i32, + local_target: PullLocalTarget, + maybe_remote_ref: Option, + maybe_original_request: Option, +} + +impl PartialOrd for PullItem { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for PullItem { + fn cmp(&self, other: &Self) -> Ordering { + let depth_ord = self.depth.cmp(&other.depth); + if depth_ord != Ordering::Equal { + return depth_ord; + } + + if self.maybe_remote_ref.is_some() != other.maybe_remote_ref.is_some() { + return if self.maybe_remote_ref.is_some() { + Ordering::Less + } else { + Ordering::Greater + }; + } + + self.local_target.alias().cmp(other.local_target.alias()) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/pull_service_impl.rs b/src/infra/core/src/pull_service_impl.rs deleted file mode 100644 index a24a4a446f..0000000000 --- a/src/infra/core/src/pull_service_impl.rs +++ /dev/null @@ -1,791 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - -use std::cmp::Ordering; -use std::collections::HashMap; -use std::sync::Arc; - -use chrono::prelude::*; -use dill::*; -use internal_error::{ErrorIntoInternal, InternalError, ResultIntoInternal}; -use kamu_accounts::CurrentAccountSubject; -use kamu_core::*; -use kamu_ingest_datafusion::DataWriterDataFusion; -use opendatafabric::*; -use time_source::SystemTimeSource; -use url::Url; - -pub struct PullServiceImpl { - dataset_repo: Arc, - remote_alias_reg: Arc, - ingest_svc: Arc, - transform_svc: Arc, - sync_svc: Arc, - system_time_source: Arc, - current_account_subject: Arc, - dataset_action_authorizer: Arc, -} - -#[component(pub)] -#[interface(dyn PullService)] -impl PullServiceImpl { - pub fn new( - dataset_repo: Arc, - remote_alias_reg: Arc, - ingest_svc: Arc, - transform_svc: Arc, - sync_svc: Arc, - system_time_source: Arc, - current_account_subject: Arc, - dataset_action_authorizer: Arc, - ) -> Self { - Self { - dataset_repo, - remote_alias_reg, - ingest_svc, - transform_svc, - sync_svc, - system_time_source, - current_account_subject, - dataset_action_authorizer, - } - } - - // This function descends down the dependency tree of datasets (starting with - // provided references) assigning depth index to every dataset in the - // graph(s). 
Datasets that share the same depth level are independent and - // can be pulled in parallel. - async fn collect_pull_graph( - &self, - requests: impl IntoIterator, - options: &PullMultiOptions, - ) -> (Vec, Vec) { - let mut visited = HashMap::new(); - let mut errors = Vec::new(); - - for pr in requests { - match self - .collect_pull_graph_depth_first(pr, true, options, &mut visited) - .await - { - Ok(_) => {} - Err(e) => errors.push(PullResponse { - original_request: Some(pr.clone()), - local_ref: None, - remote_ref: None, - result: Err(e), - }), - } - } - - let mut ordered = Vec::with_capacity(visited.len()); - ordered.extend(visited.into_values()); - ordered.sort(); - (ordered, errors) - } - - #[async_recursion::async_recursion] - async fn collect_pull_graph_depth_first( - &self, - request: &PullRequest, - referenced_explicitly: bool, - options: &PullMultiOptions, - visited: &mut HashMap, - ) -> Result { - tracing::debug!(?request, "Entering node"); - - // Resolve local dataset if it exists - let local_handle = if let Some(local_ref) = &request.local_ref { - let local_handle = self.dataset_repo.try_resolve_dataset_ref(local_ref).await?; - if local_handle.is_none() && request.remote_ref.is_none() { - // Dataset does not exist locally nor remote ref was provided - return Err(PullError::NotFound(DatasetNotFoundError { - dataset_ref: local_ref.clone(), - })); - } - local_handle - } else if let Some(remote_ref) = &request.remote_ref { - self.try_inverse_lookup_dataset_by_pull_alias(remote_ref) - .await? - } else { - panic!("Pull request must contain either local or remote reference") - }; - - // Resolve the name of a local dataset if it exists - // or a name to create dataset with if syncing from remote and creation is - // allowed - let local_alias = if let Some(hdl) = &local_handle { - // Target exists - hdl.alias.clone() - } else if let Some(local_ref) = &request.local_ref { - // Target does not exist but was provided - if let Some(alias) = local_ref.alias() { - alias.clone() - } else { - return Err(PullError::NotFound(DatasetNotFoundError { - dataset_ref: local_ref.clone(), - })); - } - } else { - // Infer target name from remote reference - // TODO: Inferred name can already exist, should we care? - match &request.remote_ref { - Some(DatasetRefRemote::ID(_, _)) => { - unimplemented!("Pulling from remote by ID is not supported") - } - Some( - DatasetRefRemote::Alias(alias) - | DatasetRefRemote::Handle(DatasetHandleRemote { alias, .. }), - ) => DatasetAlias::new(None, alias.dataset_name.clone()), - Some(DatasetRefRemote::Url(url)) => DatasetAlias::new( - if self.dataset_repo.is_multi_tenant() { - match self.current_account_subject.as_ref() { - CurrentAccountSubject::Anonymous(_) => { - panic!("Anonymous account misused, use multi-tenant alias"); - } - CurrentAccountSubject::Logged(l) => Some(l.account_name.clone()), - } - } else { - None - }, - self.infer_local_name_from_url(url)?, - ), - None => unreachable!(), - } - }; - - if local_handle.is_none() && !options.sync_options.create_if_not_exists { - return Err(PullError::InvalidOperation( - "Dataset does not exist and auto-create is switched off".to_owned(), - )); - } - - // Already visited? 
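// Editor's note: illustrative sketch only, not part of the patch. The pull planning
// logic above assigns a depth to every dataset (0 for ingest/sync targets, >0 for
// derived datasets) and orders the resulting items so that equal-depth items can be
// processed in parallel. The types below are simplified stand-ins, not the crate's
// real `PullItem`; dataset names are hypothetical. Sorting by (depth, remote-ness,
// alias) and slicing contiguous runs yields the batches the executor walks through.

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
struct PlanItem {
    depth: i32,      // 0 = root/remote sync, >0 = derived datasets
    is_remote: bool, // depth-0 items split into sync (remote) vs ingest (local) batches
    alias: String,   // stable tie-breaker within a batch
}

/// Splits a depth-ordered plan into batches of items that may run in parallel.
fn into_batches(mut plan: Vec<PlanItem>) -> Vec<Vec<PlanItem>> {
    plan.sort();
    let mut batches: Vec<Vec<PlanItem>> = Vec::new();
    for item in plan {
        match batches.last_mut() {
            Some(batch)
                if batch[0].depth == item.depth && batch[0].is_remote == item.is_remote =>
            {
                batch.push(item);
            }
            _ => batches.push(vec![item]),
        }
    }
    batches
}

fn main() {
    let plan = vec![
        PlanItem { depth: 1, is_remote: false, alias: "derived.b".into() },
        PlanItem { depth: 0, is_remote: false, alias: "root.a".into() },
        PlanItem { depth: 0, is_remote: true, alias: "mirror.c".into() },
        PlanItem { depth: 0, is_remote: false, alias: "root.d".into() },
    ];
    for (i, batch) in into_batches(plan).iter().enumerate() {
        println!("batch {i}: {batch:?}");
    }
}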
- if let Some(pi) = visited.get_mut(&local_alias) { - tracing::debug!("Already visited - continuing"); - if referenced_explicitly { - pi.original_request = Some(request.clone()); - } - return Ok(pi.depth); - } - - // Resolve remote alias, if any - let remote_ref = if let Some(remote_ref) = &request.remote_ref { - Ok(Some(remote_ref.clone())) - } else if let Some(hdl) = &local_handle { - self.resolve_pull_alias(&hdl.as_local_ref()).await - } else { - Ok(None) - }?; - - let mut pull_item = if remote_ref.is_some() { - // Datasets synced from remotes are depth 0 - PullItem { - original_request: None, // May be set below - depth: 0, - local_ref: local_handle - .map(Into::into) - .unwrap_or(local_alias.clone().into()), - remote_ref, - } - } else { - // Pulling an existing local root or derivative dataset - let local_handle = local_handle.unwrap(); - - let summary = self - .dataset_repo - .get_dataset_by_handle(&local_handle) - .get_summary(GetSummaryOpts::default()) - .await - .int_err()?; - - // TODO: EVO: Should be accounting for historical dependencies, not only current - // ones? - let mut max_dep_depth = -1; - - for dependency_id in summary.dependencies { - tracing::debug!(%dependency_id, "Descending into dependency"); - - let depth = self - .collect_pull_graph_depth_first( - &PullRequest { - local_ref: Some(dependency_id.as_local_ref()), - remote_ref: None, - }, - false, - options, - visited, - ) - .await?; - max_dep_depth = std::cmp::max(max_dep_depth, depth); - } - - PullItem { - original_request: None, // May be set below - depth: max_dep_depth + 1, - local_ref: local_handle.into(), - remote_ref: None, - } - }; - - if referenced_explicitly { - pull_item.original_request = Some(request.clone()); - } - - tracing::debug!(?pull_item, "Resolved node"); - - let depth = pull_item.depth; - visited.insert(local_alias.clone(), pull_item); - Ok(depth) - } - - // TODO: avoid traversing all datasets for every alias - async fn try_inverse_lookup_dataset_by_pull_alias( - &self, - remote_ref: &DatasetRefRemote, - ) -> Result, InternalError> { - // Do a quick check when remote and local names match - if let Some(remote_name) = remote_ref.dataset_name() { - if let Some(local_handle) = self - .dataset_repo - .try_resolve_dataset_ref( - &DatasetAlias::new(None, remote_name.clone()).as_local_ref(), - ) - .await? - { - if self - .remote_alias_reg - .get_remote_aliases(&local_handle.as_local_ref()) - .await - .int_err()? - .contains(remote_ref, RemoteAliasKind::Pull) - { - return Ok(Some(local_handle)); - } - } - } - - // No luck - now have to search through aliases (of current user) - if let CurrentAccountSubject::Logged(l) = self.current_account_subject.as_ref() { - use tokio_stream::StreamExt; - let mut datasets = self.dataset_repo.get_datasets_by_owner(&l.account_name); - while let Some(dataset_handle) = datasets.next().await { - let dataset_handle = dataset_handle?; - - if self - .remote_alias_reg - .get_remote_aliases(&dataset_handle.as_local_ref()) - .await - .int_err()? 
- .contains(remote_ref, RemoteAliasKind::Pull) - { - return Ok(Some(dataset_handle)); - } - } - } - - Ok(None) - } - - async fn resolve_pull_alias( - &self, - local_ref: &DatasetRef, - ) -> Result, PullError> { - let remote_aliases = match self.remote_alias_reg.get_remote_aliases(local_ref).await { - Ok(v) => Ok(v), - Err(GetAliasesError::DatasetNotFound(e)) => Err(PullError::NotFound(e)), - Err(e) => Err(e.int_err().into()), - }?; - - let mut pull_aliases: Vec<_> = remote_aliases.get_by_kind(RemoteAliasKind::Pull).collect(); - - match pull_aliases.len() { - 0 => Ok(None), - 1 => Ok(Some(pull_aliases.remove(0).clone())), - _ => Err(PullError::AmbiguousSource), - } - } - - fn infer_local_name_from_url(&self, url: &Url) -> Result { - // Try to use last path segment for a name (ignoring the trailing slash) - if let Some(path) = url.path_segments() { - if let Some(last_segment) = path.rev().find(|s| !s.is_empty()) { - if let Ok(name) = DatasetName::try_from(last_segment) { - return Ok(name); - } - } - } - // Fall back to using domain name - if let Some(url::Host::Domain(host)) = url.host() { - if let Ok(name) = DatasetName::try_from(host) { - return Ok(name); - } - } - Err(PullError::InvalidOperation( - "Unable to infer local name from remote URL, please specify the destination explicitly" - .to_owned(), - )) - } - - fn slice<'a>(&self, to_slice: &'a [PullItem]) -> (i32, bool, &'a [PullItem], &'a [PullItem]) { - let first = &to_slice[0]; - let count = to_slice - .iter() - .take_while(|pi| { - pi.depth == first.depth && pi.remote_ref.is_some() == first.remote_ref.is_some() - }) - .count(); - ( - first.depth, - first.remote_ref.is_some(), - &to_slice[..count], - &to_slice[count..], - ) - } - - async fn ingest_multi( - &self, - batch: &[PullItem], // TODO: Move to avoid cloning - options: &PullMultiOptions, - listener: Option>, - ) -> Result, InternalError> { - let ingest_requests = batch.iter().map(|pi| pi.local_ref.clone()).collect(); - - let ingest_responses = self - .ingest_svc - .ingest_multi(ingest_requests, options.ingest_options.clone(), listener) - .await; - - assert_eq!(batch.len(), ingest_responses.len()); - - Ok(std::iter::zip(batch, ingest_responses) - .map(|(pi, res)| { - assert_eq!(pi.local_ref, res.dataset_ref); - pi.clone().into_response_ingest(res) - }) - .collect()) - } - - async fn sync_multi( - &self, - batch: &[PullItem], // TODO: Move to avoid cloning - options: &PullMultiOptions, - listener: Option>, - ) -> Result, InternalError> { - let sync_requests = batch - .iter() - .map(|pi| SyncRequest { - src: pi.remote_ref.as_ref().unwrap().into(), - dst: pi.local_ref.as_any_ref(), - }) - .collect(); - - let sync_results = self - .sync_svc - .sync_multi(sync_requests, options.sync_options.clone(), listener) - .await; - - assert_eq!(batch.len(), sync_results.len()); - - let results: Vec<_> = std::iter::zip(batch, sync_results) - .map(|(pi, res)| { - assert_eq!(pi.local_ref.as_any_ref(), res.dst); - pi.clone().into_response_sync(res) - }) - .collect(); - - // Associate newly-synced datasets with remotes - if options.add_aliases { - for res in &results { - if let Ok(PullResult::Updated { old_head: None, .. }) = res.result { - if let Some(remote_ref) = &res.remote_ref { - self.remote_alias_reg - .get_remote_aliases(res.local_ref.as_ref().unwrap()) - .await - .int_err()? 
- .add(remote_ref, RemoteAliasKind::Pull) - .await?; - } - } - } - } - - Ok(results) - } - - async fn transform_multi( - &self, - batch: &[PullItem], // TODO: Move to avoid cloning - transform_listener: Option>, - reset_derivatives_on_diverged_input: bool, - ) -> Result, InternalError> { - let transform_requests = batch.iter().map(|pi| pi.local_ref.clone()).collect(); - - let transform_results = self - .transform_svc - .transform_multi( - transform_requests, - TransformOptions { - reset_derivatives_on_diverged_input, - }, - transform_listener, - ) - .await; - - assert_eq!(batch.len(), transform_results.len()); - - Ok(std::iter::zip(batch, transform_results) - .map(|(pi, res)| { - assert_eq!(pi.local_ref, res.0); - pi.clone().into_response_transform(res) - }) - .collect()) - } -} - -#[async_trait::async_trait] -impl PullService for PullServiceImpl { - #[tracing::instrument(level = "info", skip_all)] - async fn pull( - &self, - dataset_ref: &DatasetRefAny, - options: PullOptions, - listener: Option>, - ) -> Result { - let request = - PullRequest::from_any_ref(dataset_ref, |_| !self.dataset_repo.is_multi_tenant()); - - self.pull_ext(&request, options, listener).await - } - - #[tracing::instrument(level = "info", skip_all)] - async fn pull_ext( - &self, - request: &PullRequest, - options: PullOptions, - listener: Option>, - ) -> Result { - let listener = - listener.map(|l| Arc::new(ListenerMultiAdapter(l)) as Arc); - - // TODO: PERF: If we are updating a single dataset using pull_multi will do A - // LOT of unnecessary work like analyzing the whole dependency graph. - let mut responses = self - .pull_multi_ext( - vec![request.clone()], - PullMultiOptions { - recursive: false, - all: false, - reset_derivatives_on_diverged_input: options - .reset_derivatives_on_diverged_input, - add_aliases: options.add_aliases, - ingest_options: options.ingest_options, - sync_options: options.sync_options, - }, - listener, - ) - .await?; - - assert_eq!(responses.len(), 1); - responses.pop().unwrap().result - } - - #[tracing::instrument(level = "info", skip_all)] - async fn pull_multi( - &self, - dataset_refs: Vec, - options: PullMultiOptions, - listener: Option>, - ) -> Result, InternalError> { - let requests = dataset_refs - .into_iter() - .map(|r| PullRequest::from_any_ref(&r, |_| !self.dataset_repo.is_multi_tenant())) - .collect(); - - self.pull_multi_ext(requests, options, listener).await - } - - #[tracing::instrument(level = "info", skip_all)] - async fn pull_multi_ext( - &self, - requests: Vec, - options: PullMultiOptions, - listener: Option>, - ) -> Result, InternalError> { - let current_account_name = match self.current_account_subject.as_ref() { - CurrentAccountSubject::Anonymous(_) => { - return Err("Anonymous account misused, use multi-tenant alias").int_err() - } - CurrentAccountSubject::Logged(l) => l.account_name.clone(), - }; - let requests: Vec<_> = if !options.all { - requests - } else { - use futures::TryStreamExt; - self.dataset_repo - .get_datasets_by_owner(¤t_account_name) - .map_ok(|hdl| PullRequest { - local_ref: Some(hdl.into()), - remote_ref: None, - }) - .try_collect() - .await? 
- }; - - tracing::info!(?requests, ?options, "Performing pull"); - - let (mut plan, errors) = self.collect_pull_graph(&requests, &options).await; - tracing::info!( - num_items = plan.len(), - num_errors = errors.len(), - ?plan, - "Resolved pull plan" - ); - if !errors.is_empty() { - return Ok(errors); - } - - if !(options.recursive || options.all) { - // Leave only datasets explicitly mentioned, preserving the depth order - plan.retain(|pi| pi.original_request.is_some()); - } - - tracing::info!(num_items = plan.len(), ?plan, "Retained pull plan"); - - let mut results = Vec::with_capacity(plan.len()); - - let mut rest = &plan[..]; - while !rest.is_empty() { - let (depth, is_remote, batch, tail) = self.slice(rest); - rest = tail; - - let results_level: Vec<_> = if depth == 0 && !is_remote { - tracing::info!(%depth, ?batch, "Running ingest batch"); - self.ingest_multi( - batch, - &options, - listener - .as_ref() - .and_then(|l| l.clone().get_ingest_listener()), - ) - .await? - } else if depth == 0 && is_remote { - tracing::info!(%depth, ?batch, "Running sync batch"); - self.sync_multi( - batch, - &options, - listener - .as_ref() - .and_then(|l| l.clone().get_sync_listener()), - ) - .await? - } else { - tracing::info!(%depth, ?batch, "Running transform batch"); - self.transform_multi( - batch, - listener - .as_ref() - .and_then(|l| l.clone().get_transform_listener()), - options.reset_derivatives_on_diverged_input, - ) - .await? - }; - - let errors = results_level.iter().any(|r| r.result.is_err()); - results.extend(results_level); - if errors { - break; - } - } - - Ok(results) - } - - async fn set_watermark( - &self, - dataset_ref: &DatasetRef, - new_watermark: DateTime, - ) -> Result { - let aliases = match self.remote_alias_reg.get_remote_aliases(dataset_ref).await { - Ok(v) => Ok(v), - Err(GetAliasesError::DatasetNotFound(e)) => Err(SetWatermarkError::NotFound(e)), - Err(GetAliasesError::Internal(e)) => Err(SetWatermarkError::Internal(e)), - }?; - - if !aliases.is_empty(RemoteAliasKind::Pull) { - return Err(SetWatermarkError::IsRemote); - } - - let dataset_handle = self.dataset_repo.resolve_dataset_ref(dataset_ref).await?; - self.dataset_action_authorizer - .check_action_allowed(&dataset_handle, auth::DatasetAction::Write) - .await?; - - let dataset = self.dataset_repo.find_dataset_by_ref(dataset_ref).await?; - let summary = dataset - .get_summary(GetSummaryOpts::default()) - .await - .int_err()?; - - if summary.kind != DatasetKind::Root { - return Err(SetWatermarkError::IsDerivative); - } - - let mut writer = - DataWriterDataFusion::builder(dataset, datafusion::prelude::SessionContext::new()) - .with_metadata_state_scanned(None) - .await - .int_err()? 
- .build(); - - match writer - .write_watermark( - new_watermark, - WriteWatermarkOpts { - system_time: self.system_time_source.now(), - new_source_state: None, - }, - ) - .await - { - Ok(res) => Ok(PullResult::Updated { - old_head: Some(res.old_head), - new_head: res.new_head, - }), - Err( - WriteWatermarkError::EmptyCommit(_) - | WriteWatermarkError::CommitError(CommitError::MetadataAppendError( - AppendError::InvalidBlock(AppendValidationError::WatermarkIsNotMonotonic), - )), - ) => Ok(PullResult::UpToDate(PullResultUpToDate::SetWatermark)), - Err(e) => Err(e.int_err().into()), - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Clone, PartialEq, Eq)] -struct PullItem { - depth: i32, - local_ref: DatasetRef, - remote_ref: Option, - original_request: Option, -} - -impl PullItem { - fn into_response_ingest(self, r: PollingIngestResponse) -> PullResponse { - PullResponse { - original_request: self.original_request, - local_ref: Some(r.dataset_ref), - remote_ref: None, - result: match r.result { - Ok(r) => Ok(r.into()), - Err(e) => Err(e.into()), - }, - } - } - - fn into_response_sync(self, r: SyncResultMulti) -> PullResponse { - PullResponse { - original_request: self.original_request, - local_ref: r.dst.as_local_ref(|_| true).ok(), // TODO: multi-tenancy - remote_ref: r.src.as_remote_ref(|_| true).ok(), - result: match r.result { - Ok(r) => Ok(r.into()), - Err(e) => Err(e.into()), - }, - } - } - - fn into_response_transform( - self, - r: (DatasetRef, Result), - ) -> PullResponse { - PullResponse { - original_request: self.original_request, - local_ref: Some(r.0), - remote_ref: None, - result: match r.1 { - Ok(r) => Ok(r.into()), - Err(e) => Err(e.into()), - }, - } - } -} - -impl PartialOrd for PullItem { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for PullItem { - fn cmp(&self, other: &Self) -> Ordering { - let depth_ord = self.depth.cmp(&other.depth); - if depth_ord != Ordering::Equal { - return depth_ord; - } - - if self.remote_ref.is_some() != other.remote_ref.is_some() { - return if self.remote_ref.is_some() { - Ordering::Less - } else { - Ordering::Greater - }; - } - - match (self.local_ref.alias(), other.local_ref.alias()) { - (Some(lhs), Some(rhs)) => lhs.cmp(rhs), - (Some(_), None) => Ordering::Greater, - (None, Some(_)) => Ordering::Less, - _ => Ordering::Equal, - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -struct ListenerMultiAdapter(Arc); - -impl PullMultiListener for ListenerMultiAdapter { - fn get_ingest_listener(self: Arc) -> Option> { - Some(self) - } - - fn get_transform_listener(self: Arc) -> Option> { - Some(self) - } - - fn get_sync_listener(self: Arc) -> Option> { - Some(self) - } -} - -impl PollingIngestMultiListener for ListenerMultiAdapter { - fn begin_ingest(&self, _dataset: &DatasetHandle) -> Option> { - self.0.clone().get_ingest_listener() - } -} - -impl TransformMultiListener for ListenerMultiAdapter { - fn begin_transform(&self, _dataset: &DatasetHandle) -> Option> { - self.0.clone().get_transform_listener() - } -} - -impl SyncMultiListener for ListenerMultiAdapter { - fn begin_sync( - &self, - _src: &DatasetRefAny, - _dst: &DatasetRefAny, - ) -> Option> { - self.0.clone().get_sync_listener() - } -} diff --git a/src/infra/core/src/push_request_planner_impl.rs b/src/infra/core/src/push_request_planner_impl.rs new file mode 
100644 index 0000000000..5f30fe5f42 --- /dev/null +++ b/src/infra/core/src/push_request_planner_impl.rs @@ -0,0 +1,79 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use dill::*; +use kamu_core::*; +use opendatafabric::*; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct PushRequestPlannerImpl { + remote_alias_resolver: Arc, +} + +#[component(pub)] +#[interface(dyn PushRequestPlanner)] +impl PushRequestPlannerImpl { + pub fn new(remote_alias_resolver: Arc) -> Self { + Self { + remote_alias_resolver, + } + } + + async fn collect_push_plan_item( + &self, + local_handle: DatasetHandle, + push_target: Option<&DatasetPushTarget>, + ) -> Result { + tracing::debug!(%local_handle, "Resolved push plan local target"); + + match self + .remote_alias_resolver + .resolve_push_target(&local_handle, push_target.cloned()) + .await + { + Ok(remote_target) => Ok(PushItem { + local_handle, + remote_target, + push_target: push_target.cloned(), + }), + Err(e) => Err(PushResponse { + local_handle: Some(local_handle), + target: push_target.cloned(), + result: Err(e.into()), + }), + } + } +} + +#[async_trait::async_trait] +impl PushRequestPlanner for PushRequestPlannerImpl { + #[tracing::instrument(level = "debug", skip_all, fields(?dataset_handles, ?push_target))] + async fn collect_plan( + &self, + dataset_handles: &[DatasetHandle], + push_target: Option<&DatasetPushTarget>, + ) -> (Vec, Vec) { + let mut plan = Vec::new(); + let mut errors = Vec::new(); + + for hdl in dataset_handles { + match self.collect_push_plan_item(hdl.clone(), push_target).await { + Ok(item) => plan.push(item), + Err(err) => errors.push(err), + } + } + + (plan, errors) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/push_service_impl.rs b/src/infra/core/src/push_service_impl.rs deleted file mode 100644 index 3b213fe8d3..0000000000 --- a/src/infra/core/src/push_service_impl.rs +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
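// Editor's note: illustrative sketch, not part of the patch. It mirrors the planner
// pattern introduced above in `PushRequestPlannerImpl::collect_plan` (and used by the
// pull planner as well): each input is resolved independently, successes go into the
// plan and failures become per-item responses, so one bad dataset does not abort the
// whole batch. `Target`, `PlanError`, and the URLs/names are hypothetical stand-ins,
// not kamu types.

#[derive(Debug)]
struct Target {
    name: String,
    url: String,
}

#[derive(Debug)]
struct PlanError {
    name: String,
    reason: String,
}

/// Resolves a push target for a single dataset name; fails for names that have no
/// configured remote (a stand-in for remote alias resolution).
fn resolve_target(name: &str) -> Result<Target, PlanError> {
    if name.starts_with("local-only") {
        Err(PlanError {
            name: name.to_string(),
            reason: "no push alias configured".to_string(),
        })
    } else {
        Ok(Target {
            name: name.to_string(),
            url: format!("https://node.example.com/{name}"),
        })
    }
}

/// Collects a plan plus a list of per-item errors instead of failing fast.
fn collect_plan(names: &[&str]) -> (Vec<Target>, Vec<PlanError>) {
    let mut plan = Vec::new();
    let mut errors = Vec::new();
    for name in names {
        match resolve_target(name) {
            Ok(item) => plan.push(item),
            Err(err) => errors.push(err),
        }
    }
    (plan, errors)
}

fn main() {
    let (plan, errors) = collect_plan(&["covid.cases", "local-only.scratch"]);
    println!("plan: {plan:?}");
    println!("errors: {errors:?}");
}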
- -use std::sync::Arc; - -use dill::*; -use kamu_core::*; -use opendatafabric::*; - -pub struct PushServiceImpl { - dataset_repo: Arc, - remote_alias_reg: Arc, - remote_alias_resolver: Arc, - sync_svc: Arc, -} - -#[component(pub)] -#[interface(dyn PushService)] -impl PushServiceImpl { - pub fn new( - dataset_repo: Arc, - remote_alias_reg: Arc, - remote_alias_resolver: Arc, - sync_svc: Arc, - ) -> Self { - Self { - dataset_repo, - remote_alias_reg, - remote_alias_resolver, - sync_svc, - } - } - - async fn collect_plan( - &self, - items: &Vec, - push_target: &Option, - ) -> (Vec, Vec) { - let mut plan = Vec::new(); - let mut errors = Vec::new(); - - for dataset_ref in items { - match self.collect_plan_item(dataset_ref, push_target).await { - Ok(item) => plan.push(item), - Err(err) => errors.push(err), - } - } - - (plan, errors) - } - - async fn collect_plan_item( - &self, - dataset_ref: &DatasetRef, - push_target: &Option, - ) -> Result { - // Resolve local dataset if we have a local reference - let local_handle = match self.dataset_repo.resolve_dataset_ref(dataset_ref).await { - Ok(h) => h, - Err(e) => { - return Err(PushResponse { - local_handle: None, - target: push_target.clone(), - result: Err(e.into()), - }) - } - }; - - match self - .remote_alias_resolver - .resolve_push_target(&local_handle, push_target.clone()) - .await - { - Ok(remote_target) => Ok(PushItem { - local_handle, - remote_target, - push_target: push_target.clone(), - }), - Err(e) => Err(PushResponse { - local_handle: Some(local_handle), - target: push_target.clone(), - result: Err(e.into()), - }), - } - } -} - -#[async_trait::async_trait] -impl PushService for PushServiceImpl { - async fn push_multi( - &self, - dataset_refs: Vec, - options: PushMultiOptions, - sync_listener: Option>, - ) -> Vec { - if options.recursive { - unimplemented!("Recursive push is not yet supported") - } - if options.all { - unimplemented!("Pushing all datasets is not yet supported") - } - - let (plan, errors) = self - .collect_plan(&dataset_refs, &options.remote_target) - .await; - if !errors.is_empty() { - return errors; - } - - let sync_results = self - .sync_svc - .sync_multi( - plan.iter() - .map(|pi| SyncRequest { - src: pi.local_handle.as_any_ref(), - dst: (&pi.remote_target.url).into(), - }) - .collect(), - options.sync_options, - sync_listener, - ) - .await; - - assert_eq!(plan.len(), sync_results.len()); - - let results: Vec<_> = std::iter::zip(&plan, sync_results) - .map(|(pi, res)| { - let remote_ref: DatasetRefAny = (&pi.remote_target.url).into(); - assert_eq!(pi.local_handle.as_any_ref(), res.src); - assert_eq!(remote_ref, res.dst); - pi.as_response(res.result) - }) - .collect(); - - // If no errors - add aliases to initial items - if options.add_aliases && results.iter().all(|r| r.result.is_ok()) { - for push_item in &plan { - // TODO: Improve error handling - self.remote_alias_reg - .get_remote_aliases(&(push_item.local_handle.as_local_ref())) - .await - .unwrap() - .add( - &((&push_item.remote_target.url).into()), - RemoteAliasKind::Push, - ) - .await - .unwrap(); - } - } - - results - } -} - -#[derive(Debug)] -struct PushItem { - local_handle: DatasetHandle, - remote_target: RemoteTarget, - push_target: Option, -} - -impl PushItem { - fn as_response(&self, result: Result) -> PushResponse { - PushResponse { - local_handle: Some(self.local_handle.clone()), - target: self.push_target.clone(), - result: result.map_err(Into::into), - } - } -} diff --git a/src/infra/core/src/query/mod.rs b/src/infra/core/src/query/mod.rs index 
71c8f22e7b..941e7489e7 100644 --- a/src/infra/core/src/query/mod.rs +++ b/src/infra/core/src/query/mod.rs @@ -11,6 +11,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::sync::{Arc, Mutex}; +use auth::DatasetAction; use datafusion::arrow::datatypes::{Schema, SchemaRef}; use datafusion::catalog::{CatalogProvider, SchemaProvider, Session}; use datafusion::common::{Constraints, Statistics}; @@ -81,40 +82,39 @@ struct KamuSchemaImpl { // from ever being released. session_config: Arc, table_options: Arc, - dataset_repo: Arc, + dataset_registry: Arc, dataset_action_authorizer: Arc, options: QueryOptions, - cache: Mutex, -} - -#[derive(Default)] -struct SchemaCache { - tables: Option>>, + tables: Mutex>>, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// impl KamuSchema { - pub fn new( + pub async fn prepare( session_context: &SessionContext, - dataset_repo: Arc, + dataset_registry: Arc, dataset_action_authorizer: Arc, options: QueryOptions, - ) -> Self { - Self { + ) -> Result { + let schema = Self { inner: Arc::new(KamuSchemaImpl { session_config: Arc::new(session_context.copied_config()), table_options: Arc::new(session_context.copied_table_options()), - dataset_repo, + dataset_registry, dataset_action_authorizer, options, - cache: Mutex::new(SchemaCache::default()), + tables: Mutex::new(HashMap::new()), }), - } + }; + + schema.init_schema_cache().await?; + + Ok(schema) } #[tracing::instrument(level = "info", skip_all)] - async fn init_schema_cache(&self) -> Result { + async fn init_schema_cache(&self) -> Result<(), InternalError> { let mut tables = HashMap::new(); let name_resolution_enabled = self.inner.options.input_datasets.is_empty(); @@ -123,8 +123,8 @@ impl KamuSchema { for (id, opts) in &self.inner.options.input_datasets { let hdl = self .inner - .dataset_repo - .resolve_dataset_ref(&id.as_local_ref()) + .dataset_registry + .resolve_dataset_handle_by_ref(&id.as_local_ref()) .await .int_err()?; @@ -138,15 +138,14 @@ impl KamuSchema { continue; } - let dataset = self.inner.dataset_repo.get_dataset_by_handle(&hdl); + let resolved_dataset = self.inner.dataset_registry.get_dataset_by_handle(&hdl); tables.insert( opts.alias.clone(), Arc::new(KamuTable::new( self.inner.session_config.clone(), self.inner.table_options.clone(), - hdl, - dataset, + resolved_dataset, opts.block_hash.clone(), opts.hints.clone(), )), @@ -156,27 +155,30 @@ impl KamuSchema { // TODO: PERF: Scanning all datasets is not just super expensive - it may not be // possible at the public node scale. We need to patch DataFusion to support // unbounded catalogs. - let mut dataset_handles = self.inner.dataset_repo.get_all_datasets(); + let all_dataset_handles: Vec<_> = self + .inner + .dataset_registry + .all_dataset_handles() + .try_collect() + .await + .int_err()?; - while let Some(hdl) = dataset_handles.try_next().await.int_err()? { - if !self - .inner - .dataset_action_authorizer - .is_action_allowed(&hdl, auth::DatasetAction::Read) - .await? 
- { - continue; - } + let readable_dataset_handles = self + .inner + .dataset_action_authorizer + .filter_datasets_allowing(all_dataset_handles, DatasetAction::Read) + .await + .int_err()?; - let dataset = self.inner.dataset_repo.get_dataset_by_handle(&hdl); + for hdl in readable_dataset_handles { + let resolved_dataset = self.inner.dataset_registry.get_dataset_by_handle(&hdl); tables.insert( hdl.alias.to_string(), Arc::new(KamuTable::new( self.inner.session_config.clone(), self.inner.table_options.clone(), - hdl, - dataset, + resolved_dataset, None, None, )), @@ -184,36 +186,10 @@ impl KamuSchema { } } - Ok(SchemaCache { - tables: Some(tables), - }) - } + let mut guard = self.inner.tables.lock().unwrap(); + *guard = tables; - async fn ensure_cache(&self) -> Result, InternalError> { - { - let cache = self.inner.cache.lock().unwrap(); - if cache.tables.is_some() { - return Ok(cache); - } - } - - let new_cache = self.init_schema_cache().await?; - - { - let mut cache = self.inner.cache.lock().unwrap(); - *cache = new_cache; - Ok(cache) - } - } - - async fn table_names_impl(&self) -> Result, InternalError> { - let cache = self.ensure_cache().await?; - Ok(cache.tables.as_ref().unwrap().keys().cloned().collect()) - } - - async fn table_exist_impl(&self, name: &str) -> Result { - let cache = self.ensure_cache().await?; - Ok(cache.tables.as_ref().unwrap().contains_key(name)) + Ok(()) } } @@ -225,48 +201,20 @@ impl SchemaProvider for KamuSchema { self } - // TODO: Datafusion should make this function async fn table_names(&self) -> Vec { - let this = self.clone(); - - std::thread::spawn(move || { - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - runtime.block_on(this.table_names_impl()) - }) - .join() - .unwrap() - .unwrap() + let guard = self.inner.tables.lock().unwrap(); + guard.keys().cloned().collect() } fn table_exist(&self, name: &str) -> bool { - let this = self.clone(); - let name = name.to_owned(); - - std::thread::spawn(move || { - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - runtime.block_on(this.table_exist_impl(&name)) - }) - .join() - .unwrap() - .unwrap_or(false) + let guard = self.inner.tables.lock().unwrap(); + guard.contains_key(name) } async fn table(&self, name: &str) -> Result>, DataFusionError> { let table = { - let cache = self - .ensure_cache() - .await - .map_err(|e| DataFusionError::External(e.into()))?; - - cache.tables.as_ref().unwrap().get(name).cloned() + let guard = self.inner.tables.lock().unwrap(); + guard.get(name).cloned() }; if let Some(table) = table { @@ -291,8 +239,7 @@ impl SchemaProvider for KamuSchema { pub(crate) struct KamuTable { session_config: Arc, table_options: Arc, - dataset_handle: DatasetHandle, - dataset: Arc, + resolved_dataset: ResolvedDataset, as_of: Option, hints: Option, cache: Mutex, @@ -308,26 +255,24 @@ impl KamuTable { pub(crate) fn new( session_config: Arc, table_options: Arc, - dataset_handle: DatasetHandle, - dataset: Arc, + resolved_dataset: ResolvedDataset, as_of: Option, hints: Option, ) -> Self { Self { session_config, table_options, - dataset_handle, - dataset, + resolved_dataset, as_of, hints, cache: Mutex::new(TableCache::default()), } } - #[tracing::instrument(level="info", skip_all, fields(dataset_handle = ?self.dataset_handle))] + #[tracing::instrument(level="info", skip_all, fields(dataset = ?self.resolved_dataset))] async fn init_table_schema(&self) -> Result { let maybe_set_data_schema = self - .dataset + 
.resolved_dataset .as_metadata_chain() .accept_one(SearchSetDataSchemaVisitor::new()) .await @@ -360,7 +305,7 @@ impl KamuTable { // TODO: A lot of duplication from `SessionContext::read_parquet` - code is // copied as we need table provider and not the `DataFrame` - #[tracing::instrument(level="info", skip_all, fields(dataset_handle = ?self.dataset_handle))] + #[tracing::instrument(level="info", skip_all, fields(dataset = ?self.resolved_dataset))] async fn init_table_provider( &self, schema: SchemaRef, @@ -371,7 +316,7 @@ impl KamuTable { return Ok(Arc::new(EmptyTable::new(schema))); } - let object_repo = self.dataset.as_data_repo(); + let object_repo = self.resolved_dataset.as_data_repo(); let file_urls: Vec = stream::iter(files) .then(|h| async move { object_repo.get_internal_url(&h).await }) .map(Into::into) @@ -426,7 +371,7 @@ impl KamuTable { let hash = if let Some(hash) = as_of { hash.clone() } else { - self.dataset + self.resolved_dataset .as_metadata_chain() .resolve_ref(&BlockRef::Head) .await @@ -447,7 +392,7 @@ impl KamuTable { } let final_state = self - .dataset + .resolved_dataset .as_metadata_chain() .reduce_by_hash( &hash, diff --git a/src/infra/core/src/query_service_impl.rs b/src/infra/core/src/query_service_impl.rs index d38dd1f9ff..3498446def 100644 --- a/src/infra/core/src/query_service_impl.rs +++ b/src/infra/core/src/query_service_impl.rs @@ -31,7 +31,7 @@ use crate::utils::docker_images; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct QueryServiceImpl { - dataset_repo: Arc, + dataset_registry: Arc, object_store_registry: Arc, dataset_action_authorizer: Arc, } @@ -40,18 +40,21 @@ pub struct QueryServiceImpl { #[interface(dyn QueryService)] impl QueryServiceImpl { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, object_store_registry: Arc, dataset_action_authorizer: Arc, ) -> Self { Self { - dataset_repo, + dataset_registry, object_store_registry, dataset_action_authorizer, } } - fn session_context(&self, options: QueryOptions) -> SessionContext { + async fn session_context( + &self, + options: QueryOptions, + ) -> Result { let mut cfg = SessionConfig::new() .with_information_schema(true) .with_default_catalog_and_schema("kamu", "kamu"); @@ -72,16 +75,16 @@ impl QueryServiceImpl { let runtime = Arc::new(RuntimeEnv::new(runtime_config).unwrap()); let session_context = SessionContext::new_with_config_rt(cfg, runtime); - session_context.register_catalog( - "kamu", - Arc::new(KamuCatalog::new(Arc::new(KamuSchema::new( - &session_context, - self.dataset_repo.clone(), - self.dataset_action_authorizer.clone(), - options, - )))), - ); - session_context + let schema = KamuSchema::prepare( + &session_context, + self.dataset_registry.clone(), + self.dataset_action_authorizer.clone(), + options, + ) + .await?; + + session_context.register_catalog("kamu", Arc::new(KamuCatalog::new(Arc::new(schema)))); + Ok(session_context) } /// Unless state is already provided in the options this will attempt to @@ -103,15 +106,15 @@ impl QueryServiceImpl { // SECURITY: We expect that access permissions will be validated during // the query execution and that we're not leaking information here if the // user doesn't have access to this dataset. 
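// Editor's note: illustrative sketch, not part of the patch. It shows the construction
// pattern adopted in `KamuSchema` above: populate the table map once in an async
// `prepare` constructor, then serve DataFusion's synchronous `table_names` /
// `table_exist` calls from a plain `Mutex<HashMap<..>>`, instead of lazily spawning a
// thread with a nested runtime on first access (the deleted code path). Types are
// simplified stand-ins; the only external assumption is a `tokio` runtime.

use std::collections::HashMap;
use std::sync::{Arc, Mutex};

#[derive(Clone)]
struct Schema {
    // alias -> stand-in for a table provider
    tables: Arc<Mutex<HashMap<String, String>>>,
}

impl Schema {
    /// Async constructor: all fallible/async work happens here, exactly once.
    async fn prepare(dataset_aliases: Vec<String>) -> Result<Self, String> {
        let mut tables = HashMap::new();
        for alias in dataset_aliases {
            // Stand-in for resolving the dataset and building its table provider.
            let provider = format!("table for {alias}");
            tables.insert(alias, provider);
        }
        Ok(Self {
            tables: Arc::new(Mutex::new(tables)),
        })
    }

    /// The synchronous accessors become trivial lock-and-read operations.
    fn table_names(&self) -> Vec<String> {
        self.tables.lock().unwrap().keys().cloned().collect()
    }

    fn table_exist(&self, name: &str) -> bool {
        self.tables.lock().unwrap().contains_key(name)
    }
}

#[tokio::main]
async fn main() -> Result<(), String> {
    let schema = Schema::prepare(vec!["covid.cases".into(), "covid.deaths".into()]).await?;
    println!("tables: {:?}", schema.table_names());
    println!("has covid.cases: {}", schema.table_exist("covid.cases"));
    Ok(())
}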
- let dataset = self - .dataset_repo - .find_dataset_by_ref(&id.as_local_ref()) + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(&id.as_local_ref()) .await?; let block_hash = if let Some(block_hash) = opts.block_hash { // Validate that block the user is asking for exists // SECURITY: Are we leaking information here by doing this check before auth? - if !dataset + if !resolved_dataset .as_metadata_chain() .contains_block(&block_hash) .await @@ -122,7 +125,7 @@ impl QueryServiceImpl { block_hash } else { - dataset + resolved_dataset .as_metadata_chain() .resolve_ref(&BlockRef::Head) .await @@ -173,7 +176,11 @@ impl QueryServiceImpl { tracing::warn!(alias, "Ignoring table with invalid alias"); continue; }; - let Ok(hdl) = self.dataset_repo.resolve_dataset_ref(&dataset_ref).await else { + let Ok(hdl) = self + .dataset_registry + .resolve_dataset_handle_by_ref(&dataset_ref) + .await + else { tracing::warn!(?dataset_ref, "Ignoring table with unresolvable alias"); continue; }; @@ -181,9 +188,9 @@ impl QueryServiceImpl { // SECURITY: We expect that access permissions will be validated during // the query execution and that we're not leaking information here if the user // doesn't have access to this dataset. - let dataset = self.dataset_repo.get_dataset_by_handle(&hdl); + let resolved_dataset = self.dataset_registry.get_dataset_by_handle(&hdl); - let block_hash = dataset + let block_hash = resolved_dataset .as_metadata_chain() .resolve_ref(&BlockRef::Head) .await @@ -201,33 +208,31 @@ impl QueryServiceImpl { &self, dataset_ref: &DatasetRef, last_records_to_consider: Option, - ) -> Result<(Arc, DataFrame), QueryError> { - let dataset_handle = self.dataset_repo.resolve_dataset_ref(dataset_ref).await?; - - self.dataset_action_authorizer - .check_action_allowed(&dataset_handle, DatasetAction::Read) + ) -> Result<(ResolvedDataset, DataFrame), QueryError> { + let resolved_dataset = self.resolve_dataset(dataset_ref).await?; + + let ctx = self + .session_context(QueryOptions { + input_datasets: BTreeMap::from([( + resolved_dataset.get_id().clone(), + QueryOptionsDataset { + alias: resolved_dataset.get_alias().to_string(), + block_hash: None, + hints: Some(DatasetQueryHints { + last_records_to_consider, + }), + }, + )]), + }) .await?; - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); - - let ctx = self.session_context(QueryOptions { - input_datasets: BTreeMap::from([( - dataset_handle.id.clone(), - QueryOptionsDataset { - alias: dataset_handle.alias.to_string(), - block_hash: None, - hints: Some(DatasetQueryHints { - last_records_to_consider, - }), - }, - )]), - }); - let df = ctx - .table(TableReference::bare(dataset_handle.alias.to_string())) + .table(TableReference::bare( + resolved_dataset.get_alias().to_string(), + )) .await?; - Ok((dataset, df)) + Ok((resolved_dataset, df)) } #[tracing::instrument(level = "debug", skip_all)] @@ -235,15 +240,9 @@ impl QueryServiceImpl { &self, dataset_ref: &DatasetRef, ) -> Result, QueryError> { - let dataset_handle = self.dataset_repo.resolve_dataset_ref(dataset_ref).await?; + let resolved_dataset = self.resolve_dataset(dataset_ref).await?; - self.dataset_action_authorizer - .check_action_allowed(&dataset_handle, DatasetAction::Read) - .await?; - - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); - - let schema = dataset + let schema = resolved_dataset .as_metadata_chain() .accept_one(SearchSetDataSchemaVisitor::new()) .await @@ -262,16 +261,10 @@ impl QueryServiceImpl { session_context: 
&SessionContext, dataset_ref: &DatasetRef, ) -> Result, QueryError> { - let dataset_handle = self.dataset_repo.resolve_dataset_ref(dataset_ref).await?; - - self.dataset_action_authorizer - .check_action_allowed(&dataset_handle, DatasetAction::Read) - .await?; - - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); + let resolved_dataset = self.resolve_dataset(dataset_ref).await?; // TODO: Update to use SetDataSchema event - let maybe_last_data_slice_hash = dataset + let maybe_last_data_slice_hash = resolved_dataset .as_metadata_chain() .last_data_block_with_new_data() .await @@ -288,7 +281,7 @@ impl QueryServiceImpl { Some(last_data_slice_hash) => { // TODO: Avoid boxing url - requires datafusion to fix API let data_url = Box::new( - dataset + resolved_dataset .as_data_repo() .get_internal_url(&last_data_slice_hash) .await, @@ -308,6 +301,22 @@ impl QueryServiceImpl { None => Ok(None), } } + + async fn resolve_dataset( + &self, + dataset_ref: &DatasetRef, + ) -> Result { + let dataset_handle = self + .dataset_registry + .resolve_dataset_handle_by_ref(dataset_ref) + .await?; + + self.dataset_action_authorizer + .check_action_allowed(&dataset_handle, DatasetAction::Read) + .await?; + + Ok(self.dataset_registry.get_dataset_by_handle(&dataset_handle)) + } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -316,7 +325,7 @@ impl QueryServiceImpl { impl QueryService for QueryServiceImpl { #[tracing::instrument(level = "info", skip_all)] async fn create_session(&self) -> Result { - Ok(self.session_context(QueryOptions::default())) + Ok(self.session_context(QueryOptions::default()).await?) } #[tracing::instrument( @@ -331,7 +340,7 @@ impl QueryService for QueryServiceImpl { skip: u64, limit: u64, ) -> Result { - let (dataset, df) = self.single_dataset(dataset_ref, Some(skip + limit)).await?; + let (resolved_dataset, df) = self.single_dataset(dataset_ref, Some(skip + limit)).await?; // Our custom catalog provider resolves schemas lazily, so the dataset will be // found even if it's empty and its schema will be empty, but we decide not to @@ -342,7 +351,7 @@ impl QueryService for QueryServiceImpl { })?; } - let vocab: DatasetVocabulary = dataset + let vocab: DatasetVocabulary = resolved_dataset .as_metadata_chain() .accept_one(SearchSetVocabVisitor::new()) .await @@ -398,7 +407,7 @@ impl QueryService for QueryServiceImpl { }) .collect(), }; - let ctx = self.session_context(options); + let ctx = self.session_context(options).await?; let df = ctx.sql(statement).await?; Ok(QueryResponse { df, state }) @@ -417,7 +426,7 @@ impl QueryService for QueryServiceImpl { &self, dataset_ref: &DatasetRef, ) -> Result, QueryError> { - let ctx = self.session_context(QueryOptions::default()); + let ctx = self.session_context(QueryOptions::default()).await?; self.get_schema_parquet_impl(&ctx, dataset_ref).await } diff --git a/src/infra/core/src/remote_alias_resolver_impl.rs b/src/infra/core/src/remote_alias_resolver_impl.rs index beef055a13..9a8edb2c3d 100644 --- a/src/infra/core/src/remote_alias_resolver_impl.rs +++ b/src/infra/core/src/remote_alias_resolver_impl.rs @@ -14,7 +14,7 @@ use auth::OdfServerAccessTokenResolver; use dill::*; use internal_error::{InternalError, ResultIntoInternal}; use kamu_core::*; -use opendatafabric as odf; +use opendatafabric::{self as odf, DatasetHandle}; use url::Url; use crate::UrlExt; @@ -44,12 +44,12 @@ impl RemoteAliasResolverImpl { async fn fetch_remote_url( &self, - local_handle: 
&odf::DatasetHandle, + dataset_handle: &DatasetHandle, remote_alias_kind: RemoteAliasKind, ) -> Result, ResolveAliasError> { let remote_aliases = self .remote_alias_reg - .get_remote_aliases(&local_handle.as_local_ref()) + .get_remote_aliases(dataset_handle) .await .int_err()?; @@ -75,13 +75,11 @@ impl RemoteAliasResolverImpl { ) -> Result { let mut res_url = repo_url.clone().as_odf_protocol().int_err()?; - { - let mut path_segments = res_url.path_segments_mut().unwrap(); - if let Some(account_name) = account_name_maybe { - path_segments.push(account_name); - } - path_segments.push(dataset_name); + if let Some(account_name) = account_name_maybe { + res_url = res_url.join(format!("{account_name}/").as_str()).unwrap(); } + res_url = res_url.join(dataset_name).unwrap(); + Ok(res_url) } @@ -111,9 +109,10 @@ impl RemoteAliasResolverImpl { #[async_trait::async_trait] impl RemoteAliasResolver for RemoteAliasResolverImpl { + #[tracing::instrument(level = "debug", skip_all, fields(dataset_handle, ?dataset_push_target_maybe))] async fn resolve_push_target( &self, - local_dataset_handle: &odf::DatasetHandle, + dataset_handle: &DatasetHandle, dataset_push_target_maybe: Option, ) -> Result { let (repo_name, mut account_name, dataset_name) = if let Some(dataset_push_target) = @@ -134,7 +133,7 @@ impl RemoteAliasResolver for RemoteAliasResolverImpl { } } else { if let Some(remote_url) = self - .fetch_remote_url(local_dataset_handle, RemoteAliasKind::Push) + .fetch_remote_url(dataset_handle, RemoteAliasKind::Push) .await? { return Ok(RemoteTarget::new(remote_url, None, None, None)); @@ -172,7 +171,7 @@ impl RemoteAliasResolver for RemoteAliasResolverImpl { dn } else { self.resolve_remote_dataset_name( - local_dataset_handle, + dataset_handle, &transfer_url, access_token_maybe.as_ref(), ) diff --git a/src/infra/core/src/remote_aliases_registry_impl.rs b/src/infra/core/src/remote_aliases_registry_impl.rs index 3b712ac681..5d5b52ed7d 100644 --- a/src/infra/core/src/remote_aliases_registry_impl.rs +++ b/src/infra/core/src/remote_aliases_registry_impl.rs @@ -14,26 +14,22 @@ use internal_error::{ErrorIntoInternal, InternalError, ResultIntoInternal}; use kamu_core::*; use opendatafabric::serde::yaml::Manifest; use opendatafabric::*; +use thiserror::Error; use super::*; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[derive(Clone)] +#[component(pub)] +#[interface(dyn RemoteAliasesRegistry)] pub struct RemoteAliasesRegistryImpl { - dataset_repo: Arc, + dataset_registry: Arc, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[component(pub)] -#[interface(dyn RemoteAliasesRegistry)] impl RemoteAliasesRegistryImpl { - pub fn new(dataset_repo: Arc) -> Self { - Self { dataset_repo } - } - - async fn read_config(dataset: Arc) -> Result { + async fn read_config(dataset: &dyn Dataset) -> Result { match dataset.as_info_repo().get("config").await { Ok(bytes) => { let manifest: Manifest = @@ -48,7 +44,7 @@ impl RemoteAliasesRegistryImpl { } async fn write_config( - dataset: Arc, + dataset: &dyn Dataset, config: &DatasetConfig, ) -> Result<(), InternalError> { let manifest = Manifest { @@ -72,11 +68,11 @@ impl RemoteAliasesRegistryImpl { impl RemoteAliasesRegistry for RemoteAliasesRegistryImpl { async fn get_remote_aliases( &self, - dataset_ref: &DatasetRef, + dataset_handle: &DatasetHandle, ) -> Result, GetAliasesError> { - let dataset = 
self.dataset_repo.find_dataset_by_ref(dataset_ref).await?; - let config = Self::read_config(dataset.clone()).await?; - Ok(Box::new(RemoteAliasesImpl::new(dataset, config))) + let resolved_dataset = self.dataset_registry.get_dataset_by_handle(dataset_handle); + let config = Self::read_config(resolved_dataset.as_ref()).await?; + Ok(Box::new(RemoteAliasesImpl::new(resolved_dataset, config))) } } @@ -85,13 +81,16 @@ impl RemoteAliasesRegistry for RemoteAliasesRegistryImpl { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct RemoteAliasesImpl { - dataset: Arc, + resolved_dataset: ResolvedDataset, config: DatasetConfig, } impl RemoteAliasesImpl { - fn new(dataset: Arc, config: DatasetConfig) -> Self { - Self { dataset, config } + fn new(resolved_dataset: ResolvedDataset, config: DatasetConfig) -> Self { + Self { + resolved_dataset, + config, + } } } @@ -142,7 +141,8 @@ impl RemoteAliases for RemoteAliasesImpl { let remote_ref = remote_ref.to_owned(); if !aliases.contains(&remote_ref) { aliases.push(remote_ref); - RemoteAliasesRegistryImpl::write_config(self.dataset.clone(), &self.config).await?; + RemoteAliasesRegistryImpl::write_config(self.resolved_dataset.as_ref(), &self.config) + .await?; Ok(true) } else { Ok(false) @@ -161,7 +161,8 @@ impl RemoteAliases for RemoteAliasesImpl { if let Some(i) = aliases.iter().position(|r| *r == *remote_ref) { aliases.remove(i); - RemoteAliasesRegistryImpl::write_config(self.dataset.clone(), &self.config).await?; + RemoteAliasesRegistryImpl::write_config(self.resolved_dataset.as_ref(), &self.config) + .await?; Ok(true) } else { Ok(false) @@ -176,7 +177,8 @@ impl RemoteAliases for RemoteAliasesImpl { let len = aliases.len(); if !aliases.is_empty() { aliases.clear(); - RemoteAliasesRegistryImpl::write_config(self.dataset.clone(), &self.config).await?; + RemoteAliasesRegistryImpl::write_config(self.resolved_dataset.as_ref(), &self.config) + .await?; } Ok(len) } @@ -192,12 +194,14 @@ pub struct RemoteAliasesRegistryNull; impl RemoteAliasesRegistry for RemoteAliasesRegistryNull { async fn get_remote_aliases( &self, - dataset_ref: &DatasetRef, + _dataset_handle: &DatasetHandle, ) -> Result, GetAliasesError> { - Err(DatasetNotFoundError { - dataset_ref: dataset_ref.clone(), - } - .into()) + #[derive(Error, Debug)] + #[error("get_remote_aliases requested from stub implementation")] + struct NullError {} + + let e = NullError {}; + Err(GetAliasesError::Internal(e.int_err())) } } diff --git a/src/infra/core/src/repos/dataset_factory_impl.rs b/src/infra/core/src/repos/dataset_factory_impl.rs index a089d1c56f..798d4b1791 100644 --- a/src/infra/core/src/repos/dataset_factory_impl.rs +++ b/src/infra/core/src/repos/dataset_factory_impl.rs @@ -65,6 +65,7 @@ impl DatasetFactoryImpl { ObjectRepositoryLocalFS::new(layout.data_dir), ObjectRepositoryLocalFS::new(layout.checkpoints_dir), NamedObjectRepositoryLocalFS::new(layout.info_dir), + Url::from_directory_path(&layout.root_dir).unwrap(), ) } @@ -101,6 +102,7 @@ impl DatasetFactoryImpl { base_url.join("info/").unwrap(), header_map, ), + base_url.clone(), ) } @@ -114,10 +116,6 @@ impl DatasetFactoryImpl { // TODO: We should ensure optimal credential reuse. Perhaps in future we should // create a cache of S3Contexts keyed by an endpoint. 
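// Editor's note: illustrative sketch, not part of the patch. The factory changes here
// pass each constructed dataset its storage-internal URL; this stand-in shows how such
// a URL can be derived for two of the storage flavors (a local directory and an HTTP
// base), using the `url` crate that the code already depends on. Paths, hosts, and
// dataset names are hypothetical.

use std::path::Path;
use url::Url;

/// Local FS: a dataset layout rooted at a directory maps to a `file://` URL.
fn local_storage_url(root_dir: &Path) -> Option<Url> {
    // `from_directory_path` fails for non-absolute paths, hence the Option.
    Url::from_directory_path(root_dir).ok()
}

/// HTTP: the dataset's base URL is the storage URL; sub-repositories are joined onto
/// it ("data/", "checkpoints/", "info/", ...). The base must end with '/' so that
/// `join` appends a segment rather than replacing the last one.
fn http_info_repo_url(base_url: &Url) -> Url {
    base_url.join("info/").expect("valid URL join")
}

fn main() {
    if let Some(url) = local_storage_url(Path::new("/var/lib/kamu/datasets/covid.cases")) {
        println!("local storage url: {url}");
    }
    let base = Url::parse("https://node.example.com/covid.cases/").unwrap();
    println!("info repo url: {}", http_info_repo_url(&base));
}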
let s3_context = S3Context::from_url(&base_url).await; - Self::get_s3_from_context(s3_context) - } - - pub fn get_s3_from_context(s3_context: S3Context) -> Result { Ok(DatasetImpl::new( MetadataChainImpl::new( MetadataBlockRepositoryCachingInMem::new(MetadataBlockRepositoryImpl::new( @@ -130,6 +128,7 @@ impl DatasetFactoryImpl { ObjectRepositoryS3Sha3::new(s3_context.sub_context("data/")), ObjectRepositoryS3Sha3::new(s3_context.sub_context("checkpoints/")), NamedObjectRepositoryS3::new(s3_context.into_sub_context("info/")), + base_url, )) } @@ -212,6 +211,7 @@ impl DatasetFactoryImpl { Default::default(), ), NamedObjectRepositoryIpfsHttp::new(client.clone(), dataset_url.join("info/").unwrap()), + dataset_url.clone(), )) } diff --git a/src/infra/core/src/repos/dataset_impl.rs b/src/infra/core/src/repos/dataset_impl.rs index f75d7cc85f..f6d07f1ad3 100644 --- a/src/infra/core/src/repos/dataset_impl.rs +++ b/src/infra/core/src/repos/dataset_impl.rs @@ -13,6 +13,7 @@ use internal_error::{ErrorIntoInternal, InternalError, ResultIntoInternal}; use kamu_core::*; use opendatafabric::serde::yaml::Manifest; use opendatafabric::*; +use url::Url; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -21,6 +22,7 @@ pub struct DatasetImpl { data_repo: DataRepo, checkpoint_repo: CheckpointRepo, info_repo: InfoRepo, + storage_internal_url: Url, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -38,12 +40,14 @@ where data_repo: DataRepo, checkpoint_repo: CheckpointRepo, info_repo: InfoRepo, + storage_internal_url: Url, ) -> Self { Self { metadata_chain, data_repo, checkpoint_repo, info_repo, + storage_internal_url, } } @@ -585,6 +589,10 @@ where summary.ok_or_else(|| GetSummaryError::EmptyDataset) } + fn get_storage_internal_url(&self) -> &Url { + &self.storage_internal_url + } + fn as_metadata_chain(&self) -> &dyn MetadataChain { &self.metadata_chain } diff --git a/src/infra/core/src/repos/dataset_repository_helpers.rs b/src/infra/core/src/repos/dataset_repository_helpers.rs index 80fd38d089..6f2ec8c624 100644 --- a/src/infra/core/src/repos/dataset_repository_helpers.rs +++ b/src/infra/core/src/repos/dataset_repository_helpers.rs @@ -25,7 +25,7 @@ pub fn get_staging_name() -> String { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub(crate) async fn create_dataset_from_snapshot_impl< - TRepository: DatasetRepositoryExt + DatasetRepositoryWriter, + TRepository: DatasetRepository + DatasetRepositoryWriter, >( dataset_repo: &TRepository, mut snapshot: DatasetSnapshot, @@ -237,7 +237,7 @@ async fn resolve_transform_inputs( let mut missing_inputs = Vec::new(); for input in inputs.iter_mut() { - let hdl = match repo.resolve_dataset_ref(&input.dataset_ref).await { + let hdl = match repo.resolve_dataset_handle_by_ref(&input.dataset_ref).await { Ok(hdl) => Ok(hdl), Err(GetDatasetError::NotFound(_)) => { // Accumulate errors to report as one diff --git a/src/infra/core/src/repos/dataset_repository_local_fs.rs b/src/infra/core/src/repos/dataset_repository_local_fs.rs index 1d2083d849..d59a249d9b 100644 --- a/src/infra/core/src/repos/dataset_repository_local_fs.rs +++ b/src/infra/core/src/repos/dataset_repository_local_fs.rs @@ -33,20 +33,22 @@ pub struct DatasetRepositoryLocalFs { #[component(pub)] impl DatasetRepositoryLocalFs { + #[allow(clippy::needless_pass_by_value)] pub fn new( root: PathBuf, 
current_account_subject: Arc, - multi_tenant: bool, + tenancy_config: Arc, system_time_source: Arc, ) -> Self { Self { - storage_strategy: if multi_tenant { - Box::new(DatasetMultiTenantStorageStrategy::new( + storage_strategy: match *tenancy_config { + TenancyConfig::MultiTenant => Box::new(DatasetMultiTenantStorageStrategy::new( root, current_account_subject, - )) - } else { - Box::new(DatasetSingleTenantStorageStrategy::new(root)) + )), + TenancyConfig::SingleTenant => { + Box::new(DatasetSingleTenantStorageStrategy::new(root)) + } }, thrash_lock: tokio::sync::Mutex::new(()), system_time_source, @@ -64,6 +66,7 @@ impl DatasetRepositoryLocalFs { ObjectRepositoryLocalFSSha3::new(layout.data_dir), ObjectRepositoryLocalFSSha3::new(layout.checkpoints_dir), NamedObjectRepositoryLocalFS::new(layout.info_dir), + Url::from_directory_path(&layout.root_dir).unwrap(), )) } @@ -73,7 +76,7 @@ impl DatasetRepositoryLocalFs { &self, dataset_ref: &DatasetRef, ) -> Result { - let dataset_handle = self.resolve_dataset_ref(dataset_ref).await?; + let dataset_handle = self.resolve_dataset_handle_by_ref(dataset_ref).await?; Ok(DatasetLayout::new( self.storage_strategy.get_dataset_path(&dataset_handle), )) @@ -94,23 +97,8 @@ impl DatasetRepositoryLocalFs { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[async_trait] -impl DatasetRegistry for DatasetRepositoryLocalFs { - async fn get_dataset_url(&self, dataset_ref: &DatasetRef) -> Result { - let dataset_handle = self.resolve_dataset_ref(dataset_ref).await?; - let dataset_path = self.storage_strategy.get_dataset_path(&dataset_handle); - Ok(Url::from_directory_path(dataset_path).unwrap()) - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - #[async_trait] impl DatasetRepository for DatasetRepositoryLocalFs { - fn is_multi_tenant(&self) -> bool { - self.storage_strategy.is_multi_tenant() - } - // TODO: PERF: Cache data and speed up lookups by ID // // TODO: CONCURRENCY: Since resolving ID to Name currently requires accessing @@ -120,7 +108,7 @@ impl DatasetRepository for DatasetRepositoryLocalFs { // // Note that this lock does not prevent concurrent updates to summaries, only // reduces the chances of it. 
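// Editor's note: illustrative sketch, not part of the patch. It mirrors the
// "enum instead of boolean flag" change above with simplified stand-in types:
// matching on a `TenancyConfig` makes every call site spell out which mode it is
// handling, instead of passing an easily transposed `true`/`false`. The storage
// strategies and paths below are hypothetical.

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TenancyConfig {
    SingleTenant,
    MultiTenant,
}

trait StorageStrategy {
    fn dataset_path(&self, account: Option<&str>, dataset: &str) -> String;
}

struct SingleTenantStorage;
struct MultiTenantStorage;

impl StorageStrategy for SingleTenantStorage {
    fn dataset_path(&self, _account: Option<&str>, dataset: &str) -> String {
        format!("/datasets/{dataset}")
    }
}

impl StorageStrategy for MultiTenantStorage {
    fn dataset_path(&self, account: Option<&str>, dataset: &str) -> String {
        format!("/datasets/{}/{dataset}", account.unwrap_or("anonymous"))
    }
}

/// Chooses a storage layout from the tenancy mode, as the constructor above does.
fn storage_strategy(tenancy: TenancyConfig) -> Box<dyn StorageStrategy> {
    match tenancy {
        TenancyConfig::SingleTenant => Box::new(SingleTenantStorage),
        TenancyConfig::MultiTenant => Box::new(MultiTenantStorage),
    }
}

fn main() {
    let strategy = storage_strategy(TenancyConfig::MultiTenant);
    println!("{}", strategy.dataset_path(Some("alice"), "covid.cases"));
}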
- async fn resolve_dataset_ref( + async fn resolve_dataset_handle_by_ref( &self, dataset_ref: &DatasetRef, ) -> Result { @@ -149,29 +137,22 @@ impl DatasetRepository for DatasetRepositoryLocalFs { } // TODO: PERF: Resolving handles currently involves reading summary files - fn get_all_datasets(&self) -> DatasetHandleStream<'_> { + fn all_dataset_handles(&self) -> DatasetHandleStream<'_> { self.storage_strategy.get_all_datasets() } - fn get_datasets_by_owner(&self, account_name: &AccountName) -> DatasetHandleStream<'_> { + fn all_dataset_handles_by_owner(&self, account_name: &AccountName) -> DatasetHandleStream<'_> { self.storage_strategy.get_datasets_by_owner(account_name) } - async fn find_dataset_by_ref( - &self, - dataset_ref: &DatasetRef, - ) -> Result, GetDatasetError> { - let dataset_handle = self.resolve_dataset_ref(dataset_ref).await?; - let dataset = self.get_dataset_by_handle(&dataset_handle); - Ok(dataset) - } - fn get_dataset_by_handle(&self, dataset_handle: &DatasetHandle) -> Arc { let layout = DatasetLayout::new(self.storage_strategy.get_dataset_path(dataset_handle)); Self::build_dataset(layout) } } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[async_trait] impl DatasetRepositoryWriter for DatasetRepositoryLocalFs { async fn create_dataset( @@ -181,14 +162,14 @@ impl DatasetRepositoryWriter for DatasetRepositoryLocalFs { ) -> Result { // Check if a dataset with the same alias can be resolved successfully let maybe_existing_dataset_handle = match self - .resolve_dataset_ref(&dataset_alias.as_local_ref()) + .resolve_dataset_handle_by_ref(&dataset_alias.as_local_ref()) .await { Ok(existing_handle) => Ok(Some(existing_handle)), // ToDo temporary fix, remove it on favor of // https://github.com/kamu-data/kamu-cli/issues/342 Err(GetDatasetError::NotFound(_)) => match self - .resolve_dataset_ref(&(seed_block.event.dataset_id.clone().into())) + .resolve_dataset_handle_by_ref(&(seed_block.event.dataset_id.clone().into())) .await { Ok(existing_handle) => Ok(Some(existing_handle)), @@ -355,8 +336,6 @@ impl DatasetRepositoryWriter for DatasetRepositoryLocalFs { #[async_trait] trait DatasetStorageStrategy: Sync + Send { - fn is_multi_tenant(&self) -> bool; - fn get_dataset_path(&self, dataset_handle: &DatasetHandle) -> PathBuf; fn get_all_datasets(&self) -> DatasetHandleStream<'_>; @@ -476,10 +455,6 @@ impl DatasetSingleTenantStorageStrategy { #[async_trait] impl DatasetStorageStrategy for DatasetSingleTenantStorageStrategy { - fn is_multi_tenant(&self) -> bool { - false - } - fn get_dataset_path(&self, dataset_handle: &DatasetHandle) -> PathBuf { self.dataset_path_impl(&dataset_handle.alias) } @@ -760,10 +735,6 @@ impl DatasetMultiTenantStorageStrategy { #[async_trait] impl DatasetStorageStrategy for DatasetMultiTenantStorageStrategy { - fn is_multi_tenant(&self) -> bool { - true - } - fn get_dataset_path(&self, dataset_handle: &DatasetHandle) -> PathBuf { let account_name = self.effective_account_name(&dataset_handle.alias); diff --git a/src/infra/core/src/repos/dataset_repository_s3.rs b/src/infra/core/src/repos/dataset_repository_s3.rs index 1c5afd7a1d..4ef8a821e6 100644 --- a/src/infra/core/src/repos/dataset_repository_s3.rs +++ b/src/infra/core/src/repos/dataset_repository_s3.rs @@ -19,7 +19,6 @@ use kamu_core::*; use opendatafabric::*; use time_source::SystemTimeSource; use tokio::sync::Mutex; -use url::Url; use crate::utils::s3_context::S3Context; use crate::*; @@ -29,7 +28,7 @@ use crate::*; pub 
struct DatasetRepositoryS3 { s3_context: S3Context, current_account_subject: Arc, - multi_tenant: bool, + tenancy_config: Arc, registry_cache: Option>, metadata_cache_local_fs_path: Option>, system_time_source: Arc, @@ -51,7 +50,7 @@ impl DatasetRepositoryS3 { pub fn new( s3_context: S3Context, current_account_subject: Arc, - multi_tenant: bool, + tenancy_config: Arc, registry_cache: Option>, metadata_cache_local_fs_path: Option>, system_time_source: Arc, @@ -59,7 +58,7 @@ impl DatasetRepositoryS3 { Self { s3_context, current_account_subject, - multi_tenant, + tenancy_config, registry_cache, metadata_cache_local_fs_path, system_time_source, @@ -71,6 +70,8 @@ impl DatasetRepositoryS3 { .s3_context .sub_context(&format!("{}/", &dataset_id.as_multibase())); + let s3_context_url = s3_context.make_url(); + // TODO: Consider switching DatasetImpl to dynamic dispatch to simplify // configurability if let Some(metadata_cache_local_fs_path) = &self.metadata_cache_local_fs_path { @@ -89,6 +90,7 @@ impl DatasetRepositoryS3 { ObjectRepositoryS3Sha3::new(s3_context.sub_context("data/")), ObjectRepositoryS3Sha3::new(s3_context.sub_context("checkpoints/")), NamedObjectRepositoryS3::new(s3_context.into_sub_context("info/")), + s3_context_url, )) } else { Arc::new(DatasetImpl::new( @@ -103,6 +105,7 @@ impl DatasetRepositoryS3 { ObjectRepositoryS3Sha3::new(s3_context.sub_context("data/")), ObjectRepositoryS3Sha3::new(s3_context.sub_context("checkpoints/")), NamedObjectRepositoryS3::new(s3_context.into_sub_context("info/")), + s3_context_url, )) } } @@ -207,7 +210,7 @@ impl DatasetRepositoryS3 { fn normalize_alias(&self, alias: &DatasetAlias) -> DatasetAlias { if alias.is_multi_tenant() { alias.clone() - } else if self.is_multi_tenant() { + } else if *self.tenancy_config == TenancyConfig::MultiTenant { match self.current_account_subject.as_ref() { CurrentAccountSubject::Anonymous(_) => { panic!("Anonymous account misused, use multi-tenant alias"); @@ -224,22 +227,9 @@ impl DatasetRepositoryS3 { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[async_trait] -impl DatasetRegistry for DatasetRepositoryS3 { - async fn get_dataset_url(&self, _dataset_ref: &DatasetRef) -> Result { - unimplemented!("get_dataset_url not supported by S3 repository") - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - #[async_trait] impl DatasetRepository for DatasetRepositoryS3 { - fn is_multi_tenant(&self) -> bool { - self.multi_tenant - } - - async fn resolve_dataset_ref( + async fn resolve_dataset_handle_by_ref( &self, dataset_ref: &DatasetRef, ) -> Result { @@ -249,7 +239,7 @@ impl DatasetRepository for DatasetRepositoryS3 { // TODO: this is really really slow and expensive! 
let normalized_alias = self.normalize_alias(alias); use futures::StreamExt; - let mut datasets = self.get_all_datasets(); + let mut datasets = self.all_dataset_handles(); while let Some(hdl) = datasets.next().await { let hdl = hdl?; if hdl.alias == normalized_alias { @@ -281,12 +271,14 @@ impl DatasetRepository for DatasetRepositoryS3 { } } - fn get_all_datasets(&self) -> DatasetHandleStream<'_> { + fn all_dataset_handles(&self) -> DatasetHandleStream<'_> { self.stream_datasets_if(|_| true) } - fn get_datasets_by_owner(&self, account_name: &AccountName) -> DatasetHandleStream<'_> { - if !self.is_multi_tenant() && *account_name != DEFAULT_ACCOUNT_NAME_STR { + fn all_dataset_handles_by_owner(&self, account_name: &AccountName) -> DatasetHandleStream<'_> { + if *self.tenancy_config == TenancyConfig::SingleTenant + && *account_name != DEFAULT_ACCOUNT_NAME_STR + { return Box::pin(futures::stream::empty()); } @@ -300,15 +292,6 @@ impl DatasetRepository for DatasetRepositoryS3 { }) } - async fn find_dataset_by_ref( - &self, - dataset_ref: &DatasetRef, - ) -> Result, GetDatasetError> { - let dataset_handle = self.resolve_dataset_ref(dataset_ref).await?; - let dataset = self.get_dataset_impl(&dataset_handle.id); - Ok(dataset) - } - fn get_dataset_by_handle(&self, dataset_handle: &DatasetHandle) -> Arc { self.get_dataset_impl(&dataset_handle.id) } @@ -328,7 +311,7 @@ impl DatasetRepositoryWriter for DatasetRepositoryS3 { // Check if a dataset with the same alias can be resolved successfully let maybe_existing_dataset_handle = match self - .resolve_dataset_ref(&dataset_alias.as_local_ref()) + .resolve_dataset_handle_by_ref(&dataset_alias.as_local_ref()) .await { Ok(existing_handle) => Ok(Some(existing_handle)), @@ -446,7 +429,10 @@ impl DatasetRepositoryWriter for DatasetRepositoryS3 { DatasetAlias::new(dataset_handle.alias.account_name.clone(), new_name.clone()); // Note: should collision check be moved to use case level? - match self.resolve_dataset_ref(&new_alias.as_local_ref()).await { + match self + .resolve_dataset_handle_by_ref(&new_alias.as_local_ref()) + .await + { Ok(_) => Err(RenameDatasetError::NameCollision(NameCollisionError { alias: DatasetAlias::new( dataset_handle.alias.account_name.clone(), diff --git a/src/infra/core/src/reset_service_impl.rs b/src/infra/core/src/reset_service_impl.rs index bf7bf6211c..a4cf3b06da 100644 --- a/src/infra/core/src/reset_service_impl.rs +++ b/src/infra/core/src/reset_service_impl.rs @@ -7,8 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
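
The repository implementations above now branch on the TenancyConfig enum (SingleTenant / MultiTenant) where they previously tested a multi_tenant boolean. As a rough, self-contained sketch of that pattern, assuming placeholder strategy types that are not part of this patch:

// Illustrative sketch: mirrors the `match *tenancy_config` style used above.
// The strategy types here are placeholders, not kamu types.
#[derive(Debug, Clone, Copy)]
enum TenancyConfig {
    SingleTenant,
    MultiTenant,
}

trait StorageStrategy {
    fn describe(&self) -> String;
}

struct SingleTenantStrategy;
struct MultiTenantStrategy;

impl StorageStrategy for SingleTenantStrategy {
    fn describe(&self) -> String {
        "datasets live directly under the repository root".to_string()
    }
}

impl StorageStrategy for MultiTenantStrategy {
    fn describe(&self) -> String {
        "datasets live under per-account subdirectories".to_string()
    }
}

fn make_strategy(tenancy: TenancyConfig) -> Box<dyn StorageStrategy> {
    // Exhaustive match replaces `if multi_tenant { .. } else { .. }`.
    match tenancy {
        TenancyConfig::MultiTenant => Box::new(MultiTenantStrategy),
        TenancyConfig::SingleTenant => Box::new(SingleTenantStrategy),
    }
}

fn main() {
    for tenancy in [TenancyConfig::SingleTenant, TenancyConfig::MultiTenant] {
        println!("{:?}: {}", tenancy, make_strategy(tenancy).describe());
    }
}

An exhaustive match makes a newly added tenancy mode a compile-time error until every branch handles it, which a boolean flag cannot do.
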
-use std::sync::Arc; - use dill::*; use internal_error::ResultIntoInternal; use kamu_core::*; @@ -16,43 +14,23 @@ use opendatafabric::*; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -pub struct ResetServiceImpl { - dataset_repo: Arc, - dataset_action_authorizer: Arc, -} - #[component(pub)] #[interface(dyn ResetService)] -impl ResetServiceImpl { - pub fn new( - dataset_repo: Arc, - dataset_action_authorizer: Arc, - ) -> Self { - Self { - dataset_repo, - dataset_action_authorizer, - } - } -} +pub struct ResetServiceImpl {} #[async_trait::async_trait] impl ResetService for ResetServiceImpl { + #[tracing::instrument(level = "info", skip_all, fields(new_head = ?new_head_maybe, old_head = ?old_head_maybe))] async fn reset_dataset( &self, - dataset_handle: &DatasetHandle, + target: ResolvedDataset, new_head_maybe: Option<&Multihash>, old_head_maybe: Option<&Multihash>, ) -> Result { - self.dataset_action_authorizer - .check_action_allowed(dataset_handle, auth::DatasetAction::Write) - .await?; - - let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); - let new_head = if let Some(new_head) = new_head_maybe { new_head } else { - &dataset + &target .as_metadata_chain() .accept_one(SearchSeedVisitor::new()) .await @@ -62,7 +40,7 @@ impl ResetService for ResetServiceImpl { .0 }; if let Some(old_head) = old_head_maybe - && let Some(current_head) = dataset + && let Some(current_head) = target .as_metadata_chain() .try_get_ref(&BlockRef::Head) .await? @@ -74,7 +52,7 @@ impl ResetService for ResetServiceImpl { })); } - dataset + target .as_metadata_chain() .set_ref( &BlockRef::Head, diff --git a/src/infra/core/src/sync_request_builder.rs b/src/infra/core/src/sync_request_builder.rs new file mode 100644 index 0000000000..9f16923d10 --- /dev/null +++ b/src/infra/core/src/sync_request_builder.rs @@ -0,0 +1,193 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
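
With this change the reset service receives an already-resolved dataset and no longer performs its own lookup or permission check; resolution and authorization move to the calling layer. A minimal sketch of that split, assuming simplified stand-in types rather than the actual kamu-core ResolvedDataset and ResetService signatures:

// Illustrative sketch: the caller resolves and authorizes, the service only
// executes against the resolved target. All types are placeholders and not
// the actual kamu-core definitions.
#[derive(Debug, Clone)]
struct ResolvedDataset {
    alias: String,
    head: String,
}

#[derive(Debug)]
struct UnauthorizedError {
    alias: String,
}

fn resolve_and_authorize(alias: &str, can_write: bool) -> Result<ResolvedDataset, UnauthorizedError> {
    // In the patch, this is the layer where the registry lookup and the
    // write-permission check now happen, before the service is invoked.
    if !can_write {
        return Err(UnauthorizedError { alias: alias.to_string() });
    }
    Ok(ResolvedDataset { alias: alias.to_string(), head: "old-head".to_string() })
}

fn reset_dataset(mut target: ResolvedDataset, new_head: &str) -> ResolvedDataset {
    // The execution side works only with the resolved target; it keeps no
    // repository or authorizer dependencies of its own.
    target.head = new_head.to_string();
    target
}

fn main() {
    match resolve_and_authorize("org/example-dataset", true) {
        Ok(target) => {
            let target = reset_dataset(target, "new-head");
            println!("{} reset to {}", target.alias, target.head);
        }
        Err(err) => println!("not allowed to reset {}", err.alias),
    }
}
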
+ +use std::sync::Arc; + +use dill::component; +use internal_error::ErrorIntoInternal; +use kamu_core::services::DatasetNotFoundError; +use kamu_core::{ + BlockRef, + Dataset, + DatasetFactory, + DatasetRegistry, + DatasetRegistryExt, + GetDatasetError, + GetRefError, + RemoteRepositoryRegistry, + SyncError, + SyncRef, + SyncRefRemote, + SyncRequest, +}; +use opendatafabric::{DatasetHandleRemote, DatasetRefAny, DatasetRefRemote}; +use url::Url; + +use crate::UrlExt; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +pub struct SyncRequestBuilder { + dataset_registry: Arc, + dataset_factory: Arc, + remote_repo_registry: Arc, +} + +impl SyncRequestBuilder { + pub fn new( + dataset_registry: Arc, + dataset_factory: Arc, + remote_repo_registry: Arc, + ) -> Self { + Self { + dataset_registry, + dataset_factory, + remote_repo_registry, + } + } + + #[tracing::instrument(level = "debug", skip_all, fields(?src_ref, ?dst_ref, %create_dst_if_not_exists))] + pub async fn build_sync_request( + &self, + src_ref: DatasetRefAny, + dst_ref: DatasetRefAny, + create_dst_if_not_exists: bool, + ) -> Result { + let src_sync_ref = self.resolve_source_sync_ref(&src_ref).await?; + + let dst_sync_ref = self + .resolve_dest_sync_ref(&dst_ref, create_dst_if_not_exists) + .await?; + + let sync_request = SyncRequest { + src: src_sync_ref, + dst: dst_sync_ref, + }; + + Ok(sync_request) + } + + async fn resolve_source_sync_ref(&self, any_ref: &DatasetRefAny) -> Result { + match any_ref.as_local_ref(|repo| self.remote_repo_registry.get_repository(repo).is_ok()) { + Ok(local_ref) => { + let resolved_dataset = self.dataset_registry.get_dataset_by_ref(&local_ref).await?; + self.ensure_dataset_head_present(local_ref.as_any_ref(), resolved_dataset.as_ref()) + .await?; + Ok(SyncRef::Local(resolved_dataset)) + } + Err(remote_ref) => { + let remote_dataset_url = Arc::new(resolve_remote_dataset_url( + self.remote_repo_registry.as_ref(), + &remote_ref, + )?); + let dataset = self + .dataset_factory + .get_dataset(remote_dataset_url.as_ref(), false) + .await?; + self.ensure_dataset_head_present( + DatasetRefAny::Url(remote_dataset_url.clone()), + dataset.as_ref(), + ) + .await?; + Ok(SyncRef::Remote(SyncRefRemote { + url: remote_dataset_url, + dataset, + original_remote_ref: remote_ref, + })) + } + } + } + + async fn resolve_dest_sync_ref( + &self, + any_ref: &DatasetRefAny, + create_if_not_exists: bool, + ) -> Result { + match any_ref.as_local_ref(|repo| self.remote_repo_registry.get_repository(repo).is_ok()) { + Ok(local_ref) => match self.dataset_registry.get_dataset_by_ref(&local_ref).await { + Ok(resolved_dataset) => Ok(SyncRef::Local(resolved_dataset)), + Err(GetDatasetError::NotFound(_)) if create_if_not_exists => { + if let Some(alias) = local_ref.alias() { + Ok(SyncRef::LocalNew(alias.clone())) + } else { + Err(DatasetNotFoundError::new(local_ref.as_any_ref()).into()) + } + } + Err(err) => Err(err.into()), + }, + Err(remote_ref) => { + let remote_dataset_url = Arc::new(resolve_remote_dataset_url( + self.remote_repo_registry.as_ref(), + &remote_ref, + )?); + let dataset = self + .dataset_factory + .get_dataset(remote_dataset_url.as_ref(), create_if_not_exists) + .await?; + + if !create_if_not_exists { + self.ensure_dataset_head_present(remote_ref.as_any_ref(), dataset.as_ref()) + .await?; + } + + Ok(SyncRef::Remote(SyncRefRemote { + url: remote_dataset_url, + dataset, + original_remote_ref: remote_ref, + })) + } + } + } + + async 
fn ensure_dataset_head_present( + &self, + dataset_ref: DatasetRefAny, + dataset: &dyn Dataset, + ) -> Result<(), SyncError> { + match dataset + .as_metadata_chain() + .resolve_ref(&BlockRef::Head) + .await + { + Ok(_) => Ok(()), + Err(GetRefError::NotFound(_)) => Err(DatasetNotFoundError { dataset_ref }.into()), + Err(GetRefError::Access(e)) => Err(SyncError::Access(e)), + Err(GetRefError::Internal(e)) => Err(SyncError::Internal(e)), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub(crate) fn resolve_remote_dataset_url( + remote_repo_registry: &dyn RemoteRepositoryRegistry, + remote_ref: &DatasetRefRemote, +) -> Result { + // TODO: REMOTE ID + match remote_ref { + DatasetRefRemote::ID(_, _) => Err(SyncError::Internal( + "Syncing remote dataset by ID is not yet supported".int_err(), + )), + DatasetRefRemote::Alias(alias) + | DatasetRefRemote::Handle(DatasetHandleRemote { alias, .. }) => { + let mut repo = remote_repo_registry.get_repository(&alias.repo_name)?; + + repo.url.ensure_trailing_slash(); + Ok(repo.url.join(&format!("{}/", alias.local_alias())).unwrap()) + } + DatasetRefRemote::Url(url) => { + let mut dataset_url = url.as_ref().clone(); + dataset_url.ensure_trailing_slash(); + Ok(dataset_url) + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/sync_service_impl.rs b/src/infra/core/src/sync_service_impl.rs index 56d3334ca8..363c2063b6 100644 --- a/src/infra/core/src/sync_service_impl.rs +++ b/src/infra/core/src/sync_service_impl.rs @@ -19,20 +19,21 @@ use opendatafabric::*; use url::Url; use super::utils::smart_transfer_protocol::SmartTransferProtocolClient; +use crate::resolve_remote_dataset_url; use crate::utils::ipfs_wrapper::*; -use crate::utils::simple_transfer_protocol::{DatasetFactoryFn, SimpleTransferProtocol}; -use crate::utils::smart_transfer_protocol::TransferOptions; -use crate::DatasetRepositoryWriter; +use crate::utils::simple_transfer_protocol::{ + SimpleProtocolTransferOptions, + SimpleTransferProtocol, +}; +use crate::utils::smart_transfer_protocol::TransferOptions as SmartTransferOptions; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct SyncServiceImpl { remote_repo_reg: Arc, - dataset_repo: Arc, - dataset_repo_writer: Arc, - dataset_action_authorizer: Arc, dataset_factory: Arc, smart_transfer_protocol: Arc, + simple_transfer_protocol: Arc, ipfs_client: Arc, } @@ -43,170 +44,30 @@ pub struct SyncServiceImpl { impl SyncServiceImpl { pub fn new( remote_repo_reg: Arc, - dataset_repo: Arc, - dataset_repo_writer: Arc, - dataset_action_authorizer: Arc, dataset_factory: Arc, smart_transfer_protocol: Arc, + simple_transfer_protocol: Arc, ipfs_client: Arc, ) -> Self { Self { remote_repo_reg, - dataset_repo, - dataset_repo_writer, - dataset_action_authorizer, dataset_factory, smart_transfer_protocol, + simple_transfer_protocol, ipfs_client, } } - fn resolve_sync_ref(&self, any_ref: &DatasetRefAny) -> Result { - match any_ref.as_local_ref(|repo| self.remote_repo_reg.get_repository(repo).is_ok()) { - Ok(local_ref) => Ok(SyncRef::Local(local_ref)), - Err(remote_ref) => Ok(SyncRef::Remote(Arc::new( - self.resolve_remote_dataset_url(&remote_ref)?, - ))), - } - } - - fn resolve_remote_dataset_url(&self, remote_ref: &DatasetRefRemote) -> Result { - // TODO: REMOTE ID - match remote_ref { - 
DatasetRefRemote::ID(_, _) => Err(SyncError::Internal( - "Syncing remote dataset by ID is not yet supported".int_err(), - )), - DatasetRefRemote::Alias(alias) - | DatasetRefRemote::Handle(DatasetHandleRemote { alias, .. }) => { - let mut repo = self.remote_repo_reg.get_repository(&alias.repo_name)?; - - repo.url.ensure_trailing_slash(); - Ok(repo.url.join(&format!("{}/", alias.local_alias())).unwrap()) - } - DatasetRefRemote::Url(url) => { - let mut dataset_url = url.as_ref().clone(); - dataset_url.ensure_trailing_slash(); - Ok(dataset_url) - } - } - } - - async fn get_dataset_reader( - &self, - dataset_ref: &SyncRef, - ) -> Result, SyncError> { - let dataset = match dataset_ref { - SyncRef::Local(local_ref) => { - let dataset_handle = self.dataset_repo.resolve_dataset_ref(local_ref).await?; - self.dataset_action_authorizer - .check_action_allowed(&dataset_handle, auth::DatasetAction::Read) - .await?; - - self.dataset_repo.find_dataset_by_ref(local_ref).await? - } - SyncRef::Remote(url) => { - // TODO: implement authorization checks somehow - self.dataset_factory - .get_dataset(url.as_ref(), false) - .await? - } - }; - - match dataset - .as_metadata_chain() - .resolve_ref(&BlockRef::Head) - .await - { - Ok(_) => Ok(dataset), - Err(GetRefError::NotFound(_)) => Err(DatasetNotFoundError { - dataset_ref: dataset_ref.as_any_ref(), - } - .into()), - Err(GetRefError::Access(e)) => Err(SyncError::Access(e)), - Err(GetRefError::Internal(e)) => Err(SyncError::Internal(e)), - } - } - - async fn get_dataset_writer( - &self, - dataset_ref: &SyncRef, - create_if_not_exists: bool, - ) -> Result<(Option>, Option), SyncError> { - match dataset_ref { - SyncRef::Local(local_ref) => { - match self.dataset_repo.find_dataset_by_ref(local_ref).await { - Ok(dataset) => { - let dataset_handle = - self.dataset_repo.resolve_dataset_ref(local_ref).await?; - self.dataset_action_authorizer - .check_action_allowed(&dataset_handle, auth::DatasetAction::Write) - .await?; - - Ok((Some(dataset), None)) - } - Err(GetDatasetError::NotFound(_)) if create_if_not_exists => { - let alias = local_ref.alias().unwrap().clone(); - let repo_writer = self.dataset_repo_writer.clone(); - - Ok(( - None, - Some(Box::new(move |seed_block| { - Box::pin(async move { - // After retrieving the dataset externally, we default to - // private visibility. 
- /*let create_options = CreateDatasetUseCaseOptions { - dataset_visibility: DatasetVisibility::Private, - };*/ - - repo_writer.create_dataset(&alias, seed_block).await - }) - })), - )) - } - Err(err) => Err(err.into()), - } - } - SyncRef::Remote(url) => { - // TODO: implement authorization checks somehow - let dataset = self - .dataset_factory - .get_dataset(url.as_ref(), create_if_not_exists) - .await?; - - if !create_if_not_exists { - match dataset - .as_metadata_chain() - .resolve_ref(&BlockRef::Head) - .await - { - Ok(_) => Ok(()), - Err(GetRefError::NotFound(_)) => Err(DatasetNotFoundError { - dataset_ref: dataset_ref.as_any_ref(), - } - .into()), - Err(GetRefError::Access(e)) => Err(SyncError::Access(e)), - Err(GetRefError::Internal(e)) => Err(SyncError::Internal(e)), - }?; - } - - Ok((Some(dataset), None)) - } - } - } - + #[tracing::instrument(level = "debug", skip_all, fields(?src, ?dst, ?opts))] async fn sync_generic( &self, - src_ref: &SyncRef, - dst_ref: &SyncRef, + src: SyncRef, + dst: SyncRef, opts: SyncOptions, listener: Arc, ) -> Result { - let src_is_local = src_ref.is_local(); - - let src_dataset = self.get_dataset_reader(src_ref).await?; - let (dst_dataset, dst_factory) = self - .get_dataset_writer(dst_ref, opts.create_if_not_exists) - .await?; + let src_is_local = src.is_local(); + let trust_source_hashes = opts.trust_source.unwrap_or(src_is_local); let validation = if opts.trust_source.unwrap_or(src_is_local) { AppendValidation::None @@ -214,69 +75,92 @@ impl SyncServiceImpl { AppendValidation::Full }; - let trust_source_hashes = opts.trust_source.unwrap_or(src_is_local); + let src_dataset = match &src { + SyncRef::Local(resolved_dataset) => (**resolved_dataset).clone(), + SyncRef::LocalNew(_) => unreachable!(), + SyncRef::Remote(src_remote) => src_remote.dataset.clone(), + }; + + let maybe_dst_dataset = match &dst { + SyncRef::Local(resolved_dataset) => Some((**resolved_dataset).clone()), + SyncRef::LocalNew(_) => None, + SyncRef::Remote(src_remote) => Some(src_remote.dataset.clone()), + }; + + let maybe_dst_alias = match &dst { + SyncRef::Local(l) => Some(l.get_alias()), + SyncRef::LocalNew(alias) => Some(alias), + SyncRef::Remote(_) => None, + }; tracing::info!("Starting sync using Simple Transfer Protocol"); - SimpleTransferProtocol + self.simple_transfer_protocol .sync( - &src_ref.as_any_ref(), + &src.as_internal_any_ref(), src_dataset, - dst_dataset, - dst_factory, + maybe_dst_dataset, + maybe_dst_alias, validation, trust_source_hashes, opts.force, + SimpleProtocolTransferOptions::default(), listener, ) .await } + #[tracing::instrument(level = "debug", skip_all, fields(%src_url, ?dst, ?opts))] async fn sync_smart_pull_transfer_protocol( &self, src_url: &Url, - dst_ref: &SyncRef, + dst: SyncRef, opts: SyncOptions, listener: Arc, ) -> Result { let http_src_url = src_url.odf_to_transport_protocol()?; - let (dst_dataset, dst_factory) = self - .get_dataset_writer(dst_ref, opts.create_if_not_exists) - .await?; - tracing::info!("Starting sync using Smart Transfer Protocol (Pull flow)"); + let maybe_dst_alias = match &dst { + SyncRef::Local(l) => Some(l.get_alias()), + SyncRef::LocalNew(alias) => Some(alias), + SyncRef::Remote(_) => None, + }; + + let maybe_dst_dataset = match &dst { + SyncRef::Local(resolved_dataset) => Some((**resolved_dataset).clone()), + SyncRef::LocalNew(_) => None, + SyncRef::Remote(src_remote) => Some(src_remote.dataset.clone()), + }; + self.smart_transfer_protocol .pull_protocol_client_flow( &http_src_url, - dst_dataset, - dst_factory, + 
maybe_dst_dataset, + maybe_dst_alias, listener, - TransferOptions { + SmartTransferOptions { force_update_if_diverged: opts.force, + visibility_for_created_dataset: opts.dataset_visibility, ..Default::default() }, ) .await } + #[tracing::instrument(level = "debug", skip_all, fields(?src, %dst_url, ?opts))] async fn sync_smart_push_transfer_protocol<'a>( &'a self, - src: &SyncRef, + src: SyncRef, dst_url: &Url, opts: SyncOptions, listener: Arc, ) -> Result { - let src_dataset = self.get_dataset_reader(src).await?; - let http_dst_url = dst_url.odf_to_transport_protocol()?; // TODO: move head check into the protocol - let maybe_dst_head = match self - .get_dataset_reader(&SyncRef::Remote(Arc::new(http_dst_url.clone()))) - .await - { + let maybe_dst_head = match self.dataset_factory.get_dataset(&http_dst_url, false).await { Ok(http_dst_dataset_view) => match http_dst_dataset_view .as_metadata_chain() .resolve_ref(&BlockRef::Head) @@ -287,10 +171,15 @@ impl SyncServiceImpl { Err(GetRefError::Access(e)) => Err(SyncError::Access(e)), Err(GetRefError::Internal(e)) => Err(SyncError::Internal(e)), }, - Err(SyncError::DatasetNotFound(_)) => Ok(None), - Err(e) => Err(e), + Err(e) => Err(e.into()), }?; + let src_dataset = match &src { + SyncRef::Local(resolved_dataset) => (**resolved_dataset).clone(), + SyncRef::LocalNew(_) => unreachable!(), + SyncRef::Remote(src_remote) => src_remote.dataset.clone(), + }; + tracing::info!("Starting sync using Smart Transfer Protocol (Push flow)"); self.smart_transfer_protocol .push_protocol_client_flow( @@ -298,7 +187,7 @@ impl SyncServiceImpl { &http_dst_url, maybe_dst_head.as_ref(), listener, - TransferOptions { + SmartTransferOptions { force_update_if_diverged: opts.force, visibility_for_created_dataset: opts.dataset_visibility, ..Default::default() @@ -307,9 +196,10 @@ impl SyncServiceImpl { .await } + #[tracing::instrument(level = "debug", skip_all, fields(%dst_url, ?opts))] async fn sync_to_ipfs( &self, - src: &DatasetRef, + src_dataset: Arc, dst_url: &Url, opts: SyncOptions, ) -> Result { @@ -329,13 +219,7 @@ impl SyncServiceImpl { tracing::info!(key_name = %key.name, key_id = %key.id, "Resolved the key to use for IPNS publishing"); - let src_dataset_handle = self.dataset_repo.resolve_dataset_ref(src).await?; - self.dataset_action_authorizer - .check_action_allowed(&src_dataset_handle, auth::DatasetAction::Read) - .await?; - // Resolve and compare heads - let src_dataset = self.dataset_repo.find_dataset_by_ref(src).await?; let src_head = src_dataset .as_metadata_chain() .resolve_ref(&BlockRef::Head) @@ -356,8 +240,10 @@ impl SyncServiceImpl { } Some(old_cid) => { tracing::info!(%old_cid, "Attempting to read remote head"); - let dst_http_url = - self.resolve_remote_dataset_url(&DatasetRefRemote::from(dst_url))?; + let dst_http_url = resolve_remote_dataset_url( + self.remote_repo_reg.as_ref(), + &DatasetRefRemote::from(dst_url), + )?; let dst_dataset = self .dataset_factory .get_dataset(&dst_http_url, false) @@ -484,7 +370,7 @@ impl SyncServiceImpl { // Add files to IPFS tracing::info!("Adding files to IPFS"); - let cid = self.add_to_ipfs(src).await?; + let cid = self.add_to_ipfs(src_dataset.as_ref()).await?; // Publish to IPNS tracing::info!(%cid, "Publishing to IPNS"); @@ -507,9 +393,15 @@ impl SyncServiceImpl { }) } - async fn add_to_ipfs(&self, src: &DatasetRef) -> Result { - let source_url = self.dataset_repo.get_dataset_url(src).await.int_err()?; - let source_path = source_url.to_file_path().unwrap(); + async fn add_to_ipfs(&self, src_dataset: &dyn 
Dataset) -> Result { + let source_url = src_dataset.get_storage_internal_url(); + let source_path = source_url.to_file_path().map_err(|_| { + IpfsAddError::UnsupportedIpfsStorageType({ + UnsupportedIpfsStorageTypeError { + url: source_url.clone(), + } + }) + })?; let cid = self .ipfs_client @@ -524,23 +416,19 @@ impl SyncServiceImpl { Ok(cid) } - #[tracing::instrument(level = "info", name = "sync", skip_all, fields(%src, %dst))] + #[tracing::instrument(level = "info", name = "sync", skip_all, fields(src=?src, dst=?dst))] async fn sync_impl( &self, - src: &DatasetRefAny, - dst: &DatasetRefAny, + src: SyncRef, + dst: SyncRef, opts: SyncOptions, listener: Arc, ) -> Result { - let src = self.resolve_sync_ref(src)?; - let dst = self.resolve_sync_ref(dst)?; - tracing::info!(src_loc = ?src, dst_loc = ?dst, "Resolved source / destination"); - match (&src, &dst) { // * -> ipfs - (_, SyncRef::Remote(dst_url)) if dst_url.scheme() == "ipfs" => { + (_, SyncRef::Remote(dst_remote)) if dst_remote.url.scheme() == "ipfs" => { Err(UnsupportedProtocolError { - url: dst_url.as_ref().clone(), + url: dst_remote.url.as_ref().clone(), message: Some( concat!( "Cannot sync to ipfs://{CID} URLs since IPFS ", @@ -554,9 +442,11 @@ impl SyncServiceImpl { .into()) } // -> ipns - (SyncRef::Remote(_), SyncRef::Remote(dst_url)) if dst_url.scheme() == "ipns" => { + (SyncRef::Remote(_), SyncRef::Remote(dst_remote)) + if dst_remote.url.scheme() == "ipns" => + { Err(UnsupportedProtocolError { - url: dst_url.as_ref().clone(), + url: dst_remote.url.as_ref().clone(), message: Some( concat!( "Syncing from a remote repository directly to IPFS ", @@ -569,11 +459,16 @@ impl SyncServiceImpl { .into()) } // -> ipns - (SyncRef::Local(src_ref), SyncRef::Remote(dst_url)) if dst_url.scheme() == "ipns" => { - match dst_url.path() { - "" | "/" => self.sync_to_ipfs(src_ref, dst_url, opts).await, + (SyncRef::Local(src_dataset), SyncRef::Remote(dst_remote)) + if dst_remote.url.scheme() == "ipns" => + { + match dst_remote.url.path() { + "" | "/" => { + self.sync_to_ipfs((**src_dataset).clone(), dst_remote.url.as_ref(), opts) + .await + } _ => Err(UnsupportedProtocolError { - url: dst_url.as_ref().clone(), + url: dst_remote.url.as_ref().clone(), message: Some( concat!( "Cannot use a sub-path when syncing to ipns:// URL. 
", @@ -586,11 +481,11 @@ impl SyncServiceImpl { } } // odf -> odf - (SyncRef::Remote(src_url), SyncRef::Remote(dst_url)) - if src_url.is_odf_protocol() && dst_url.is_odf_protocol() => + (SyncRef::Remote(src_remote), SyncRef::Remote(dst_remote)) + if src_remote.url.is_odf_protocol() && dst_remote.url.is_odf_protocol() => { Err(UnsupportedProtocolError { - url: dst_url.as_ref().clone(), + url: dst_remote.url.as_ref().clone(), message: Some( concat!( "Syncing from a remote ODF repository directly to remote ODF ", @@ -603,17 +498,17 @@ impl SyncServiceImpl { .into()) } // odf -> * - (SyncRef::Remote(src_url), _) if src_url.is_odf_protocol() => { - self.sync_smart_pull_transfer_protocol(src_url.as_ref(), &dst, opts, listener) + (SyncRef::Remote(src_remote), _) if src_remote.url.is_odf_protocol() => { + self.sync_smart_pull_transfer_protocol(src_remote.url.as_ref(), dst, opts, listener) .await } // * -> odf - (_, SyncRef::Remote(dst_url)) if dst_url.is_odf_protocol() => { - self.sync_smart_push_transfer_protocol(&src, dst_url.as_ref(), opts, listener) + (_, SyncRef::Remote(dst_remote)) if dst_remote.url.is_odf_protocol() => { + self.sync_smart_push_transfer_protocol(src, dst_remote.url.as_ref(), opts, listener) .await } // * -> * - (_, _) => self.sync_generic(&src, &dst, opts, listener).await, + (_, _) => self.sync_generic(src, dst, opts, listener).await, } } } @@ -622,17 +517,19 @@ impl SyncServiceImpl { #[async_trait::async_trait] impl SyncService for SyncServiceImpl { + #[tracing::instrument(level = "debug", skip_all, fields(?request, ?options))] async fn sync( &self, - src: &DatasetRefAny, - dst: &DatasetRefAny, + request: SyncRequest, options: SyncOptions, listener: Option>, ) -> Result { let listener = listener.unwrap_or(Arc::new(NullSyncListener)); listener.begin(); - - match self.sync_impl(src, dst, options, listener.clone()).await { + match self + .sync_impl(request.src, request.dst, options, listener.clone()) + .await + { Ok(result) => { listener.success(&result); Ok(result) @@ -644,50 +541,9 @@ impl SyncService for SyncServiceImpl { } } - // TODO: Parallelism - async fn sync_multi( - &self, - requests: Vec, - options: SyncOptions, - listener: Option>, - ) -> Vec { - let mut results = Vec::new(); - - for SyncRequest { src, dst } in requests { - let listener = listener.as_ref().and_then(|l| l.begin_sync(&src, &dst)); - let result = self.sync(&src, &dst, options.clone(), listener).await; - results.push(SyncResultMulti { src, dst, result }); - } - - results - } - - async fn ipfs_add(&self, src: &DatasetRef) -> Result { - self.add_to_ipfs(src).await - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Clone)] -enum SyncRef { - Local(DatasetRef), - Remote(Arc), -} - -impl SyncRef { - fn is_local(&self) -> bool { - match self { - Self::Local(_) => true, - Self::Remote(_) => false, - } - } - - fn as_any_ref(&self) -> DatasetRefAny { - match self { - Self::Local(local_ref) => local_ref.as_any_ref(), - Self::Remote(url) => DatasetRefAny::Url(Arc::clone(url)), - } + #[tracing::instrument(level = "debug", skip_all)] + async fn ipfs_add(&self, src: ResolvedDataset) -> Result { + self.add_to_ipfs(src.as_ref()).await } } diff --git a/src/infra/core/src/testing/dataset_test_helper.rs b/src/infra/core/src/testing/dataset_test_helper.rs index 99577e6eac..9e3372d0a0 100644 --- a/src/infra/core/src/testing/dataset_test_helper.rs +++ b/src/infra/core/src/testing/dataset_test_helper.rs @@ -81,18 +81,18 @@ impl 
DatasetTestHelper { } pub async fn append_random_data( - dataset_repo: &dyn DatasetRepository, + dataset_registry: &dyn DatasetRegistry, dataset_ref: impl Into, data_size: usize, ) -> Multihash { let tmp_dir = tempfile::tempdir().unwrap(); - let ds = dataset_repo - .find_dataset_by_ref(&dataset_ref.into()) + let resolved_dataset = dataset_registry + .get_dataset_by_ref(&dataset_ref.into()) .await .unwrap(); - let prev_data = ds + let prev_data = resolved_dataset .as_metadata_chain() .iter_blocks() .filter_map_ok(|(_, b)| match b.event { @@ -123,21 +123,22 @@ impl DatasetTestHelper { end: start + num_records - 1, }; - ds.commit_add_data( - AddDataParams { - prev_checkpoint, - prev_offset, - new_offset_interval: Some(new_offset_interval), - new_watermark: None, - new_source_state: None, - }, - Some(OwnedFile::new(data_path)), - Some(CheckpointRef::New(OwnedFile::new(checkpoint_path))), - CommitOpts::default(), - ) - .await - .unwrap() - .new_head + resolved_dataset + .commit_add_data( + AddDataParams { + prev_checkpoint, + prev_offset, + new_offset_interval: Some(new_offset_interval), + new_watermark: None, + new_source_state: None, + }, + Some(OwnedFile::new(data_path)), + Some(CheckpointRef::New(OwnedFile::new(checkpoint_path))), + CommitOpts::default(), + ) + .await + .unwrap() + .new_head } } diff --git a/src/infra/core/src/testing/dummy_smart_transfer_protocol_client.rs b/src/infra/core/src/testing/dummy_smart_transfer_protocol_client.rs index 862a4153cb..5c8ad08367 100644 --- a/src/infra/core/src/testing/dummy_smart_transfer_protocol_client.rs +++ b/src/infra/core/src/testing/dummy_smart_transfer_protocol_client.rs @@ -10,12 +10,13 @@ use std::sync::Arc; use kamu_core::{Dataset, SyncError, SyncListener, SyncResult}; -use opendatafabric::Multihash; +use opendatafabric as odf; use url::Url; -use crate::utils::simple_transfer_protocol::DatasetFactoryFn; use crate::utils::smart_transfer_protocol::{SmartTransferProtocolClient, TransferOptions}; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[dill::component] #[dill::interface(dyn SmartTransferProtocolClient)] pub struct DummySmartTransferProtocolClient {} @@ -32,7 +33,7 @@ impl SmartTransferProtocolClient for DummySmartTransferProtocolClient { &self, _http_src_url: &Url, _dst: Option>, - _dst_factory: Option, + _dst_alias: Option<&odf::DatasetAlias>, _listener: Arc, _transfer_options: TransferOptions, ) -> Result { @@ -43,10 +44,12 @@ impl SmartTransferProtocolClient for DummySmartTransferProtocolClient { &self, _src: Arc, _http_dst_url: &Url, - _dst_head: Option<&Multihash>, + _dst_head: Option<&odf::Multihash>, _listener: Arc, _transfer_options: TransferOptions, ) -> Result { unimplemented!("Not supported yet") } } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/testing/metadata_factory.rs b/src/infra/core/src/testing/metadata_factory.rs index c21c560170..b65768497c 100644 --- a/src/infra/core/src/testing/metadata_factory.rs +++ b/src/infra/core/src/testing/metadata_factory.rs @@ -650,6 +650,23 @@ impl ExecuteTransformBuilder { self } + pub fn empty_query_inputs_from_particular_ids(mut self, dataset_ids: I) -> Self + where + I: IntoIterator, + { + self.v.query_inputs = dataset_ids + .into_iter() + .map(|dataset_id| ExecuteTransformInput { + dataset_id, + prev_block_hash: None, + new_block_hash: None, + prev_offset: None, + new_offset: None, + }) + 
.collect(); + self + } + pub fn empty_query_inputs_from_seeded_ids(mut self, aliases: I) -> Self where I: IntoIterator, diff --git a/src/infra/core/src/testing/mock_dataset_action_authorizer.rs b/src/infra/core/src/testing/mock_dataset_action_authorizer.rs index 9670b576ce..4f1fc6cd3e 100644 --- a/src/infra/core/src/testing/mock_dataset_action_authorizer.rs +++ b/src/infra/core/src/testing/mock_dataset_action_authorizer.rs @@ -9,8 +9,10 @@ use std::collections::HashSet; +use internal_error::InternalError; use kamu_core::auth::{ self, + ClassifyByAllowanceResponse, DatasetAction, DatasetActionAuthorizer, DatasetActionNotEnoughPermissionsError, @@ -25,6 +27,7 @@ use opendatafabric::{DatasetAlias, DatasetHandle}; mockall::mock! { pub DatasetActionAuthorizer {} + #[async_trait::async_trait] impl DatasetActionAuthorizer for DatasetActionAuthorizer { async fn check_action_allowed( @@ -34,6 +37,18 @@ mockall::mock! { ) -> Result<(), DatasetActionUnauthorizedError>; async fn get_allowed_actions(&self, dataset_handle: &DatasetHandle) -> HashSet; + + async fn filter_datasets_allowing( + &self, + dataset_handles: Vec, + action: DatasetAction, + ) -> Result, InternalError>; + + async fn classify_datasets_by_allowance( + &self, + dataset_handles: Vec, + action: DatasetAction, + ) -> Result; } } @@ -134,6 +149,37 @@ impl MockDatasetActionAuthorizer { self } + + pub fn make_expect_classify_datasets_by_allowance( + mut self, + action: auth::DatasetAction, + times: usize, + authorized: HashSet, + ) -> Self { + self.expect_classify_datasets_by_allowance() + .with(always(), eq(action)) + .times(times) + .returning(move |handles, action| { + let mut good = Vec::new(); + let mut bad = Vec::new(); + + for handle in handles { + if authorized.contains(&handle.alias) { + good.push(handle); + } else { + let error = Self::denying_error(&handle, action); + bad.push((handle, error)); + } + } + + Ok(ClassifyByAllowanceResponse { + authorized_handles: good, + unauthorized_handles_with_errors: bad, + }) + }); + + self + } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/testing/mock_dataset_changes_service.rs b/src/infra/core/src/testing/mock_dataset_changes_service.rs index 3949fcb8a8..fcf54622cd 100644 --- a/src/infra/core/src/testing/mock_dataset_changes_service.rs +++ b/src/infra/core/src/testing/mock_dataset_changes_service.rs @@ -14,6 +14,7 @@ use opendatafabric::{DatasetID, Multihash}; mockall::mock! { pub DatasetChangesService {} + #[async_trait::async_trait] impl DatasetChangesService for DatasetChangesService { #[allow(clippy::ref_option_ref)] diff --git a/src/infra/core/src/testing/mock_dependency_graph_repository.rs b/src/infra/core/src/testing/mock_dependency_graph_repository.rs index 0aae4d6e23..103cf19014 100644 --- a/src/infra/core/src/testing/mock_dependency_graph_repository.rs +++ b/src/infra/core/src/testing/mock_dependency_graph_repository.rs @@ -13,6 +13,7 @@ use kamu_core::{DatasetDependenciesIDStream, DependencyGraphRepository}; mockall::mock! 
{ pub DependencyGraphRepository {} + #[async_trait::async_trait] impl DependencyGraphRepository for DependencyGraphRepository { fn list_dependencies_of_all_datasets(&self) -> DatasetDependenciesIDStream<'_>; diff --git a/src/infra/core/src/testing/mock_odf_server_access_token_resolver.rs b/src/infra/core/src/testing/mock_odf_server_access_token_resolver.rs index e7fb738126..28458af742 100644 --- a/src/infra/core/src/testing/mock_odf_server_access_token_resolver.rs +++ b/src/infra/core/src/testing/mock_odf_server_access_token_resolver.rs @@ -14,6 +14,7 @@ use url::Url; mockall::mock! { pub OdfServerAccessTokenResolver {} + #[async_trait::async_trait] impl OdfServerAccessTokenResolver for OdfServerAccessTokenResolver { fn resolve_odf_dataset_access_token( diff --git a/src/infra/core/src/testing/mock_polling_source_service.rs b/src/infra/core/src/testing/mock_polling_source_service.rs index f4d180ad88..f992a8d9ff 100644 --- a/src/infra/core/src/testing/mock_polling_source_service.rs +++ b/src/infra/core/src/testing/mock_polling_source_service.rs @@ -14,14 +14,13 @@ use kamu_core::{ GetDatasetError, PollingIngestError, PollingIngestListener, - PollingIngestMultiListener, PollingIngestOptions, - PollingIngestResponse, PollingIngestResult, PollingIngestService, + ResolvedDataset, }; use opendatafabric::{ - DatasetRef, + DatasetAlias, FetchStep, FetchStepUrl, MergeStrategy, @@ -37,32 +36,39 @@ use opendatafabric::{ mockall::mock! { pub PollingIngestService {} + #[async_trait::async_trait] impl PollingIngestService for PollingIngestService { - async fn get_active_polling_source( - &self, - dataset_ref: &DatasetRef, - ) -> Result)>, GetDatasetError>; - - async fn ingest( - &self, - dataset_ref: &DatasetRef, - options: PollingIngestOptions, - listener: Option>, - ) -> Result; + async fn get_active_polling_source( + &self, + target: ResolvedDataset, + ) -> Result)>, GetDatasetError>; - async fn ingest_multi( - &self, - dataset_refs: Vec, - options: PollingIngestOptions, - listener: Option>, - ) -> Vec; + async fn ingest( + &self, + target: ResolvedDataset, + options: PollingIngestOptions, + listener: Option>, + ) -> Result; } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// impl MockPollingIngestService { + pub fn make_expect_ingest(mut self, dataset_alias: DatasetAlias) -> Self { + self.expect_ingest() + .withf(move |target, _, _| target.get_alias() == &dataset_alias) + .times(1) + .returning(|_, _, _| { + Ok(PollingIngestResult::UpToDate { + no_source_defined: false, + uncacheable: false, + }) + }); + self + } + pub fn without_active_polling_source() -> Self { let mut dependency_graph_repo_mock = MockPollingIngestService::default(); dependency_graph_repo_mock diff --git a/src/infra/core/src/testing/mock_sync_service.rs b/src/infra/core/src/testing/mock_sync_service.rs new file mode 100644 index 0000000000..82c627d5b0 --- /dev/null +++ b/src/infra/core/src/testing/mock_sync_service.rs @@ -0,0 +1,64 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
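
The authorizer mock above now answers batched requests by splitting handles into authorized and unauthorized groups in a single ClassifyByAllowanceResponse. A stand-alone sketch of that partitioning, assuming simplified handle and error types in place of the real kamu-core ones:

// Illustrative sketch: batched classification of dataset handles, echoing
// the ClassifyByAllowanceResponse shape returned by the mock above.
// All types here are simplified placeholders.
use std::collections::HashSet;

#[derive(Debug, Clone)]
struct DatasetHandle {
    alias: String,
}

#[derive(Debug)]
struct AccessError {
    alias: String,
}

#[derive(Debug, Default)]
struct ClassifyByAllowanceResponse {
    authorized_handles: Vec<DatasetHandle>,
    unauthorized_handles_with_errors: Vec<(DatasetHandle, AccessError)>,
}

fn classify(handles: Vec<DatasetHandle>, allowed: &HashSet<String>) -> ClassifyByAllowanceResponse {
    let mut response = ClassifyByAllowanceResponse::default();
    for handle in handles {
        if allowed.contains(&handle.alias) {
            response.authorized_handles.push(handle);
        } else {
            // Keep the handle together with a per-dataset error, as the mock does.
            let error = AccessError { alias: handle.alias.clone() };
            response.unauthorized_handles_with_errors.push((handle, error));
        }
    }
    response
}

fn main() {
    let allowed: HashSet<String> = ["a/x".to_string()].into_iter().collect();
    let handles = vec![
        DatasetHandle { alias: "a/x".to_string() },
        DatasetHandle { alias: "a/y".to_string() },
    ];
    let response = classify(handles, &allowed);
    println!(
        "{} authorized, {} rejected",
        response.authorized_handles.len(),
        response.unauthorized_handles_with_errors.len()
    );
}
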
+ +use std::sync::Arc; + +use kamu_core::*; +use opendatafabric::{DatasetAlias, DatasetRefRemote}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +mockall::mock! { + pub SyncService {} + + #[async_trait::async_trait] + impl SyncService for SyncService { + async fn sync( + &self, + request: SyncRequest, + options: SyncOptions, + listener: Option>, + ) -> Result; + + /// Adds dataset to IPFS and returns the root CID. + /// Unlike `sync` it does not do IPNS resolution and publishing. + async fn ipfs_add(&self, src: ResolvedDataset) -> Result; + } +} + +impl MockSyncService { + pub fn make_expect_sync_pull_from_remote_to_existing_local( + mut self, + target_alias: DatasetAlias, + src_remote_ref: DatasetRefRemote, + injected_result: SyncResult, + ) -> Self { + self.expect_sync() + .withf(move |request, _, _| { + matches!( + &(request.src), + SyncRef::Remote(SyncRefRemote { + url: _, + dataset: _, + original_remote_ref, + }) + if original_remote_ref == &src_remote_ref + ) && matches!( + &(request.dst), + SyncRef::Local(resolved_dataset) + if resolved_dataset.get_alias() == &target_alias + ) + }) + .times(1) + .returning(move |_, _, _| Ok(injected_result.clone())); + self + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/testing/mock_transform_elaboration_service.rs b/src/infra/core/src/testing/mock_transform_elaboration_service.rs new file mode 100644 index 0000000000..6112c00f5a --- /dev/null +++ b/src/infra/core/src/testing/mock_transform_elaboration_service.rs @@ -0,0 +1,60 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use engine::TransformRequestExt; +use kamu_core::*; +use opendatafabric::DatasetAlias; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +mockall::mock! 
{ + pub TransformElaborationService {} + + #[async_trait::async_trait] + impl TransformElaborationService for TransformElaborationService { + async fn elaborate_transform( + &self, + target: ResolvedDataset, + plan: TransformPreliminaryPlan, + transform_options: TransformOptions, + maybe_listener: Option>, + ) -> Result; + } +} + +impl MockTransformElaborationService { + pub fn make_expect_elaborate_transform(mut self, target_alias: DatasetAlias) -> Self { + self.expect_elaborate_transform() + .withf(move |target, _, _, _| target.get_alias() == &target_alias) + .times(1) + .returning(|_, plan, _, _| { + Ok(TransformElaboration::Elaborated(TransformPlan { + request: TransformRequestExt { + operation_id: plan.preliminary_request.operation_id, + dataset_handle: plan.preliminary_request.dataset_handle, + block_ref: plan.preliminary_request.block_ref, + head: plan.preliminary_request.head, + transform: plan.preliminary_request.transform, + system_time: plan.preliminary_request.system_time, + schema: plan.preliminary_request.schema, + prev_offset: plan.preliminary_request.prev_offset, + vocab: plan.preliminary_request.vocab, + inputs: vec![], + prev_checkpoint: plan.preliminary_request.prev_checkpoint, + }, + datasets_map: ResolvedDatasetsMap::default(), + })) + }); + self + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/testing/mock_transform_execution_service.rs b/src/infra/core/src/testing/mock_transform_execution_service.rs new file mode 100644 index 0000000000..d3e08ece16 --- /dev/null +++ b/src/infra/core/src/testing/mock_transform_execution_service.rs @@ -0,0 +1,51 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use kamu_core::*; +use opendatafabric::DatasetAlias; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +mockall::mock! { + pub TransformExecutionService {} + + #[async_trait::async_trait] + impl TransformExecutionService for TransformExecutionService { + async fn execute_transform( + &self, + target: ResolvedDataset, + plan: TransformPlan, + maybe_listener: Option>, + ) -> ( + ResolvedDataset, + Result, + ); + + async fn execute_verify_transform( + &self, + target: ResolvedDataset, + verification_operation: VerifyTransformOperation, + maybe_listener: Option>, + ) -> Result<(), VerifyTransformExecuteError>; + } +} + +impl MockTransformExecutionService { + pub fn make_expect_transform(mut self, target_alias: DatasetAlias) -> Self { + self.expect_execute_transform() + .withf(move |target, _, _| target.get_alias() == &target_alias) + .times(1) + .returning(|target, _, _| (target, Ok(TransformResult::UpToDate))); + self + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/testing/mock_transform_request_planner.rs b/src/infra/core/src/testing/mock_transform_request_planner.rs new file mode 100644 index 0000000000..00468b589c --- /dev/null +++ b/src/infra/core/src/testing/mock_transform_request_planner.rs @@ -0,0 +1,74 @@ +// Copyright Kamu Data, Inc. 
and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use chrono::Utc; +use internal_error::InternalError; +use kamu_core::*; +use opendatafabric::*; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +mockall::mock! { + pub TransformRequestPlanner {} + #[async_trait::async_trait] + impl TransformRequestPlanner for TransformRequestPlanner { + async fn get_active_transform( + &self, + target: ResolvedDataset, + ) -> Result)>, InternalError>; + + async fn build_transform_preliminary_plan( + &self, + target: ResolvedDataset, + ) -> Result; + + async fn build_transform_verification_plan( + &self, + target: ResolvedDataset, + block_range: (Option, Option), + ) -> Result; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +impl MockTransformRequestPlanner { + pub fn without_set_transform() -> Self { + let mut mock = Self::default(); + mock.expect_get_active_transform().return_once(|_| Ok(None)); + mock + } + + pub fn with_set_transform() -> Self { + let mut mock = Self::default(); + mock.expect_get_active_transform().return_once(|_| { + Ok(Some(( + Multihash::from_digest_sha3_256(b"a"), + MetadataBlockTyped { + system_time: Utc::now(), + prev_block_hash: None, + event: SetTransform { + inputs: vec![], + transform: Transform::Sql(TransformSql { + engine: "spark".to_string(), + version: None, + query: None, + queries: None, + temporal_tables: None, + }), + }, + sequence_number: 0, + }, + ))) + }); + mock + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/testing/mock_transform_service.rs b/src/infra/core/src/testing/mock_transform_service.rs deleted file mode 100644 index 8da316ec50..0000000000 --- a/src/infra/core/src/testing/mock_transform_service.rs +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - -use std::sync::Arc; - -use chrono::Utc; -use kamu_core::{ - GetDatasetError, - TransformError, - TransformListener, - TransformMultiListener, - TransformOptions, - TransformResult, - TransformService, - VerificationError, - VerificationListener, -}; -use opendatafabric::{ - DatasetRef, - MetadataBlockTyped, - Multihash, - SetTransform, - Transform, - TransformSql, -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -mockall::mock! 
{ - pub TransformService {} - #[async_trait::async_trait] - impl TransformService for TransformService { - async fn get_active_transform( - &self, - dataset_ref: &DatasetRef, - ) -> Result)>, GetDatasetError>; - - async fn transform( - &self, - dataset_ref: &DatasetRef, - options: TransformOptions, - listener: Option>, - ) -> Result; - - async fn transform_multi( - &self, - dataset_refs: Vec, - options: TransformOptions, - listener: Option>, - ) -> Vec<(DatasetRef, Result)>; - - async fn verify_transform( - &self, - dataset_ref: &DatasetRef, - block_range: (Option, Option), - listener: Option>, - ) -> Result<(), VerificationError>; - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -impl MockTransformService { - pub fn without_set_transform() -> Self { - let mut dependency_graph_repo_mock = MockTransformService::default(); - dependency_graph_repo_mock - .expect_get_active_transform() - .return_once(|_| Ok(None)); - dependency_graph_repo_mock - } - - pub fn with_set_transform() -> Self { - let mut dependency_graph_repo_mock = MockTransformService::default(); - dependency_graph_repo_mock - .expect_get_active_transform() - .return_once(|_| { - Ok(Some(( - Multihash::from_digest_sha3_256(b"a"), - MetadataBlockTyped { - system_time: Utc::now(), - prev_block_hash: None, - event: SetTransform { - inputs: vec![], - transform: Transform::Sql(TransformSql { - engine: "spark".to_string(), - version: None, - query: None, - queries: None, - temporal_tables: None, - }), - }, - sequence_number: 0, - }, - ))) - }); - dependency_graph_repo_mock - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/testing/mod.rs b/src/infra/core/src/testing/mod.rs index 052290a40b..fa05145765 100644 --- a/src/infra/core/src/testing/mod.rs +++ b/src/infra/core/src/testing/mod.rs @@ -19,7 +19,10 @@ mod mock_dataset_changes_service; mod mock_dependency_graph_repository; mod mock_odf_server_access_token_resolver; mod mock_polling_source_service; -mod mock_transform_service; +mod mock_sync_service; +mod mock_transform_elaboration_service; +mod mock_transform_execution_service; +mod mock_transform_request_planner; mod parquet_reader_helper; mod parquet_writer_helper; @@ -35,6 +38,9 @@ pub use mock_dataset_changes_service::*; pub use mock_dependency_graph_repository::*; pub use mock_odf_server_access_token_resolver::*; pub use mock_polling_source_service::*; -pub use mock_transform_service::*; +pub use mock_sync_service::*; +pub use mock_transform_elaboration_service::*; +pub use mock_transform_execution_service::*; +pub use mock_transform_request_planner::*; pub use parquet_reader_helper::*; pub use parquet_writer_helper::*; diff --git a/src/infra/core/src/transform/mod.rs b/src/infra/core/src/transform/mod.rs new file mode 100644 index 0000000000..77b70bb4d8 --- /dev/null +++ b/src/infra/core/src/transform/mod.rs @@ -0,0 +1,18 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
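
The monolithic transform service mock above is removed in favor of separate planner, elaboration, and execution mocks. A self-contained sketch of how such a three-stage flow chains together, assuming placeholder types and logic rather than the real service interfaces; only the Elaborated / UpToDate outcome names and the stage names are taken from the patch:

// Illustrative sketch: a transform split into plan -> elaborate -> execute
// stages, echoing the planner/elaboration/execution services above.
// Everything here is a placeholder; the real signatures live in kamu-core.
#[derive(Debug)]
struct PreliminaryPlan {
    target: String,
}

#[derive(Debug)]
struct Plan {
    target: String,
}

#[derive(Debug)]
enum Elaboration {
    Elaborated(Plan),
    UpToDate,
}

fn plan_transform(target: &str) -> PreliminaryPlan {
    // Planning: build a preliminary request for the target dataset.
    PreliminaryPlan { target: target.to_string() }
}

fn elaborate(plan: PreliminaryPlan, has_new_input_data: bool) -> Elaboration {
    // Elaboration: inspect the inputs and decide whether any work is needed.
    if has_new_input_data {
        Elaboration::Elaborated(Plan { target: plan.target })
    } else {
        Elaboration::UpToDate
    }
}

fn execute(plan: Plan) -> String {
    // Execution: run the actual transformation for the elaborated plan.
    format!("transformed {}", plan.target)
}

fn main() {
    for has_new_data in [false, true] {
        let preliminary = plan_transform("org/derived-dataset");
        match elaborate(preliminary, has_new_data) {
            Elaboration::Elaborated(plan) => println!("{}", execute(plan)),
            Elaboration::UpToDate => println!("nothing to do"),
        }
    }
}
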
+ +mod transform_elaboration_service_impl; +mod transform_execution_service_impl; +mod transform_helpers; +mod transform_request_planner_impl; + +pub use transform_elaboration_service_impl::*; +pub use transform_execution_service_impl::*; +pub(crate) use transform_helpers::*; +pub use transform_request_planner_impl::*; diff --git a/src/infra/core/src/transform/transform_elaboration_service_impl.rs b/src/infra/core/src/transform/transform_elaboration_service_impl.rs new file mode 100644 index 0000000000..09c831572e --- /dev/null +++ b/src/infra/core/src/transform/transform_elaboration_service_impl.rs @@ -0,0 +1,230 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use dill::*; +use engine::{TransformRequestExt, TransformRequestInputExt}; +use internal_error::ResultIntoInternal; +use kamu_core::*; +use opendatafabric::{ExecuteTransformInput, TransformInput}; +use time_source::SystemTimeSource; + +use super::get_transform_input_from_query_input; +use crate::build_preliminary_request_ext; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct TransformElaborationServiceImpl { + compaction_svc: Arc, + time_source: Arc, +} + +#[component(pub)] +#[interface(dyn TransformElaborationService)] +impl TransformElaborationServiceImpl { + pub fn new( + compaction_svc: Arc, + time_source: Arc, + ) -> Self { + Self { + compaction_svc, + time_source, + } + } + + async fn elaborate_preliminary_request( + &self, + preliminary_request: TransformPreliminaryRequestExt, + datasets_map: &ResolvedDatasetsMap, + ) -> Result, TransformElaborateError> { + use futures::{StreamExt, TryStreamExt}; + let inputs: Vec<_> = futures::stream::iter(preliminary_request.input_states) + .then(|(input_decl, input_state)| { + self.get_transform_input(input_decl, input_state, datasets_map) + }) + .try_collect() + .await?; + + // Nothing to do? + // Note that we're considering a schema here, as even if there is no data to + // process we would like to run the transform to establish the schema of the + // output. 
+ // + // TODO: Detect the situation where inputs only had source updates and skip + // running the engine + if inputs + .iter() + .all(|i| i.data_slices.is_empty() && i.explicit_watermarks.is_empty()) + && preliminary_request.schema.is_some() + { + return Ok(None); + } + + let final_request: TransformRequestExt = TransformRequestExt { + operation_id: preliminary_request.operation_id, + dataset_handle: preliminary_request.dataset_handle, + block_ref: preliminary_request.block_ref, + head: preliminary_request.head, + transform: preliminary_request.transform, + system_time: preliminary_request.system_time, + schema: preliminary_request.schema, + prev_offset: preliminary_request.prev_offset, + vocab: preliminary_request.vocab, + inputs, + prev_checkpoint: preliminary_request.prev_checkpoint, + }; + + Ok(Some(final_request)) + } + + async fn get_transform_input( + &self, + input_decl: TransformInput, + input_state: Option, + datasets_map: &ResolvedDatasetsMap, + ) -> Result { + let dataset_id = input_decl.dataset_ref.id().unwrap(); + if let Some(input_state) = &input_state { + assert_eq!(*dataset_id, input_state.dataset_id); + } + + let target = datasets_map.get_by_id(dataset_id); + let input_chain = target.as_metadata_chain(); + + // Determine last processed input block and offset + let last_processed_block = input_state.as_ref().and_then(|i| i.last_block_hash()); + let last_processed_offset = input_state + .as_ref() + .and_then(ExecuteTransformInput::last_offset); + + // Determine unprocessed block and offset range + let last_unprocessed_block = input_chain.resolve_ref(&BlockRef::Head).await.int_err()?; + let last_unprocessed_offset = input_chain + .accept_one_by_hash( + &last_unprocessed_block, + SearchSingleDataBlockVisitor::next(), + ) + .await + .int_err()? 
+ .into_event() + .and_then(|event| event.last_offset()) + .or(last_processed_offset); + + let query_input = ExecuteTransformInput { + dataset_id: dataset_id.clone(), + prev_block_hash: last_processed_block.cloned(), + new_block_hash: if Some(&last_unprocessed_block) != last_processed_block { + Some(last_unprocessed_block) + } else { + None + }, + prev_offset: last_processed_offset, + new_offset: if last_unprocessed_offset != last_processed_offset { + last_unprocessed_offset + } else { + None + }, + }; + + get_transform_input_from_query_input( + query_input, + input_decl.alias.clone().unwrap(), + None, + datasets_map, + ) + .await + .map_err(Into::into) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl TransformElaborationService for TransformElaborationServiceImpl { + #[tracing::instrument(level = "info", skip_all, fields(target=%target.get_handle(), ?plan, ?options))] + async fn elaborate_transform( + &self, + target: ResolvedDataset, + plan: TransformPreliminaryPlan, + options: TransformOptions, + maybe_listener: Option>, + ) -> Result { + let listener = maybe_listener.unwrap_or_else(|| Arc::new(NullTransformListener)); + + match self + .elaborate_preliminary_request(plan.preliminary_request.clone(), &plan.datasets_map) + .await + { + Ok(Some(request)) => Ok(TransformElaboration::Elaborated(TransformPlan { + request, + datasets_map: plan.datasets_map, + })), + Ok(None) => Ok(TransformElaboration::UpToDate), + // TODO: Trapping the error to preserve old behavior - we should consider + // surfacing it and handling on upper layers + Err(TransformElaborateError::InputSchemaNotDefined(e)) => { + tracing::info!( + input = %e.dataset_handle, + "Not processing because one of the inputs was never pulled", + ); + listener.begin(); + listener.success(&TransformResult::UpToDate); + Ok(TransformElaboration::UpToDate) + } + Err(err @ TransformElaborateError::InvalidInputInterval(_)) + if options.reset_derivatives_on_diverged_input => + { + tracing::warn!( + error = %err, + "Interval error detected - resetting on diverged input", + ); + + let compaction_result = self + .compaction_svc + .compact_dataset( + target.clone(), + CompactionOptions { + keep_metadata_only: true, + ..Default::default() + }, + None, + ) + .await + .int_err()?; + + if let CompactionResult::Success { .. } = compaction_result { + // Recursing to try again after compaction + self.elaborate_transform( + target.clone(), + TransformPreliminaryPlan { + preliminary_request: build_preliminary_request_ext( + target, + self.time_source.now(), + ) + .await + .int_err()?, + datasets_map: plan.datasets_map, + }, + TransformOptions { + reset_derivatives_on_diverged_input: false, + }, + Some(listener), + ) + .await + } else { + Err(err) + } + } + Err(e) => Err(e), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/transform/transform_execution_service_impl.rs b/src/infra/core/src/transform/transform_execution_service_impl.rs new file mode 100644 index 0000000000..1cbb79a45e --- /dev/null +++ b/src/infra/core/src/transform/transform_execution_service_impl.rs @@ -0,0 +1,354 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use dill::*; +use engine::{TransformRequestExt, TransformResponseExt}; +use internal_error::ResultIntoInternal; +use kamu_core::*; +use kamu_ingest_datafusion::DataWriterDataFusion; +use opendatafabric::{EnumWithVariants, ExecuteTransform, SetDataSchema, Transform}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct TransformExecutionServiceImpl { + engine_provisioner: Arc, +} + +#[component(pub)] +#[interface(dyn TransformExecutionService)] +impl TransformExecutionServiceImpl { + pub fn new(engine_provisioner: Arc) -> Self { + Self { engine_provisioner } + } + + // Note: Can be called from multiple threads + #[tracing::instrument(level = "info", skip_all, fields(operation_id = %request.operation_id))] + async fn do_transform( + engine_provisioner: Arc, + request: TransformRequestExt, + datasets_map: &ResolvedDatasetsMap, + commit_fn: CommitFn, + listener: Arc, + ) -> Result + where + CommitFn: FnOnce(TransformRequestExt, TransformResponseExt) -> Fut, + Fut: futures::Future>, + { + tracing::info!(?request, "Transform request"); + + listener.begin(); + + match Self::do_transform_inner( + engine_provisioner, + request, + datasets_map, + commit_fn, + listener.clone(), + ) + .await + { + Ok(res) => { + tracing::info!("Transform successful"); + listener.success(&res); + Ok(res) + } + Err(err) => { + tracing::error!(error = ?err, error_msg = %err, "Transform failed"); + listener.execute_error(&err); + Err(err) + } + } + } + + // Note: Can be called from multiple threads + async fn do_transform_inner( + engine_provisioner: Arc, + request: TransformRequestExt, + datasets_map: &ResolvedDatasetsMap, + commit_fn: CommitFn, + listener: Arc, + ) -> Result + where + CommitFn: FnOnce(TransformRequestExt, TransformResponseExt) -> Fut, + Fut: futures::Future>, + { + let engine = engine_provisioner + .provision_engine( + match request.transform { + Transform::Sql(ref sql) => &sql.engine, + }, + listener.clone().get_engine_provisioning_listener(), + ) + .await?; + + let response = engine + .execute_transform(request.clone(), datasets_map) + .await?; + assert_eq!( + response.new_offset_interval.is_some(), + response.new_data.is_some() + ); + + commit_fn(request, response).await + } + + async fn commit_execute_transform( + resolved_dataset: ResolvedDataset, + request: TransformRequestExt, + response: TransformResponseExt, + ) -> Result { + let old_head = request.head.clone(); + let mut new_head = old_head.clone(); + + if response.output_schema.is_none() { + tracing::warn!("Engine did not produce a schema. 
In future this will become an error."); + }; + + if let Some(prev_schema) = request.schema { + // Validate schema + if let Some(new_schema) = response.output_schema { + DataWriterDataFusion::validate_output_schema_equivalence(&prev_schema, &new_schema) + .int_err()?; + } + } else { + // Set schema upon first transform + if let Some(new_schema) = response.output_schema { + // TODO: make schema commit atomic with data + let commit_schema_result = resolved_dataset + .commit_event( + SetDataSchema::new(&new_schema).into(), + CommitOpts { + block_ref: &request.block_ref, + system_time: Some(request.system_time), + prev_block_hash: Some(Some(&new_head)), + check_object_refs: false, + update_block_ref: true, + }, + ) + .await?; + + new_head = commit_schema_result.new_head; + } + } + + let params = ExecuteTransformParams { + query_inputs: request.inputs.iter().map(|i| i.clone().into()).collect(), + prev_checkpoint: request.prev_checkpoint, + prev_offset: request.prev_offset, + new_offset_interval: response.new_offset_interval, + new_watermark: response.new_watermark, + }; + + match resolved_dataset + .commit_execute_transform( + params, + response.new_data, + response.new_checkpoint.map(CheckpointRef::New), + CommitOpts { + block_ref: &request.block_ref, + system_time: Some(request.system_time), + prev_block_hash: Some(Some(&new_head)), + check_object_refs: true, + update_block_ref: true, + }, + ) + .await + { + Ok(res) => { + new_head = res.new_head; + Ok(()) + } + Err(CommitError::MetadataAppendError(AppendError::InvalidBlock( + AppendValidationError::NoOpEvent(_), + ))) => Ok(()), + Err(err) => Err(err), + }?; + + assert_ne!( + old_head, new_head, + "Commit did not update neither schema nor data" + ); + + Ok(TransformResult::Updated { old_head, new_head }) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl TransformExecutionService for TransformExecutionServiceImpl { + #[tracing::instrument(level = "info", skip_all, fields(target=%target.get_handle(), ?plan))] + async fn execute_transform( + &self, + target: ResolvedDataset, + plan: TransformPlan, + maybe_listener: Option>, + ) -> ( + ResolvedDataset, + Result, + ) { + let listener = maybe_listener.unwrap_or_else(|| Arc::new(NullTransformListener)); + + ( + target.clone(), + Self::do_transform( + self.engine_provisioner.clone(), + plan.request, + &plan.datasets_map, + |request, response| async move { + Self::commit_execute_transform(target, request, response).await + }, + listener, + ) + .await, + ) + } + + #[tracing::instrument(level = "info", skip_all, fields(target=%target.get_handle(), ?verification_operation))] + async fn execute_verify_transform( + &self, + target: ResolvedDataset, + verification_operation: VerifyTransformOperation, + maybe_listener: Option>, + ) -> Result<(), VerifyTransformExecuteError> { + let listener = maybe_listener.unwrap_or(Arc::new(NullVerificationListener {})); + + let num_steps = verification_operation.steps.len(); + listener.begin_phase(VerificationPhase::ReplayTransform); + + for (step_index, step) in verification_operation.steps.into_iter().enumerate() { + let request = step.request; + + let block_hash = step.expected_hash; + let expected_block = step.expected_block; + let expected_event = expected_block + .event + .into_variant::() + .unwrap(); + + // Will be set during "commit" step + let mut actual_event: Option = None; + + tracing::info!( + %block_hash, + "Replaying block" + ); + + 
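+            // Each recorded ExecuteTransform block is replayed: the engine is re-run
+            // with the original request, the event it would have produced is captured
+            // via `prepare_execute_transform`, and that event is compared against the
+            // recorded block further below.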
listener.begin_block( + &block_hash, + step_index, + num_steps, + VerificationPhase::ReplayTransform, + ); + + let transform_listener = listener + .clone() + .get_transform_listener() + .unwrap_or_else(|| Arc::new(NullTransformListener)); + + let ds = (*target).clone(); + let out_event = &mut actual_event; + + let result = TransformResult::Updated { + old_head: expected_block.prev_block_hash.clone().unwrap(), + new_head: block_hash.clone(), + }; + + Self::do_transform( + self.engine_provisioner.clone(), + request, + &verification_operation.datasets_map, + |request, response| async move { + let params = ExecuteTransformParams { + query_inputs: request.inputs.iter().map(|i| i.clone().into()).collect(), + prev_checkpoint: request.prev_checkpoint, + prev_offset: request.prev_offset, + new_offset_interval: response.new_offset_interval, + new_watermark: response.new_watermark, + }; + + // We expect outputs to be cleaned up automatically on drop + let new_event = ds + .prepare_execute_transform( + params, + response.new_data.as_ref(), + response.new_checkpoint.map(CheckpointRef::New).as_ref(), + ) + .await?; + + *out_event = Some(new_event); + + // This result is ignored + Ok(result) + }, + transform_listener, + ) + .await + .map_err(|e| match e { + TransformExecuteError::EngineProvisioningError(e) => { + VerifyTransformExecuteError::EngineProvisioningError(e) + } + TransformExecuteError::EngineError(e) => { + VerifyTransformExecuteError::EngineError(e) + } + TransformExecuteError::CommitError(_) => unreachable!(), + TransformExecuteError::Internal(e) => VerifyTransformExecuteError::Internal(e), + })?; + + let actual_event = actual_event.unwrap(); + + tracing::debug!(%block_hash, ?expected_event, ?actual_event, "Comparing expected and replayed events"); + + let mut cmp_actual_event = actual_event.clone(); + + // Parquet format is non-reproducible, so we rely only on logical hash for + // equivalence test and overwrite the physical hash and size with + // the expected values for comparison + if let Some(actual_slice) = &mut cmp_actual_event.new_data { + if let Some(expected_slice) = &expected_event.new_data { + actual_slice.physical_hash = expected_slice.physical_hash.clone(); + actual_slice.size = expected_slice.size; + } + } + + // Currently we're considering checkpoints non-reproducible and thus exclude + // them from equivalence test + cmp_actual_event + .new_checkpoint + .clone_from(&expected_event.new_checkpoint); + + if expected_event != cmp_actual_event { + tracing::warn!(%block_hash, ?expected_event, ?actual_event, "Data is not reproducible"); + + let err = VerifyTransformExecuteError::DataNotReproducible(DataNotReproducible { + block_hash, + expected_event: Box::new(expected_event.into()), + actual_event: Box::new(actual_event.into()), + }); + listener.transform_error(&err); + return Err(err); + } + + tracing::info!(%block_hash, "Block is valid"); + listener.end_block( + &block_hash, + step_index, + num_steps, + VerificationPhase::ReplayTransform, + ); + } + + listener.end_phase(VerificationPhase::ReplayTransform); + Ok(()) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/transform/transform_helpers.rs b/src/infra/core/src/transform/transform_helpers.rs new file mode 100644 index 0000000000..eccf4bd119 --- /dev/null +++ b/src/infra/core/src/transform/transform_helpers.rs @@ -0,0 +1,305 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use chrono::{DateTime, Utc}; +use internal_error::{ErrorIntoInternal, InternalError, ResultIntoInternal}; +use kamu_core::engine::TransformRequestInputExt; +use kamu_core::{ + BlockRef, + Dataset, + InputSchemaNotDefinedError, + InvalidInputIntervalError, + IterBlocksError, + MetadataChainExt, + ResolvedDataset, + ResolvedDatasetsMap, + SearchExecuteTransformVisitor, + SearchSetDataSchemaVisitor, + SearchSetTransformVisitor, + SearchSetVocabVisitor, + TransformElaborateError, + TransformNotDefinedError, + TransformPlanError, + TransformPreliminaryRequestExt, + VerifyTransformPlanError, +}; +use opendatafabric::{ + DatasetVocabulary, + ExecuteTransform, + ExecuteTransformInput, + IntoDataStreamBlock, + SetDataSchema, + TransformInput, + Watermark, +}; +use random_names::get_random_name; +use thiserror::Error; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tracing::instrument(level = "info", skip_all)] +pub async fn build_preliminary_request_ext( + target: ResolvedDataset, + system_time: DateTime, +) -> Result { + let output_chain = target.as_metadata_chain(); + + // TODO: externalize + let block_ref = BlockRef::Head; + let head = output_chain.resolve_ref(&block_ref).await.int_err()?; + + // TODO: PERF: Search for source, vocab, and data schema result in full scan + let (source, schema, set_vocab, prev_query) = { + // TODO: Support transform evolution + let mut set_transform_visitor = SearchSetTransformVisitor::new(); + let mut set_vocab_visitor = SearchSetVocabVisitor::new(); + let mut set_data_schema_visitor = SearchSetDataSchemaVisitor::new(); + let mut execute_transform_visitor = SearchExecuteTransformVisitor::new(); + + target + .as_metadata_chain() + .accept_by_hash( + &mut [ + &mut set_transform_visitor, + &mut set_vocab_visitor, + &mut set_data_schema_visitor, + &mut execute_transform_visitor, + ], + &head, + ) + .await + .int_err()?; + + ( + set_transform_visitor.into_event(), + set_data_schema_visitor + .into_event() + .as_ref() + .map(SetDataSchema::schema_as_arrow) + .transpose() // Option> -> Result, E> + .int_err()?, + set_vocab_visitor.into_event(), + execute_transform_visitor.into_event(), + ) + }; + + let Some(source) = source else { + return Err(TransformNotDefinedError {}.into()); + }; + tracing::debug!(?source, "Transforming using source"); + + // Prepare inputs + use itertools::Itertools; + let input_states: Vec<(TransformInput, Option)> = + if let Some(query) = &prev_query { + source + .inputs + .iter() + .cloned() + .zip_eq(query.query_inputs.iter().cloned().map(Some)) + .collect() + } else { + source.inputs.iter().map(|i| (i.clone(), None)).collect() + }; + + // Build preliminary transform request + Ok(TransformPreliminaryRequestExt { + operation_id: get_random_name(None, 10), + dataset_handle: target.get_handle().clone(), + block_ref, + head, + transform: source.transform, + system_time, + schema, + prev_offset: prev_query.as_ref().and_then(ExecuteTransform::last_offset), + vocab: set_vocab.unwrap_or_default().into(), + input_states, + prev_checkpoint: prev_query.and_then(|q| q.new_checkpoint.map(|c| c.physical_hash)), + }) +} + 
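+// How the phases are intended to fit together (illustrative sketch in comment
+// form; signatures simplified, listeners and error mapping elided; the local
+// names `planner`, `elaborator`, and `executor` are assumed bindings to the
+// respective services):
+//
+//     // Planning: scan the output chain and resolve all inputs
+//     let preliminary_plan = planner
+//         .build_transform_preliminary_plan(target.clone())
+//         .await?;
+//
+//     // Elaboration: decide whether there is new data or watermarks to process
+//     match elaborator
+//         .elaborate_transform(target.clone(), preliminary_plan, options, None)
+//         .await?
+//     {
+//         TransformElaboration::UpToDate => { /* nothing to do */ }
+//         // Execution: provision the engine, run the query, commit the new block
+//         TransformElaboration::Elaborated(plan) => {
+//             let (_target, result) = executor.execute_transform(target, plan, None).await;
+//             result?;
+//         }
+//     }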
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub(crate) async fn get_transform_input_from_query_input( + query_input: ExecuteTransformInput, + alias: String, + vocab_hint: Option, + datasets_map: &ResolvedDatasetsMap, +) -> Result { + let resolved_input = datasets_map.get_by_id(&query_input.dataset_id); + let input_chain = resolved_input.as_metadata_chain(); + + // Find schema + // TODO: Make single-pass via multi-visitor + let schema = resolved_input + .as_metadata_chain() + .accept_one(SearchSetDataSchemaVisitor::new()) + .await + .int_err()? + .into_event() + .map(|e| e.schema_as_arrow()) + .transpose() + .int_err()? + .ok_or_else(|| InputSchemaNotDefinedError { + dataset_handle: resolved_input.get_handle().clone(), + })?; + + // Collect unprocessed input blocks + use futures::TryStreamExt; + let blocks_unprocessed = if let Some(new_block_hash) = &query_input.new_block_hash { + input_chain + .iter_blocks_interval(new_block_hash, query_input.prev_block_hash.as_ref(), false) + .try_collect() + .await + .map_err(|chain_err| match chain_err { + IterBlocksError::InvalidInterval(err) => { + GetTransformInputError::InvalidInputInterval(InvalidInputIntervalError { + head: err.head, + tail: err.tail, + input_dataset_id: query_input.dataset_id, + }) + } + _ => GetTransformInputError::Internal(chain_err.int_err()), + })? + } else { + Vec::new() + }; + + let mut data_slices = Vec::new(); + let mut explicit_watermarks = Vec::new(); + for block in blocks_unprocessed + .iter() + .rev() + .filter_map(|(_, b)| b.as_data_stream_block()) + { + if let Some(slice) = block.event.new_data { + data_slices.push(slice.physical_hash.clone()); + } + + if let Some(wm) = block.event.new_watermark { + explicit_watermarks.push(Watermark { + system_time: *block.system_time, + event_time: *wm, + }); + } + } + + let vocab = match vocab_hint { + Some(v) => v, + None => get_vocab(resolved_input.as_ref()).await?, + }; + + let is_empty = data_slices.is_empty() && explicit_watermarks.is_empty(); + + let input = TransformRequestInputExt { + dataset_handle: resolved_input.get_handle().clone(), + alias, + vocab, + prev_block_hash: query_input.prev_block_hash, + new_block_hash: query_input.new_block_hash, + prev_offset: query_input.prev_offset, + new_offset: query_input.new_offset, + data_slices, + schema, + explicit_watermarks, + }; + + tracing::info!(?input, is_empty, "Computed transform input"); + + Ok(input) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// TODO: Avoid iterating through output chain multiple times +async fn get_vocab(dataset: &dyn Dataset) -> Result { + Ok(dataset + .as_metadata_chain() + .accept_one(SearchSetVocabVisitor::new()) + .await + .int_err()? 
+ .into_event() + .unwrap_or_default() + .into()) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub(crate) enum BuildPreliminaryTransformRequestError { + #[error(transparent)] + TransformNotDefined( + #[from] + #[backtrace] + TransformNotDefinedError, + ), + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +impl From for TransformPlanError { + fn from(value: BuildPreliminaryTransformRequestError) -> Self { + match value { + BuildPreliminaryTransformRequestError::TransformNotDefined(e) => { + Self::TransformNotDefined(e) + } + BuildPreliminaryTransformRequestError::Internal(e) => Self::Internal(e), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Error)] +pub(crate) enum GetTransformInputError { + #[error(transparent)] + InputSchemaNotDefined( + #[from] + #[backtrace] + InputSchemaNotDefinedError, + ), + #[error(transparent)] + InvalidInputInterval( + #[from] + #[backtrace] + InvalidInputIntervalError, + ), + #[error(transparent)] + Internal( + #[from] + #[backtrace] + InternalError, + ), +} + +impl From for TransformElaborateError { + fn from(value: GetTransformInputError) -> Self { + match value { + GetTransformInputError::InputSchemaNotDefined(e) => Self::InputSchemaNotDefined(e), + GetTransformInputError::InvalidInputInterval(e) => Self::InvalidInputInterval(e), + GetTransformInputError::Internal(e) => Self::Internal(e), + } + } +} + +impl From for VerifyTransformPlanError { + fn from(value: GetTransformInputError) -> Self { + match value { + GetTransformInputError::InputSchemaNotDefined(e) => Self::InputSchemaNotDefined(e), + GetTransformInputError::InvalidInputInterval(e) => Self::InvalidInputInterval(e), + GetTransformInputError::Internal(e) => Self::Internal(e), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/transform/transform_request_planner_impl.rs b/src/infra/core/src/transform/transform_request_planner_impl.rs new file mode 100644 index 0000000000..52f06bc605 --- /dev/null +++ b/src/infra/core/src/transform/transform_request_planner_impl.rs @@ -0,0 +1,327 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
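+// The planner walks the target's metadata chain to build a preliminary transform
+// request and pre-resolves every participating dataset (the target plus all
+// SetTransform inputs) into a `ResolvedDatasetsMap`, so that the elaboration and
+// execution phases can run without further `DatasetRegistry` lookups.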
+
+use std::collections::BTreeMap;
+use std::sync::Arc;
+
+use chrono::{DateTime, Utc};
+use dill::*;
+use engine::TransformRequestExt;
+use internal_error::{ErrorIntoInternal, InternalError, ResultIntoInternal};
+use kamu_core::*;
+use opendatafabric::{
+    AsTypedBlock,
+    DatasetVocabulary,
+    ExecuteTransform,
+    MetadataBlock,
+    MetadataBlockTyped,
+    MetadataEventTypeFlags,
+    Multihash,
+    SetDataSchema,
+    SetTransform,
+};
+use random_names::get_random_name;
+use time_source::SystemTimeSource;
+
+use super::build_preliminary_request_ext;
+use crate::get_transform_input_from_query_input;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+pub struct TransformRequestPlannerImpl {
+    dataset_registry: Arc<dyn DatasetRegistry>,
+    time_source: Arc<dyn SystemTimeSource>,
+}
+
+#[component(pub)]
+#[interface(dyn TransformRequestPlanner)]
+impl TransformRequestPlannerImpl {
+    pub fn new(
+        dataset_registry: Arc<dyn DatasetRegistry>,
+        time_source: Arc<dyn SystemTimeSource>,
+    ) -> Self {
+        Self {
+            dataset_registry,
+            time_source,
+        }
+    }
+
+    // TODO: PERF: Avoid multiple passes over metadata chain
+    #[tracing::instrument(level = "info", skip_all)]
+    async fn get_next_operation(
+        &self,
+        target: ResolvedDataset,
+        system_time: DateTime<Utc>,
+    ) -> Result<TransformPreliminaryPlan, TransformPlanError> {
+        // Build preliminary request
+        let preliminary_request =
+            build_preliminary_request_ext(target.clone(), system_time).await?;
+
+        // Pre-fill the datasets used in the operation
+        let mut datasets_map = ResolvedDatasetsMap::default();
+        datasets_map.register(target);
+        for (input_decl, _) in &preliminary_request.input_states {
+            let hdl = self
+                .dataset_registry
+                .resolve_dataset_handle_by_ref(&input_decl.dataset_ref)
+                .await
+                .int_err()?;
+            datasets_map
+                .register_with(&hdl, |hdl| self.dataset_registry.get_dataset_by_handle(hdl));
+        }
+
+        Ok(TransformPreliminaryPlan {
+            preliminary_request,
+            datasets_map,
+        })
+    }
+
+    // TODO: Avoid iterating through output chain multiple times
+    async fn get_vocab(&self, dataset: &dyn Dataset) -> Result<DatasetVocabulary, InternalError> {
+        Ok(dataset
+            .as_metadata_chain()
+            .accept_one(SearchSetVocabVisitor::new())
+            .await
+            .int_err()?
+            .into_event()
+            .unwrap_or_default()
+            .into())
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#[async_trait::async_trait]
+impl TransformRequestPlanner for TransformRequestPlannerImpl {
+    /// Returns an active transform, if any
+    #[tracing::instrument(level = "debug", skip_all, fields(target=%target.get_handle()))]
+    async fn get_active_transform(
+        &self,
+        target: ResolvedDataset,
+    ) -> Result<Option<(Multihash, MetadataBlockTyped<SetTransform>)>, InternalError> {
+        // TODO: Support transform evolution
+        Ok(target
+            .as_metadata_chain()
+            .accept_one(SearchSetTransformVisitor::new())
+            .await
+            .int_err()?
+ .into_hashed_block()) + } + + #[tracing::instrument(level = "info", skip_all, fields(target=%target.get_handle()))] + async fn build_transform_preliminary_plan( + &self, + target: ResolvedDataset, + ) -> Result { + // TODO: There might be more operations to do + self.get_next_operation(target.clone(), self.time_source.now()) + .await + } + + #[tracing::instrument(level = "info", skip_all, fields(target=%target.get_handle(), ?block_range))] + async fn build_transform_verification_plan( + &self, + target: ResolvedDataset, + block_range: (Option, Option), + ) -> Result { + let metadata_chain = target.as_metadata_chain(); + + let head = match block_range.1 { + None => metadata_chain.resolve_ref(&BlockRef::Head).await?, + Some(hash) => hash, + }; + let tail = block_range.0; + let tail_sequence_number = match tail.as_ref() { + Some(tail) => { + let block = metadata_chain.get_block(tail).await?; + + Some(block.sequence_number) + } + None => None, + }; + + let (source, set_vocab, schema, blocks, finished_range) = { + // TODO: Support dataset evolution + let mut set_transform_visitor = SearchSetTransformVisitor::new(); + let mut set_vocab_visitor = SearchSetVocabVisitor::new(); + let mut set_data_schema_visitor = SearchSetDataSchemaVisitor::new(); + + type Flag = MetadataEventTypeFlags; + type Decision = MetadataVisitorDecision; + + struct ExecuteTransformCollectorVisitor { + tail_sequence_number: Option, + blocks: Vec<(Multihash, MetadataBlock)>, + finished_range: bool, + } + + let mut execute_transform_collector_visitor = GenericCallbackVisitor::new( + ExecuteTransformCollectorVisitor { + tail_sequence_number, + blocks: Vec::new(), + finished_range: false, + }, + Decision::NextOfType(Flag::EXECUTE_TRANSFORM), + |state, hash, block| { + if Some(block.sequence_number) < state.tail_sequence_number { + state.finished_range = true; + + return Decision::Stop; + }; + + let block_flag = Flag::from(&block.event); + + if Flag::EXECUTE_TRANSFORM.contains(block_flag) { + state.blocks.push((hash.clone(), block.clone())); + }; + + if Some(block.sequence_number) == state.tail_sequence_number { + state.finished_range = true; + + Decision::Stop + } else { + Decision::NextOfType(Flag::EXECUTE_TRANSFORM) + } + }, + ); + + metadata_chain + .accept(&mut [ + &mut set_transform_visitor, + &mut set_vocab_visitor, + &mut set_data_schema_visitor, + &mut execute_transform_collector_visitor, + ]) + .await + .int_err()?; + + let ExecuteTransformCollectorVisitor { + blocks, + finished_range, + .. 
+ } = execute_transform_collector_visitor.into_state(); + + ( + set_transform_visitor.into_event(), + set_vocab_visitor.into_event(), + set_data_schema_visitor + .into_event() + .as_ref() + .map(SetDataSchema::schema_as_arrow) + .transpose() // Option> -> Result, E> + .int_err()?, + blocks, + finished_range, + ) + }; + + // Ensure start_block was found if specified + if tail.is_some() && !finished_range { + return Err(InvalidIntervalError { + head, + tail: tail.unwrap(), + } + .into()); + } + + let source = source.ok_or( + "Expected a derivative dataset but SetTransform block was not found".int_err(), + )?; + + // Fill table of working datasets + let mut datasets_map = ResolvedDatasetsMap::default(); + datasets_map.register(target.clone()); + for input in &source.inputs { + let hdl = self + .dataset_registry + .resolve_dataset_handle_by_ref(&input.dataset_ref) + .await + .int_err()?; + let resolved_input = self.dataset_registry.get_dataset_by_handle(&hdl); + datasets_map.register(resolved_input); + } + + // TODO: Replace maps with access by index, as ODF guarantees same order of + // inputs in ExecuteTransform as in SetTransform + use futures::{StreamExt, TryStreamExt}; + let dataset_vocabs: BTreeMap<_, _> = futures::stream::iter(&source.inputs) + .map(|input| input.dataset_ref.id().cloned().unwrap()) + .then(|input_id| async { + use futures::TryFutureExt; + let resolved_input = datasets_map.get_by_id(&input_id); + self.get_vocab(resolved_input.as_ref()) + .map_ok(|vocab| (input_id, vocab)) + .await + }) + .try_collect() + .await?; + + let input_aliases: BTreeMap<_, _> = source + .inputs + .iter() + .map(|i| { + ( + i.dataset_ref.id().cloned().unwrap(), + i.alias.clone().unwrap(), + ) + }) + .collect(); + + let mut steps = Vec::new(); + + for (block_hash, block) in blocks.into_iter().rev() { + let block_t = block.as_typed::().unwrap(); + + let inputs = futures::stream::iter(&block_t.event.query_inputs) + .then(|slice| { + let alias = input_aliases.get(&slice.dataset_id).unwrap(); + + let vocab = dataset_vocabs.get(&slice.dataset_id).cloned().unwrap(); + + get_transform_input_from_query_input( + slice.clone(), + alias.clone(), + Some(vocab), + &datasets_map, + ) + }) + .try_collect() + .await + .map_err(Into::::into)?; + + let step = VerifyTransformStep { + request: TransformRequestExt { + operation_id: get_random_name(None, 10), + dataset_handle: target.get_handle().clone(), + block_ref: BlockRef::Head, + head: block_t.prev_block_hash.unwrap().clone(), + transform: source.transform.clone(), + system_time: block.system_time, + schema: schema.clone(), + prev_offset: block_t.event.prev_offset, + inputs, + vocab: set_vocab.clone().unwrap_or_default().into(), + prev_checkpoint: block_t.event.prev_checkpoint.clone(), + }, + expected_block: block, + expected_hash: block_hash, + }; + + steps.push(step); + } + + Ok(VerifyTransformOperation { + steps, + datasets_map, + }) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/transform_service_impl.rs b/src/infra/core/src/transform_service_impl.rs deleted file mode 100644 index c5d758d146..0000000000 --- a/src/infra/core/src/transform_service_impl.rs +++ /dev/null @@ -1,959 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. 
-// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - -use std::collections::BTreeMap; -use std::sync::Arc; - -use chrono::{DateTime, Utc}; -use dill::*; -use futures::{StreamExt, TryFutureExt, TryStreamExt}; -use internal_error::{ErrorIntoInternal, InternalError, ResultIntoInternal}; -use itertools::Itertools; -use kamu_core::engine::*; -use kamu_core::*; -use kamu_ingest_datafusion::DataWriterDataFusion; -use opendatafabric::*; -use random_names::get_random_name; -use time_source::SystemTimeSource; - -pub struct TransformServiceImpl { - dataset_repo: Arc, - dataset_action_authorizer: Arc, - engine_provisioner: Arc, - time_source: Arc, - compaction_svc: Arc, -} - -#[component(pub)] -#[interface(dyn TransformService)] -impl TransformServiceImpl { - pub fn new( - dataset_repo: Arc, - dataset_action_authorizer: Arc, - engine_provisioner: Arc, - time_source: Arc, - compaction_svc: Arc, - ) -> Self { - Self { - dataset_repo, - dataset_action_authorizer, - engine_provisioner, - time_source, - compaction_svc, - } - } - - // Note: Can be called from multiple threads - #[tracing::instrument(level = "info", skip_all, fields(operation_id = %request.operation_id))] - async fn do_transform( - engine_provisioner: Arc, - request: TransformRequestExt, - commit_fn: CommitFn, - listener: Arc, - ) -> Result - where - CommitFn: FnOnce(TransformRequestExt, TransformResponseExt) -> Fut, - Fut: futures::Future>, - { - tracing::info!(?request, "Transform request"); - - listener.begin(); - - match Self::do_transform_inner(engine_provisioner, request, commit_fn, listener.clone()) - .await - { - Ok(res) => { - tracing::info!("Transform successful"); - listener.success(&res); - Ok(res) - } - Err(err) => { - tracing::error!(error = ?err, error_msg = %err, "Transform failed"); - listener.error(&err); - Err(err) - } - } - } - - // Note: Can be called from multiple threads - async fn do_transform_inner( - engine_provisioner: Arc, - request: TransformRequestExt, - commit_fn: CommitFn, - listener: Arc, - ) -> Result - where - CommitFn: FnOnce(TransformRequestExt, TransformResponseExt) -> Fut, - Fut: futures::Future>, - { - let engine = engine_provisioner - .provision_engine( - match request.transform { - Transform::Sql(ref sql) => &sql.engine, - }, - listener.clone().get_engine_provisioning_listener(), - ) - .await?; - - let response = engine.execute_transform(request.clone()).await?; - assert_eq!( - response.new_offset_interval.is_some(), - response.new_data.is_some() - ); - - commit_fn(request, response).await - } - - async fn commit_execute_transform( - dataset_repo: Arc, - request: TransformRequestExt, - response: TransformResponseExt, - ) -> Result { - let old_head = request.head.clone(); - let mut new_head = old_head.clone(); - - let dataset = dataset_repo.get_dataset_by_handle(&request.dataset_handle); - - if response.output_schema.is_none() { - tracing::warn!("Engine did not produce a schema. 
In future this will become an error."); - }; - - if let Some(prev_schema) = request.schema { - // Validate schema - if let Some(new_schema) = response.output_schema { - DataWriterDataFusion::validate_output_schema_equivalence(&prev_schema, &new_schema) - .int_err()?; - } - } else { - // Set schema upon first transform - if let Some(new_schema) = response.output_schema { - // TODO: make schema commit atomic with data - let commit_schema_result = dataset - .commit_event( - SetDataSchema::new(&new_schema).into(), - CommitOpts { - block_ref: &request.block_ref, - system_time: Some(request.system_time), - prev_block_hash: Some(Some(&new_head)), - check_object_refs: false, - update_block_ref: true, - }, - ) - .await?; - - new_head = commit_schema_result.new_head; - } - } - - let params = ExecuteTransformParams { - query_inputs: request.inputs.iter().map(|i| i.clone().into()).collect(), - prev_checkpoint: request.prev_checkpoint, - prev_offset: request.prev_offset, - new_offset_interval: response.new_offset_interval, - new_watermark: response.new_watermark, - }; - - match dataset - .commit_execute_transform( - params, - response.new_data, - response.new_checkpoint.map(CheckpointRef::New), - CommitOpts { - block_ref: &request.block_ref, - system_time: Some(request.system_time), - prev_block_hash: Some(Some(&new_head)), - check_object_refs: true, - update_block_ref: true, - }, - ) - .await - { - Ok(res) => { - new_head = res.new_head; - Ok(()) - } - Err(CommitError::MetadataAppendError(AppendError::InvalidBlock( - AppendValidationError::NoOpEvent(_), - ))) => Ok(()), - Err(err) => Err(err), - }?; - - assert_ne!( - old_head, new_head, - "Commit did not update neither schema nor data" - ); - - Ok(TransformResult::Updated { old_head, new_head }) - } - - // TODO: PERF: Avoid multiple passes over metadata chain - #[tracing::instrument(level = "info", skip_all)] - pub async fn get_next_operation( - &self, - dataset_handle: &DatasetHandle, - system_time: DateTime, - ) -> Result, TransformError> { - let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); - - let output_chain = dataset.as_metadata_chain(); - - // TODO: externalize - let block_ref = BlockRef::Head; - let head = output_chain.resolve_ref(&block_ref).await.int_err()?; - - // TODO: PERF: Search for source, vocab, and data schema result in full scan - let (source, schema, set_vocab, prev_query) = { - // TODO: Support transform evolution - let mut set_transform_visitor = SearchSetTransformVisitor::new(); - let mut set_vocab_visitor = SearchSetVocabVisitor::new(); - let mut set_data_schema_visitor = SearchSetDataSchemaVisitor::new(); - let mut execute_transform_visitor = SearchExecuteTransformVisitor::new(); - - dataset - .as_metadata_chain() - .accept_by_hash( - &mut [ - &mut set_transform_visitor, - &mut set_vocab_visitor, - &mut set_data_schema_visitor, - &mut execute_transform_visitor, - ], - &head, - ) - .await - .int_err()?; - - ( - set_transform_visitor.into_event(), - set_data_schema_visitor - .into_event() - .as_ref() - .map(SetDataSchema::schema_as_arrow) - .transpose() // Option> -> Result, E> - .int_err()?, - set_vocab_visitor.into_event(), - execute_transform_visitor.into_event(), - ) - }; - - let Some(source) = source else { - return Err(TransformNotDefinedError {}.into()); - }; - tracing::debug!(?source, "Transforming using source"); - - // Prepare inputs - let input_states: Vec<(&TransformInput, Option<&ExecuteTransformInput>)> = - if let Some(query) = &prev_query { - source - .inputs - .iter() - 
.zip_eq(query.query_inputs.iter().map(Some)) - .collect() - } else { - source.inputs.iter().map(|i| (i, None)).collect() - }; - - let inputs: Vec<_> = futures::stream::iter(input_states) - .then(|(input_decl, input_state)| self.get_transform_input(input_decl, input_state)) - .try_collect() - .await?; - - // Nothing to do? - // Note that we're considering a schema here, as even if there is no data to - // process we would like to run the transform to establish the schema of the - // output. - // - // TODO: Detect the situation where inputs only had source updates and skip - // running the engine - if inputs - .iter() - .all(|i| i.data_slices.is_empty() && i.explicit_watermarks.is_empty()) - && schema.is_some() - { - return Ok(None); - } - - Ok(Some(TransformRequestExt { - operation_id: get_random_name(None, 10), - dataset_handle: dataset_handle.clone(), - block_ref, - head, - transform: source.transform, - system_time, - schema, - prev_offset: prev_query.as_ref().and_then(ExecuteTransform::last_offset), - vocab: set_vocab.unwrap_or_default().into(), - inputs, - prev_checkpoint: prev_query.and_then(|q| q.new_checkpoint.map(|c| c.physical_hash)), - })) - } - - async fn get_transform_input( - &self, - input_decl: &TransformInput, - input_state: Option<&ExecuteTransformInput>, - ) -> Result { - let dataset_id = input_decl.dataset_ref.id().unwrap(); - if let Some(input_state) = input_state { - assert_eq!(*dataset_id, input_state.dataset_id); - } - - let dataset_handle = self - .dataset_repo - .resolve_dataset_ref(&dataset_id.as_local_ref()) - .await - .int_err()?; - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); - let input_chain = dataset.as_metadata_chain(); - - // Determine last processed input block and offset - let last_processed_block = input_state.and_then(|i| i.last_block_hash()); - let last_processed_offset = input_state.and_then(ExecuteTransformInput::last_offset); - - // Determine unprocessed block and offset range - let last_unprocessed_block = input_chain.resolve_ref(&BlockRef::Head).await.int_err()?; - let last_unprocessed_offset = input_chain - .accept_one_by_hash( - &last_unprocessed_block, - SearchSingleDataBlockVisitor::next(), - ) - .await - .int_err()? 
- .into_event() - .and_then(|event| event.last_offset()) - .or(last_processed_offset); - - let query_input = ExecuteTransformInput { - dataset_id: dataset_id.clone(), - prev_block_hash: last_processed_block.cloned(), - new_block_hash: if Some(&last_unprocessed_block) != last_processed_block { - Some(last_unprocessed_block) - } else { - None - }, - prev_offset: last_processed_offset, - new_offset: if last_unprocessed_offset != last_processed_offset { - last_unprocessed_offset - } else { - None - }, - }; - - self.get_transform_input_from_query_input( - query_input, - input_decl.alias.clone().unwrap(), - None, - ) - .await - } - - async fn get_transform_input_from_query_input( - &self, - query_input: ExecuteTransformInput, - alias: String, - vocab_hint: Option, - ) -> Result { - let dataset_handle = self - .dataset_repo - .resolve_dataset_ref(&query_input.dataset_id.as_local_ref()) - .await - .int_err()?; - - self.dataset_action_authorizer - .check_action_allowed(&dataset_handle, auth::DatasetAction::Read) - .await?; - - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); - let input_chain = dataset.as_metadata_chain(); - - // Find schema - // TODO: Make single-pass via multi-visitor - let schema = dataset - .as_metadata_chain() - .accept_one(SearchSetDataSchemaVisitor::new()) - .await - .int_err()? - .into_event() - .map(|e| e.schema_as_arrow()) - .transpose() - .int_err()? - .ok_or_else(|| InputSchemaNotDefinedError { - dataset_handle: dataset_handle.clone(), - })?; - - // Collect unprocessed input blocks - let blocks_unprocessed = if let Some(new_block_hash) = &query_input.new_block_hash { - input_chain - .iter_blocks_interval(new_block_hash, query_input.prev_block_hash.as_ref(), false) - .try_collect() - .await - .map_err(|chain_err| match chain_err { - IterBlocksError::InvalidInterval(err) => { - TransformError::InvalidInputInterval(InvalidInputIntervalError { - head: err.head, - tail: err.tail, - input_dataset_id: dataset_handle.id.clone(), - }) - } - _ => TransformError::Internal(chain_err.int_err()), - })? - } else { - Vec::new() - }; - - let mut data_slices = Vec::new(); - let mut explicit_watermarks = Vec::new(); - for block in blocks_unprocessed - .iter() - .rev() - .filter_map(|(_, b)| b.as_data_stream_block()) - { - if let Some(slice) = block.event.new_data { - data_slices.push(slice.physical_hash.clone()); - } - - if let Some(wm) = block.event.new_watermark { - explicit_watermarks.push(Watermark { - system_time: *block.system_time, - event_time: *wm, - }); - } - } - - let vocab = match vocab_hint { - Some(v) => v, - None => self.get_vocab(&dataset_handle.as_local_ref()).await?, - }; - - let is_empty = data_slices.is_empty() && explicit_watermarks.is_empty(); - - let input = TransformRequestInputExt { - dataset_handle, - alias, - vocab, - prev_block_hash: query_input.prev_block_hash, - new_block_hash: query_input.new_block_hash, - prev_offset: query_input.prev_offset, - new_offset: query_input.new_offset, - data_slices, - schema, - explicit_watermarks, - }; - - tracing::info!(?input, is_empty, "Computed transform input"); - - Ok(input) - } - - // TODO: Avoid iterating through output chain multiple times - async fn get_vocab( - &self, - dataset_ref: &DatasetRef, - ) -> Result { - let dataset = self - .dataset_repo - .find_dataset_by_ref(dataset_ref) - .await - .int_err()?; - - Ok(dataset - .as_metadata_chain() - .accept_one(SearchSetVocabVisitor::new()) - .await - .int_err()? 
- .into_event() - .unwrap_or_default() - .into()) - } - - // TODO: Improve error handling - // Need an inconsistent metadata error? - #[tracing::instrument(level = "info", skip_all)] - pub async fn get_verification_plan( - &self, - dataset_handle: &DatasetHandle, - block_range: (Option, Option), - ) -> Result, VerificationError> { - let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); - let metadata_chain = dataset.as_metadata_chain(); - - let head = match block_range.1 { - None => metadata_chain.resolve_ref(&BlockRef::Head).await?, - Some(hash) => hash, - }; - let tail = block_range.0; - let tail_sequence_number = match tail.as_ref() { - Some(tail) => { - let block = metadata_chain.get_block(tail).await?; - - Some(block.sequence_number) - } - None => None, - }; - - let (source, set_vocab, schema, blocks, finished_range) = { - // TODO: Support dataset evolution - let mut set_transform_visitor = SearchSetTransformVisitor::new(); - let mut set_vocab_visitor = SearchSetVocabVisitor::new(); - let mut set_data_schema_visitor = SearchSetDataSchemaVisitor::new(); - - type Flag = MetadataEventTypeFlags; - type Decision = MetadataVisitorDecision; - - struct ExecuteTransformCollectorVisitor { - tail_sequence_number: Option, - blocks: Vec<(Multihash, MetadataBlock)>, - finished_range: bool, - } - - let mut execute_transform_collector_visitor = GenericCallbackVisitor::new( - ExecuteTransformCollectorVisitor { - tail_sequence_number, - blocks: Vec::new(), - finished_range: false, - }, - Decision::NextOfType(Flag::EXECUTE_TRANSFORM), - |state, hash, block| { - if Some(block.sequence_number) < state.tail_sequence_number { - state.finished_range = true; - - return Decision::Stop; - }; - - let block_flag = Flag::from(&block.event); - - if Flag::EXECUTE_TRANSFORM.contains(block_flag) { - state.blocks.push((hash.clone(), block.clone())); - }; - - if Some(block.sequence_number) == state.tail_sequence_number { - state.finished_range = true; - - Decision::Stop - } else { - Decision::NextOfType(Flag::EXECUTE_TRANSFORM) - } - }, - ); - - metadata_chain - .accept(&mut [ - &mut set_transform_visitor, - &mut set_vocab_visitor, - &mut set_data_schema_visitor, - &mut execute_transform_collector_visitor, - ]) - .await - .int_err()?; - - let ExecuteTransformCollectorVisitor { - blocks, - finished_range, - .. 
- } = execute_transform_collector_visitor.into_state(); - - ( - set_transform_visitor.into_event(), - set_vocab_visitor.into_event(), - set_data_schema_visitor - .into_event() - .as_ref() - .map(SetDataSchema::schema_as_arrow) - .transpose() // Option> -> Result, E> - .int_err()?, - blocks, - finished_range, - ) - }; - - // Ensure start_block was found if specified - if tail.is_some() && !finished_range { - return Err(InvalidIntervalError { - head, - tail: tail.unwrap(), - } - .into()); - } - - let source = source.ok_or( - "Expected a derivative dataset but SetTransform block was not found".int_err(), - )?; - - // TODO: Replace maps with access by index, as ODF guarantees same order of - // inputs in ExecuteTransform as in SetTransform - let dataset_vocabs: BTreeMap<_, _> = futures::stream::iter(&source.inputs) - .map(|input| { - ( - input.dataset_ref.id().cloned().unwrap(), - input.dataset_ref.id().unwrap().as_local_ref(), - ) - }) - .then(|(input_id, input_ref)| async move { - self.get_vocab(&input_ref) - .map_ok(|vocab| (input_id, vocab)) - .await - }) - .try_collect() - .await?; - - let input_aliases: BTreeMap<_, _> = source - .inputs - .iter() - .map(|i| { - ( - i.dataset_ref.id().cloned().unwrap(), - i.alias.clone().unwrap(), - ) - }) - .collect(); - - let mut plan = Vec::new(); - - for (block_hash, block) in blocks.into_iter().rev() { - let block_t = block.as_typed::().unwrap(); - - let inputs = futures::stream::iter(&block_t.event.query_inputs) - .then(|slice| { - let alias = input_aliases.get(&slice.dataset_id).unwrap(); - - let vocab = dataset_vocabs.get(&slice.dataset_id).cloned().unwrap(); - - self.get_transform_input_from_query_input( - slice.clone(), - alias.clone(), - Some(vocab), - ) - }) - .try_collect() - .await - .map_err(|e| match e { - TransformError::Access(e) => VerificationError::Access(e), - TransformError::Internal(e) => VerificationError::Internal(e), - _ => VerificationError::Internal(e.int_err()), - })?; - - let step = VerificationStep { - request: TransformRequestExt { - operation_id: get_random_name(None, 10), - dataset_handle: dataset_handle.clone(), - block_ref: BlockRef::Head, - head: block_t.prev_block_hash.unwrap().clone(), - transform: source.transform.clone(), - system_time: block.system_time, - schema: schema.clone(), - prev_offset: block_t.event.prev_offset, - inputs, - vocab: set_vocab.clone().unwrap_or_default().into(), - prev_checkpoint: block_t.event.prev_checkpoint.clone(), - }, - expected_block: block, - expected_hash: block_hash, - }; - - plan.push(step); - } - - Ok(plan) - } - - #[async_recursion::async_recursion] - #[tracing::instrument(level = "info", name = "transform", skip_all, fields(%dataset_ref))] - async fn transform_impl( - &self, - dataset_ref: DatasetRef, - options: TransformOptions, - maybe_listener: Option>, - ) -> Result { - let listener = maybe_listener.unwrap_or_else(|| Arc::new(NullTransformListener)); - let dataset_handle = self.dataset_repo.resolve_dataset_ref(&dataset_ref).await?; - - self.dataset_action_authorizer - .check_action_allowed(&dataset_handle, auth::DatasetAction::Write) - .await?; - - // TODO: There might be more operations to do - match self - .get_next_operation(&dataset_handle, self.time_source.now()) - .await - { - Ok(Some(operation)) => { - let dataset_repo = self.dataset_repo.clone(); - Self::do_transform( - self.engine_provisioner.clone(), - operation, - |request, response| async move { - Self::commit_execute_transform(dataset_repo, request, response).await - }, - listener, - ) - .await - } - Ok(None) 
=> { - listener.begin(); - listener.success(&TransformResult::UpToDate); - Ok(TransformResult::UpToDate) - } - // TODO: Trapping the error to preserve old behavior - we should consider - // surfacing it and handling on upper layers - Err(TransformError::InputSchemaNotDefined(e)) => { - tracing::info!( - input = %e.dataset_handle, - "Not processing because one of the inputs was never pulled", - ); - listener.begin(); - listener.success(&TransformResult::UpToDate); - Ok(TransformResult::UpToDate) - } - Err(err @ TransformError::InvalidInputInterval(_)) - if options.reset_derivatives_on_diverged_input => - { - tracing::warn!( - error = %err, - "Interval error detected - resetting on diverged input", - ); - - let compaction_result = self - .compaction_svc - .compact_dataset( - &dataset_handle, - CompactionOptions { - keep_metadata_only: true, - ..Default::default() - }, - None, - ) - .await - .int_err()?; - - if let CompactionResult::Success { .. } = compaction_result { - // Recursing to try again after compaction - self.transform_impl( - dataset_ref.clone(), - TransformOptions { - reset_derivatives_on_diverged_input: false, - }, - Some(listener), - ) - .await - } else { - Err(err) - } - } - Err(e) => Err(e), - } - } -} - -#[async_trait::async_trait] -impl TransformService for TransformServiceImpl { - #[tracing::instrument(level = "info", skip_all, fields(%dataset_ref))] - async fn get_active_transform( - &self, - dataset_ref: &DatasetRef, - ) -> Result)>, GetDatasetError> { - let dataset = self.dataset_repo.find_dataset_by_ref(dataset_ref).await?; - - // TODO: Support transform evolution - Ok(dataset - .as_metadata_chain() - .accept_one(SearchSetTransformVisitor::new()) - .await - .int_err()? - .into_hashed_block()) - } - - async fn transform( - &self, - dataset_ref: &DatasetRef, - options: TransformOptions, - maybe_listener: Option>, - ) -> Result { - tracing::info!(?dataset_ref, "Transforming a single dataset"); - - self.transform_impl(dataset_ref.clone(), options, maybe_listener) - .await - } - - async fn transform_multi( - &self, - dataset_refs: Vec, - options: TransformOptions, - maybe_multi_listener: Option>, - ) -> Vec<(DatasetRef, Result)> { - let multi_listener = - maybe_multi_listener.unwrap_or_else(|| Arc::new(NullTransformMultiListener)); - - tracing::info!(?dataset_refs, "Transforming multiple datasets"); - - let mut futures = Vec::new(); - - for dataset_ref in &dataset_refs { - let f = match self.dataset_repo.resolve_dataset_ref(dataset_ref).await { - Ok(hdl) => { - let maybe_listener = multi_listener.begin_transform(&hdl); - self.transform_impl(hdl.into(), options, maybe_listener) - } - // Relying on this call to fail to avoid boxing the futures - Err(_) => self.transform_impl(dataset_ref.clone(), options, None), - }; - futures.push(f); - } - - let results = futures::future::join_all(futures).await; - dataset_refs.into_iter().zip(results).collect() - } - - #[tracing::instrument(level = "info", skip_all, fields(%dataset_ref, ?block_range))] - async fn verify_transform( - &self, - dataset_ref: &DatasetRef, - block_range: (Option, Option), - maybe_listener: Option>, - ) -> Result<(), VerificationError> { - let listener = maybe_listener.unwrap_or(Arc::new(NullVerificationListener {})); - - let dataset_handle = self.dataset_repo.resolve_dataset_ref(dataset_ref).await?; - - // Note: output dataset read permissions are already checked in - // VerificationService. 
But permissions for input datasets have to be - // checked here - - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); - - let verification_plan = self - .get_verification_plan(&dataset_handle, block_range) - .await?; - let num_steps = verification_plan.len(); - listener.begin_phase(VerificationPhase::ReplayTransform); - - for (step_index, step) in verification_plan.into_iter().enumerate() { - let request = step.request; - let block_hash = step.expected_hash; - let expected_block = step.expected_block; - let expected_event = expected_block - .event - .into_variant::() - .unwrap(); - - // Will be set during "commit" step - let mut actual_event: Option = None; - - tracing::info!( - %block_hash, - "Replaying block" - ); - - listener.begin_block( - &block_hash, - step_index, - num_steps, - VerificationPhase::ReplayTransform, - ); - - let transform_listener = listener - .clone() - .get_transform_listener() - .unwrap_or_else(|| Arc::new(NullTransformListener)); - - let ds = dataset.clone(); - let out_event = &mut actual_event; - - let result = TransformResult::Updated { - old_head: expected_block.prev_block_hash.clone().unwrap(), - new_head: block_hash.clone(), - }; - - Self::do_transform( - self.engine_provisioner.clone(), - request, - |request, response| async move { - let params = ExecuteTransformParams { - query_inputs: request.inputs.iter().map(|i| i.clone().into()).collect(), - prev_checkpoint: request.prev_checkpoint, - prev_offset: request.prev_offset, - new_offset_interval: response.new_offset_interval, - new_watermark: response.new_watermark, - }; - - // We expect outputs to be cleaned up automatically on drop - let new_event = ds - .prepare_execute_transform( - params, - response.new_data.as_ref(), - response.new_checkpoint.map(CheckpointRef::New).as_ref(), - ) - .await?; - - *out_event = Some(new_event); - - // This result is ignored - Ok(result) - }, - transform_listener, - ) - .await?; - - let actual_event = actual_event.unwrap(); - - tracing::debug!(%block_hash, ?expected_event, ?actual_event, "Comparing expected and replayed events"); - - let mut cmp_actual_event = actual_event.clone(); - - // Parquet format is non-reproducible, so we rely only on logical hash for - // equivalence test and overwrite the physical hash and size with - // the expected values for comparison - if let Some(actual_slice) = &mut cmp_actual_event.new_data { - if let Some(expected_slice) = &expected_event.new_data { - actual_slice.physical_hash = expected_slice.physical_hash.clone(); - actual_slice.size = expected_slice.size; - } - } - - // Currently we're considering checkpoints non-reproducible and thus exclude - // them from equivalence test - cmp_actual_event - .new_checkpoint - .clone_from(&expected_event.new_checkpoint); - - if expected_event != cmp_actual_event { - tracing::warn!(%block_hash, ?expected_event, ?actual_event, "Data is not reproducible"); - - let err = VerificationError::DataNotReproducible(DataNotReproducible { - block_hash, - expected_event: Box::new(expected_event.into()), - actual_event: Box::new(actual_event.into()), - }); - listener.error(&err); - return Err(err); - } - - tracing::info!(%block_hash, "Block is valid"); - listener.end_block( - &block_hash, - step_index, - num_steps, - VerificationPhase::ReplayTransform, - ); - } - - listener.end_phase(VerificationPhase::ReplayTransform); - Ok(()) - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug)] -pub struct 
VerificationStep { - pub request: TransformRequestExt, - pub expected_block: MetadataBlock, - pub expected_hash: Multihash, -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/use_cases/append_dataset_metadata_batch_use_case_impl.rs b/src/infra/core/src/use_cases/append_dataset_metadata_batch_use_case_impl.rs index d5f1bd60d0..c7fd062670 100644 --- a/src/infra/core/src/use_cases/append_dataset_metadata_batch_use_case_impl.rs +++ b/src/infra/core/src/use_cases/append_dataset_metadata_batch_use_case_impl.rs @@ -42,6 +42,12 @@ impl AppendDatasetMetadataBatchUseCaseImpl { #[async_trait::async_trait] impl AppendDatasetMetadataBatchUseCase for AppendDatasetMetadataBatchUseCaseImpl { + #[tracing::instrument( + level = "info", + name = "AppendDatasetMetadataBatchUseCase::execute", + skip_all, + fields(dataset_handle, ?new_blocks, force_update_if_diverged) + )] async fn execute( &self, dataset: &dyn Dataset, diff --git a/src/infra/core/src/use_cases/commit_dataset_event_use_case_impl.rs b/src/infra/core/src/use_cases/commit_dataset_event_use_case_impl.rs index d23f3b9f7d..5c4478cb62 100644 --- a/src/infra/core/src/use_cases/commit_dataset_event_use_case_impl.rs +++ b/src/infra/core/src/use_cases/commit_dataset_event_use_case_impl.rs @@ -17,7 +17,7 @@ use kamu_core::{ CommitOpts, CommitResult, DatasetLifecycleMessage, - DatasetRepository, + DatasetRegistry, MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, }; use messaging_outbox::{Outbox, OutboxExt}; @@ -28,19 +28,19 @@ use opendatafabric::{DatasetHandle, MetadataEvent}; #[component(pub)] #[interface(dyn CommitDatasetEventUseCase)] pub struct CommitDatasetEventUseCaseImpl { - dataset_repo: Arc, + dataset_registry: Arc, dataset_action_authorizer: Arc, outbox: Arc, } impl CommitDatasetEventUseCaseImpl { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, dataset_action_authorizer: Arc, outbox: Arc, ) -> Self { Self { - dataset_repo, + dataset_registry, dataset_action_authorizer, outbox, } @@ -49,6 +49,12 @@ impl CommitDatasetEventUseCaseImpl { #[async_trait::async_trait] impl CommitDatasetEventUseCase for CommitDatasetEventUseCaseImpl { + #[tracing::instrument( + level = "info", + name = "CommitDatasetEventUseCase::execute", + skip_all, + fields(dataset_handle, ?event, ?opts) + )] async fn execute( &self, dataset_handle: &DatasetHandle, @@ -59,9 +65,9 @@ impl CommitDatasetEventUseCase for CommitDatasetEventUseCaseImpl { .check_action_allowed(dataset_handle, DatasetAction::Write) .await?; - let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); + let resolved_dataset = self.dataset_registry.get_dataset_by_handle(dataset_handle); - let commit_result = dataset.commit_event(event, opts).await?; + let commit_result = resolved_dataset.commit_event(event, opts).await?; if !commit_result.new_upstream_ids.is_empty() { self.outbox diff --git a/src/infra/core/src/use_cases/compact_dataset_use_case_impl.rs b/src/infra/core/src/use_cases/compact_dataset_use_case_impl.rs new file mode 100644 index 0000000000..ec7e8292ba --- /dev/null +++ b/src/infra/core/src/use_cases/compact_dataset_use_case_impl.rs @@ -0,0 +1,111 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use dill::{component, interface}; +use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer}; +use kamu_core::{ + CompactDatasetUseCase, + CompactionError, + CompactionListener, + CompactionMultiListener, + CompactionOptions, + CompactionResponse, + CompactionResult, + CompactionService, + DatasetRegistry, + NullCompactionMultiListener, +}; +use opendatafabric::DatasetHandle; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn CompactDatasetUseCase)] +pub struct CompactDatasetUseCaseImpl { + compaction_service: Arc, + dataset_registry: Arc, + dataset_action_authorizer: Arc, +} + +impl CompactDatasetUseCaseImpl { + pub fn new( + compaction_service: Arc, + dataset_registry: Arc, + dataset_action_authorizer: Arc, + ) -> Self { + Self { + compaction_service, + dataset_registry, + dataset_action_authorizer, + } + } +} + +#[async_trait::async_trait] +impl CompactDatasetUseCase for CompactDatasetUseCaseImpl { + #[tracing::instrument( + level = "info", + name = "CompactDatasetUseCase::execute", + skip_all, + fields(dataset_handle, ?options) + )] + async fn execute( + &self, + dataset_handle: &DatasetHandle, + options: CompactionOptions, + maybe_listener: Option>, + ) -> Result { + // Permission check + self.dataset_action_authorizer + .check_action_allowed(dataset_handle, DatasetAction::Write) + .await?; + + // Resolve dataset + let target = self.dataset_registry.get_dataset_by_handle(dataset_handle); + + // Actual action + self.compaction_service + .compact_dataset(target, options, maybe_listener) + .await + } + + #[tracing::instrument( + level = "info", + name = "CompactDatasetUseCase::execute_multi", + skip_all, + fields(?dataset_handles, ?options) + )] + async fn execute_multi( + &self, + dataset_handles: Vec, + options: CompactionOptions, + multi_listener: Option>, + ) -> Vec { + let listener = multi_listener.unwrap_or(Arc::new(NullCompactionMultiListener {})); + + let mut result = vec![]; + for dataset_handle in &dataset_handles { + result.push(CompactionResponse { + dataset_ref: dataset_handle.as_local_ref(), + result: self + .execute( + dataset_handle, + options.clone(), + listener.begin_compact(dataset_handle), + ) + .await, + }); + } + result + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/use_cases/create_dataset_from_snapshot_use_case_impl.rs b/src/infra/core/src/use_cases/create_dataset_from_snapshot_use_case_impl.rs index 0d5e36216a..9708eed327 100644 --- a/src/infra/core/src/use_cases/create_dataset_from_snapshot_use_case_impl.rs +++ b/src/infra/core/src/use_cases/create_dataset_from_snapshot_use_case_impl.rs @@ -51,6 +51,7 @@ impl CreateDatasetFromSnapshotUseCaseImpl { #[async_trait::async_trait] impl CreateDatasetFromSnapshotUseCase for CreateDatasetFromSnapshotUseCaseImpl { + #[tracing::instrument(level = "info", skip_all, fields(?snapshot, ?options))] async fn execute( &self, snapshot: DatasetSnapshot, diff --git a/src/infra/core/src/use_cases/create_dataset_use_case_impl.rs b/src/infra/core/src/use_cases/create_dataset_use_case_impl.rs index 82efc19d33..691086a43f 100644 --- a/src/infra/core/src/use_cases/create_dataset_use_case_impl.rs +++ 
b/src/infra/core/src/use_cases/create_dataset_use_case_impl.rs @@ -50,6 +50,7 @@ impl CreateDatasetUseCaseImpl { #[async_trait::async_trait] impl CreateDatasetUseCase for CreateDatasetUseCaseImpl { + #[tracing::instrument(level = "info", skip_all, fields(dataset_alias, ?seed_block, ?options))] async fn execute( &self, dataset_alias: &DatasetAlias, diff --git a/src/infra/core/src/use_cases/delete_dataset_use_case_impl.rs b/src/infra/core/src/use_cases/delete_dataset_use_case_impl.rs index 9bdbfad506..b46cff9d49 100644 --- a/src/infra/core/src/use_cases/delete_dataset_use_case_impl.rs +++ b/src/infra/core/src/use_cases/delete_dataset_use_case_impl.rs @@ -15,7 +15,7 @@ use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer}; use kamu_core::{ DanglingReferenceError, DatasetLifecycleMessage, - DatasetRepository, + DatasetRegistry, DeleteDatasetError, DeleteDatasetUseCase, DependencyGraphService, @@ -32,7 +32,7 @@ use crate::DatasetRepositoryWriter; #[component(pub)] #[interface(dyn DeleteDatasetUseCase)] pub struct DeleteDatasetUseCaseImpl { - dataset_repo: Arc, + dataset_registry: Arc, dataset_repo_writer: Arc, dataset_action_authorizer: Arc, dependency_graph_service: Arc, @@ -41,14 +41,14 @@ pub struct DeleteDatasetUseCaseImpl { impl DeleteDatasetUseCaseImpl { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, dataset_repo_writer: Arc, dataset_action_authorizer: Arc, dependency_graph_service: Arc, outbox: Arc, ) -> Self { Self { - dataset_repo, + dataset_registry, dataset_repo_writer, dataset_action_authorizer, dependency_graph_service, @@ -73,8 +73,8 @@ impl DeleteDatasetUseCaseImpl { let mut children = Vec::with_capacity(downstream_dataset_ids.len()); for downstream_dataset_id in downstream_dataset_ids { let hdl = self - .dataset_repo - .resolve_dataset_ref(&downstream_dataset_id.as_local_ref()) + .dataset_registry + .resolve_dataset_handle_by_ref(&downstream_dataset_id.as_local_ref()) .await .int_err()?; children.push(hdl); @@ -93,8 +93,18 @@ impl DeleteDatasetUseCaseImpl { #[async_trait::async_trait] impl DeleteDatasetUseCase for DeleteDatasetUseCaseImpl { + #[tracing::instrument( + level = "info", + name = "DeleteDatasetUseCase::execute_via_ref", + skip_all, + fields(dataset_ref) + )] async fn execute_via_ref(&self, dataset_ref: &DatasetRef) -> Result<(), DeleteDatasetError> { - let dataset_handle = match self.dataset_repo.resolve_dataset_ref(dataset_ref).await { + let dataset_handle = match self + .dataset_registry + .resolve_dataset_handle_by_ref(dataset_ref) + .await + { Ok(h) => Ok(h), Err(GetDatasetError::NotFound(e)) => Err(DeleteDatasetError::NotFound(e)), Err(GetDatasetError::Internal(e)) => Err(DeleteDatasetError::Internal(e)), @@ -103,6 +113,12 @@ impl DeleteDatasetUseCase for DeleteDatasetUseCaseImpl { self.execute_via_handle(&dataset_handle).await } + #[tracing::instrument( + level = "info", + name = "DeleteDatasetUseCase::execute_via_handle", + skip_all, + fields(dataset_handle) + )] async fn execute_via_handle( &self, dataset_handle: &DatasetHandle, diff --git a/src/infra/core/src/use_cases/mod.rs b/src/infra/core/src/use_cases/mod.rs index 5a46a4d4b4..c9a04aecb4 100644 --- a/src/infra/core/src/use_cases/mod.rs +++ b/src/infra/core/src/use_cases/mod.rs @@ -9,14 +9,26 @@ mod append_dataset_metadata_batch_use_case_impl; mod commit_dataset_event_use_case_impl; +mod compact_dataset_use_case_impl; mod create_dataset_from_snapshot_use_case_impl; mod create_dataset_use_case_impl; mod delete_dataset_use_case_impl; +mod pull_dataset_use_case_impl; +mod 
push_dataset_use_case_impl; mod rename_dataset_use_case_impl; +mod reset_dataset_use_case_impl; +mod set_watermark_use_case_impl; +mod verify_dataset_use_case_impl; pub use append_dataset_metadata_batch_use_case_impl::*; pub use commit_dataset_event_use_case_impl::*; +pub use compact_dataset_use_case_impl::*; pub use create_dataset_from_snapshot_use_case_impl::*; pub use create_dataset_use_case_impl::*; pub use delete_dataset_use_case_impl::*; +pub use pull_dataset_use_case_impl::*; +pub use push_dataset_use_case_impl::*; pub use rename_dataset_use_case_impl::*; +pub use reset_dataset_use_case_impl::*; +pub use set_watermark_use_case_impl::*; +pub use verify_dataset_use_case_impl::*; diff --git a/src/infra/core/src/use_cases/pull_dataset_use_case_impl.rs b/src/infra/core/src/use_cases/pull_dataset_use_case_impl.rs new file mode 100644 index 0000000000..477ca29091 --- /dev/null +++ b/src/infra/core/src/use_cases/pull_dataset_use_case_impl.rs @@ -0,0 +1,599 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::collections::HashMap; +use std::sync::Arc; + +use dill::*; +use internal_error::{InternalError, ResultIntoInternal}; +use kamu_core::auth::{ + ClassifyByAllowanceResponse, + DatasetAction, + DatasetActionAuthorizer, + DatasetActionUnauthorizedError, +}; +use kamu_core::*; +use opendatafabric::{DatasetHandle, DatasetRefAny}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn PullDatasetUseCase)] +pub struct PullDatasetUseCaseImpl { + dataset_registry: Arc, + pull_request_planner: Arc, + dataset_action_authorizer: Arc, + remote_alias_registry: Arc, + polling_ingest_svc: Arc, + transform_elaboration_svc: Arc, + transform_execution_svc: Arc, + sync_svc: Arc, + tenancy_config: Arc, +} + +impl PullDatasetUseCaseImpl { + pub fn new( + dataset_registry: Arc, + pull_request_planner: Arc, + dataset_action_authorizer: Arc, + remote_alias_registry: Arc, + polling_ingest_svc: Arc, + transform_elaboration_svc: Arc, + transform_execution_svc: Arc, + sync_svc: Arc, + tenancy_config: Arc, + ) -> Self { + Self { + dataset_registry, + pull_request_planner, + dataset_action_authorizer, + remote_alias_registry, + polling_ingest_svc, + transform_elaboration_svc, + transform_execution_svc, + sync_svc, + tenancy_config, + } + } + + #[tracing::instrument(level = "info", skip_all)] + async fn pull_by_plan( + &self, + plan: Vec, + options: PullOptions, + listener: Option>, + ) -> Result, InternalError> { + let mut results = Vec::new(); + + // Prepare multi-listeners + let maybe_ingest_multi_listener = listener + .as_ref() + .and_then(|l| l.clone().get_ingest_listener()); + + let maybe_transform_multi_listener = listener + .as_ref() + .and_then(|l| l.clone().get_transform_listener()); + + let maybe_sync_multi_listener = listener + .as_ref() + .and_then(|l| l.clone().get_sync_listener()); + + // Execute each iteration + for iteration in plan { + tracing::info!(depth = %iteration.depth, jobs = ?iteration.jobs, "Running pull iteration"); + + // Authorization checks for this iteration + let (iteration, write_errors) = self.make_authorization_write_checks(iteration).await?; + let (iteration, 
read_errors) = self.make_authorization_read_checks(iteration).await?; + if !write_errors.is_empty() || !read_errors.is_empty() { + results.extend(write_errors); + results.extend(read_errors); + break; + } + + // Run iteration jobs concurrently + let mut tasks = tokio::task::JoinSet::new(); + for job in iteration.jobs { + match job { + PullPlanIterationJob::Ingest(pii) => { + let maybe_listener = maybe_ingest_multi_listener + .as_ref() + .and_then(|l| l.begin_ingest(pii.target.get_handle())); + tasks.spawn(Self::ingest( + pii, + options.ingest_options.clone(), + self.polling_ingest_svc.clone(), + maybe_listener, + )) + } + PullPlanIterationJob::Transform(pti) => { + let maybe_listener = maybe_transform_multi_listener + .as_ref() + .and_then(|l| l.begin_transform(pti.target.get_handle())); + tasks.spawn(Self::transform( + pti, + options.transform_options, + self.transform_elaboration_svc.clone(), + self.transform_execution_svc.clone(), + maybe_listener, + )) + } + PullPlanIterationJob::Sync(psi) => { + let maybe_listener = maybe_sync_multi_listener.as_ref().and_then(|l| { + l.begin_sync( + &psi.sync_request.src.as_user_friendly_any_ref(), + &psi.sync_request.dst.as_user_friendly_any_ref(), + ) + }); + + tasks.spawn(Self::sync( + psi, + options.clone(), + self.sync_svc.clone(), + self.dataset_registry.clone(), + self.remote_alias_registry.clone(), + maybe_listener, + )) + } + }; + } + let iteration_results = tasks.join_all().await; + tracing::info!(iteration_result=?iteration_results, "Pull iteration finished"); + + // Deal with results + let mut has_errors = false; + for result in iteration_results { + let result = result?; + if result.result.is_err() { + has_errors = true; + } + results.push(result); + } + if has_errors { + break; + } + } + + Ok(results) + } + + #[tracing::instrument(level = "debug", name = "PullDatasetUseCase::write_authorizations", skip_all, fields(?iteration))] + async fn make_authorization_write_checks( + &self, + iteration: PullPlanIteration, + ) -> Result<(PullPlanIteration, Vec), InternalError> { + let mut written_datasets = Vec::with_capacity(iteration.jobs.len()); + let mut written_jobs_by_handle = HashMap::with_capacity(iteration.jobs.len()); + let mut other_jobs = Vec::new(); + + for job in iteration.jobs { + if let Some(written_handle) = job.as_common_item().try_get_written_handle() { + written_datasets.push(written_handle.clone()); + written_jobs_by_handle.insert(written_handle.clone(), job); + } else { + other_jobs.push(job); + } + } + + if written_datasets.is_empty() { + return Ok(( + PullPlanIteration { + depth: iteration.depth, + jobs: other_jobs, + }, + Vec::new(), + )); + } + + let ClassifyByAllowanceResponse { + authorized_handles, + unauthorized_handles_with_errors, + } = self + .dataset_action_authorizer + .classify_datasets_by_allowance(written_datasets, DatasetAction::Write) + .await?; + + let mut okay_jobs = Vec::with_capacity(authorized_handles.len() + other_jobs.len()); + for authorized_hdl in authorized_handles { + let job = written_jobs_by_handle + .remove(&authorized_hdl) + .expect("item must be present"); + okay_jobs.push(job); + } + okay_jobs.extend(other_jobs); + + let unauthorized_responses: Vec<_> = unauthorized_handles_with_errors + .into_iter() + .map(|(hdl, auth_error)| { + let job = written_jobs_by_handle + .remove(&hdl) + .expect("item must be present"); + PullResponse { + maybe_local_ref: Some(hdl.as_local_ref()), + maybe_remote_ref: None, + maybe_original_request: job.into_original_pull_request(), + result: Err(match auth_error { + 
DatasetActionUnauthorizedError::Access(e) => PullError::Access(e),
+                        DatasetActionUnauthorizedError::Internal(e) => PullError::Internal(e),
+                    }),
+                }
+            })
+            .collect();
+
+        Ok((
+            PullPlanIteration {
+                depth: iteration.depth,
+                jobs: okay_jobs,
+            },
+            unauthorized_responses,
+        ))
+    }
+
+    #[tracing::instrument(level = "debug", name = "PullDatasetUseCase::read_authorizations", skip_all, fields(?iteration))]
+    async fn make_authorization_read_checks(
+        &self,
+        iteration: PullPlanIteration,
+    ) -> Result<(PullPlanIteration, Vec), InternalError> {
+        let mut read_datasets = Vec::new();
+        let mut reading_jobs = Vec::with_capacity(iteration.jobs.len());
+        let mut other_jobs = Vec::with_capacity(iteration.jobs.len());
+
+        for job in iteration.jobs {
+            let read_handles = job.as_common_item().get_read_handles();
+            if read_handles.is_empty() {
+                other_jobs.push(job);
+            } else {
+                read_datasets.extend(read_handles.into_iter().cloned());
+                reading_jobs.push(job);
+            }
+        }
+
+        if read_datasets.is_empty() {
+            return Ok((
+                PullPlanIteration {
+                    depth: iteration.depth,
+                    jobs: other_jobs,
+                },
+                Vec::new(),
+            ));
+        }
+
+        let ClassifyByAllowanceResponse {
+            authorized_handles: _,
+            unauthorized_handles_with_errors,
+        } = self
+            .dataset_action_authorizer
+            .classify_datasets_by_allowance(read_datasets, DatasetAction::Read)
+            .await?;
+
+        if unauthorized_handles_with_errors.is_empty() {
+            let mut all_jobs = Vec::with_capacity(reading_jobs.len() + other_jobs.len());
+            all_jobs.extend(reading_jobs);
+            all_jobs.extend(other_jobs);
+            return Ok((
+                PullPlanIteration {
+                    jobs: all_jobs,
+                    depth: iteration.depth,
+                },
+                vec![],
+            ));
+        }
+
+        let mut unauthorized_handles_to_errors: HashMap<
+            DatasetHandle,
+            DatasetActionUnauthorizedError,
+        > = unauthorized_handles_with_errors.into_iter().collect();
+
+        let mut unauthorized_responses = Vec::new();
+
+        let mut okay_jobs = Vec::with_capacity(reading_jobs.len() + other_jobs.len());
+        okay_jobs.extend(other_jobs);
+
+        for reading_job in reading_jobs {
+            let read_handles = reading_job.as_common_item().get_read_handles();
+            let mut maybe_error = None;
+            for read_hdl in read_handles {
+                if let Some(auth_error) = unauthorized_handles_to_errors.remove(read_hdl) {
+                    maybe_error = Some(match auth_error {
+                        DatasetActionUnauthorizedError::Access(e) => PullError::Access(e),
+                        DatasetActionUnauthorizedError::Internal(e) => PullError::Internal(e),
+                    });
+                    break;
+                }
+            }
+
+            if let Some(error) = maybe_error {
+                unauthorized_responses.push(PullResponse {
+                    maybe_local_ref: reading_job
+                        .as_common_item()
+                        .try_get_written_handle()
+                        .map(DatasetHandle::as_local_ref),
+                    maybe_remote_ref: None,
+                    maybe_original_request: reading_job.into_original_pull_request(),
+                    result: Err(error),
+                });
+            } else {
+                okay_jobs.push(reading_job);
+            }
+        }
+
+        Ok((
+            PullPlanIteration {
+                depth: iteration.depth,
+                jobs: okay_jobs,
+            },
+            unauthorized_responses,
+        ))
+    }
+
+    async fn ingest(
+        pii: PullIngestItem,
+        ingest_options: PollingIngestOptions,
+        polling_ingest_svc: Arc,
+        maybe_listener: Option>,
+    ) -> Result {
+        let ingest_response = polling_ingest_svc
+            .ingest(pii.target.clone(), ingest_options, maybe_listener)
+            .await;
+
+        Ok(PullResponse {
+            maybe_original_request: pii.maybe_original_request,
+            maybe_local_ref: Some(pii.target.get_handle().as_local_ref()),
+            maybe_remote_ref: None,
+            result: match ingest_response {
+                Ok(r) => Ok(r.into()),
+                Err(e) => Err(e.into()),
+            },
+        })
+    }
+
+    async fn transform(
+        pti: PullTransformItem,
+        transform_options: TransformOptions,
transform_elaboration_svc: Arc, + transform_execution_svc: Arc, + maybe_listener: Option>, + ) -> Result { + // Remember original request + let maybe_original_request = pti.maybe_original_request.clone(); + + // Main transform run + async fn run_transform( + pti: PullTransformItem, + transform_elaboration_svc: Arc, + transform_execution_svc: Arc, + transform_options: TransformOptions, + maybe_listener: Option>, + ) -> (ResolvedDataset, Result) { + // Elaborate phase + match transform_elaboration_svc + .elaborate_transform( + pti.target.clone(), + pti.plan, + transform_options, + maybe_listener.clone(), + ) + .await + { + // Elaborate success + Ok(TransformElaboration::Elaborated(plan)) => { + // Execute phase + let (target, result) = transform_execution_svc + .execute_transform(pti.target, plan, maybe_listener) + .await; + ( + target, + result.map_err(|e| PullError::TransformError(TransformError::Execute(e))), + ) + } + // Already up-to-date + Ok(TransformElaboration::UpToDate) => (pti.target, Ok(TransformResult::UpToDate)), + // Elaborate error + Err(e) => ( + pti.target, + Err(PullError::TransformError(TransformError::Elaborate(e))), + ), + } + } + + let transform_result = run_transform( + pti, + transform_elaboration_svc, + transform_execution_svc, + transform_options, + maybe_listener, + ) + .await; + + // Prepare response + Ok(PullResponse { + maybe_original_request, + maybe_local_ref: Some(transform_result.0.get_handle().as_local_ref()), + maybe_remote_ref: None, + result: transform_result.1.map(Into::into), + }) + } + + async fn sync( + psi: PullSyncItem, + options: PullOptions, + sync_svc: Arc, + dataset_registry: Arc, + remote_alias_registry: Arc, + listener: Option>, + ) -> Result { + // Run sync action + let mut sync_result = sync_svc + .sync(*psi.sync_request, options.sync_options, listener) + .await; + + // Associate newly-synced datasets with remotes + if options.add_aliases + && let Ok(SyncResult::Updated { old_head: None, .. 
}) = &sync_result + { + // Note: this would have failed before sync if dataset didn't exist, + // however, by this moment the dataset must have been created + let hdl = dataset_registry + .resolve_dataset_handle_by_ref(&psi.local_target.as_local_ref()) + .await + .int_err()?; + + let alias_add_result = match remote_alias_registry.get_remote_aliases(&hdl).await { + Ok(mut aliases) => aliases.add(&psi.remote_ref, RemoteAliasKind::Pull).await, + Err(e) => match e { + GetAliasesError::Internal(e) => Err(e), + }, + }; + + if let Err(e) = alias_add_result { + sync_result = Err(SyncError::Internal(e)); + } + } + + // Prepare response + Ok(PullResponse { + maybe_original_request: psi.maybe_original_request, + maybe_local_ref: Some(psi.local_target.as_local_ref()), // TODO: multi-tenancy + maybe_remote_ref: Some(psi.remote_ref), + result: match sync_result { + Ok(response) => Ok(response.into()), + Err(e) => Err(e.into()), + }, + }) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl PullDatasetUseCase for PullDatasetUseCaseImpl { + #[tracing::instrument( + level = "info", + name = "PullDatasetUseCase::execute", + skip_all, + fields(?request, ?options) + )] + async fn execute( + &self, + request: PullRequest, + options: PullOptions, + listener: Option>, + ) -> Result { + let listener = + listener.map(|l| Arc::new(ListenerMultiAdapter(l)) as Arc); + + let mut responses = self.execute_multi(vec![request], options, listener).await?; + + assert_eq!(responses.len(), 1); + Ok(responses.pop().unwrap()) + } + + #[tracing::instrument( + level = "info", + name = "PullDatasetUseCase::execute_multi", + skip_all, + fields(?requests, ?options) + )] + async fn execute_multi( + &self, + requests: Vec, + options: PullOptions, + listener: Option>, + ) -> Result, InternalError> { + tracing::info!(?requests, ?options, "Performing pull"); + + let (plan, errors) = self + .pull_request_planner + .build_pull_multi_plan(&requests, &options, *self.tenancy_config) + .await; + + tracing::info!( + num_steps = plan.len(), + num_errors = errors.len(), + "Prepared pull execution plan" + ); + if !errors.is_empty() { + return Ok(errors); + } + + self.pull_by_plan(plan, options, listener).await + } + + #[tracing::instrument( + level = "info", + name = "PullDatasetUseCase::execute_all_owned", + skip_all, + fields(?options) + )] + async fn execute_all_owned( + &self, + options: PullOptions, + listener: Option>, + ) -> Result, InternalError> { + tracing::info!(?options, "Performing pull (all owned)"); + + let (plan, errors) = self + .pull_request_planner + .build_pull_plan_all_owner_datasets(&options, *self.tenancy_config) + .await?; + + tracing::info!( + num_steps = plan.len(), + num_errors = errors.len(), + "Prepared pull execution plan (all owned)" + ); + if !errors.is_empty() { + return Ok(errors); + } + + self.pull_by_plan(plan, options, listener).await + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +struct ListenerMultiAdapter(Arc); + +impl PullMultiListener for ListenerMultiAdapter { + fn get_ingest_listener(self: Arc) -> Option> { + Some(self) + } + + fn get_transform_listener(self: Arc) -> Option> { + Some(self) + } + + fn get_sync_listener(self: Arc) -> Option> { + Some(self) + } +} + +impl PollingIngestMultiListener for ListenerMultiAdapter { + fn begin_ingest(&self, _dataset: &DatasetHandle) -> Option> { + 
self.0.clone().get_ingest_listener() + } +} + +impl TransformMultiListener for ListenerMultiAdapter { + fn begin_transform(&self, _dataset: &DatasetHandle) -> Option> { + self.0.clone().get_transform_listener() + } +} + +impl SyncMultiListener for ListenerMultiAdapter { + fn begin_sync( + &self, + _src: &DatasetRefAny, + _dst: &DatasetRefAny, + ) -> Option> { + self.0.clone().get_sync_listener() + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/use_cases/push_dataset_use_case_impl.rs b/src/infra/core/src/use_cases/push_dataset_use_case_impl.rs new file mode 100644 index 0000000000..0da57cded6 --- /dev/null +++ b/src/infra/core/src/use_cases/push_dataset_use_case_impl.rs @@ -0,0 +1,225 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use dill::{component, interface}; +use internal_error::InternalError; +use kamu_core::auth::{ + ClassifyByAllowanceResponse, + DatasetAction, + DatasetActionAuthorizer, + DatasetActionUnauthorizedError, +}; +use kamu_core::{ + PushDatasetUseCase, + PushError, + PushItem, + PushMultiOptions, + PushRequestPlanner, + PushResponse, + RemoteAliasKind, + RemoteAliasesRegistry, + SyncError, + SyncMultiListener, + SyncOptions, + SyncRequest, + SyncService, +}; +use opendatafabric::{DatasetHandle, DatasetPushTarget}; + +use crate::SyncRequestBuilder; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn PushDatasetUseCase)] +pub struct PushDatasetUseCaseImpl { + push_request_planner: Arc, + sync_request_builder: Arc, + sync_service: Arc, + dataset_action_authorizer: Arc, + remote_alias_registry: Arc, +} + +impl PushDatasetUseCaseImpl { + pub fn new( + push_request_planner: Arc, + sync_request_builder: Arc, + sync_service: Arc, + dataset_action_authorizer: Arc, + remote_alias_registry: Arc, + ) -> Self { + Self { + push_request_planner, + sync_request_builder, + sync_service, + dataset_action_authorizer, + remote_alias_registry, + } + } + + #[tracing::instrument(level = "debug", name = "PushDatasetUseCase::authorizations", skip_all, fields(?dataset_handles, ?push_target))] + async fn make_authorization_checks( + &self, + dataset_handles: Vec, + push_target: Option<&DatasetPushTarget>, + ) -> Result<(Vec, Vec), InternalError> { + let ClassifyByAllowanceResponse { + authorized_handles, + unauthorized_handles_with_errors, + } = self + .dataset_action_authorizer + .classify_datasets_by_allowance(dataset_handles, DatasetAction::Read) + .await?; + + let unauthorized_responses = unauthorized_handles_with_errors + .into_iter() + .map(|(hdl, error)| PushResponse { + local_handle: Some(hdl), + target: push_target.cloned(), + result: Err(PushError::SyncError(match error { + DatasetActionUnauthorizedError::Access(e) => SyncError::Access(e), + DatasetActionUnauthorizedError::Internal(e) => SyncError::Internal(e), + })), + }) + .collect(); + + Ok((authorized_handles, unauthorized_responses)) + } + + #[tracing::instrument( + level = "debug", + name = "PushDatasetUseCase::build_sync_requests", + skip_all, + fields(?plan, ?sync_options, 
?push_target) + )] + async fn build_sync_requests( + &self, + plan: &[PushItem], + sync_options: SyncOptions, + push_target: Option<&DatasetPushTarget>, + ) -> (Vec, Vec) { + let mut sync_requests = Vec::new(); + let mut errors = Vec::new(); + + for pi in plan { + let src_ref = pi.local_handle.as_any_ref(); + let dst_ref = (&pi.remote_target.url).into(); + match self + .sync_request_builder + .build_sync_request(src_ref, dst_ref, sync_options.create_if_not_exists) + .await + { + Ok(sync_request) => sync_requests.push(sync_request), + Err(e) => errors.push(PushResponse { + local_handle: Some(pi.local_handle.clone()), + target: push_target.cloned(), + result: Err(e.into()), + }), + } + } + + (sync_requests, errors) + } +} + +#[async_trait::async_trait] +impl PushDatasetUseCase for PushDatasetUseCaseImpl { + #[tracing::instrument( + level = "info", + name = "PushDatasetUseCase::execute_multi", + skip_all, + fields(?dataset_handles, ?options) + )] + async fn execute_multi( + &self, + dataset_handles: Vec, + options: PushMultiOptions, + sync_listener: Option>, + ) -> Result, InternalError> { + // Check for unsupported options first + if options.recursive { + unimplemented!("Recursive push is not yet supported") + } + if options.all { + unimplemented!("Pushing all datasets is not yet supported") + } + + // Authorization checks upon all datasets come first + let (authorized_handles, unauthorized_responses) = self + .make_authorization_checks(dataset_handles, options.remote_target.as_ref()) + .await?; + if !unauthorized_responses.is_empty() { + return Ok(unauthorized_responses); + } + + // Prepare a push plan + let (plan, errors) = self + .push_request_planner + .collect_plan(&authorized_handles, options.remote_target.as_ref()) + .await; + if !errors.is_empty() { + return Ok(errors); + } + + tracing::debug!(?plan, "Obtained push plan"); + + // Create sync requests + let (sync_requests, errors) = self + .build_sync_requests(&plan, options.sync_options, options.remote_target.as_ref()) + .await; + if !errors.is_empty() { + return Ok(errors); + } + + // Run sync process + let futures: Vec<_> = sync_requests + .into_iter() + .map(|sync_request| { + let listener = sync_listener.as_ref().and_then(|l| { + l.begin_sync( + &sync_request.src.as_user_friendly_any_ref(), + &sync_request.dst.as_user_friendly_any_ref(), + ) + }); + self.sync_service + .sync(sync_request, options.sync_options, listener) + }) + .collect(); + let sync_results = futures::future::join_all(futures).await; + + // Convert results + assert_eq!(plan.len(), sync_results.len()); + let results: Vec<_> = std::iter::zip(&plan, sync_results) + .map(|(pi, res)| pi.as_response(res)) + .collect(); + + // If no errors - add aliases to initial items + if options.add_aliases && results.iter().all(|r| r.result.is_ok()) { + for push_item in &plan { + // TODO: Improve error handling + self.remote_alias_registry + .get_remote_aliases(&push_item.local_handle) + .await + .unwrap() + .add( + &((&push_item.remote_target.url).into()), + RemoteAliasKind::Push, + ) + .await + .unwrap(); + } + } + + Ok(results) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/use_cases/rename_dataset_use_case_impl.rs b/src/infra/core/src/use_cases/rename_dataset_use_case_impl.rs index 57f8340bb7..e407cbca1c 100644 --- a/src/infra/core/src/use_cases/rename_dataset_use_case_impl.rs +++ b/src/infra/core/src/use_cases/rename_dataset_use_case_impl.rs @@ -14,7 +14,7 @@ use 
kamu_accounts::CurrentAccountSubject; use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer}; use kamu_core::{ DatasetLifecycleMessage, - DatasetRepository, + DatasetRegistry, GetDatasetError, RenameDatasetError, RenameDatasetUseCase, @@ -28,7 +28,7 @@ use crate::DatasetRepositoryWriter; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct RenameDatasetUseCaseImpl { - dataset_repo: Arc, + dataset_registry: Arc, dataset_repo_writer: Arc, dataset_action_authorizer: Arc, outbox: Arc, @@ -39,14 +39,14 @@ pub struct RenameDatasetUseCaseImpl { #[interface(dyn RenameDatasetUseCase)] impl RenameDatasetUseCaseImpl { pub fn new( - dataset_repo: Arc, + dataset_registry: Arc, dataset_repo_writer: Arc, dataset_action_authorizer: Arc, outbox: Arc, current_account_subject: Arc, ) -> Self { Self { - dataset_repo, + dataset_registry, dataset_repo_writer, dataset_action_authorizer, outbox, @@ -57,6 +57,12 @@ impl RenameDatasetUseCaseImpl { #[async_trait::async_trait] impl RenameDatasetUseCase for RenameDatasetUseCaseImpl { + #[tracing::instrument( + level = "info", + name = "RenameDatasetUseCase::execute", + skip_all, + fields(dataset_ref, new_name) + )] async fn execute( &self, dataset_ref: &DatasetRef, @@ -68,7 +74,11 @@ impl RenameDatasetUseCase for RenameDatasetUseCaseImpl { } CurrentAccountSubject::Logged(l) => l.account_id.clone(), }; - let dataset_handle = match self.dataset_repo.resolve_dataset_ref(dataset_ref).await { + let dataset_handle = match self + .dataset_registry + .resolve_dataset_handle_by_ref(dataset_ref) + .await + { Ok(h) => Ok(h), Err(GetDatasetError::NotFound(e)) => Err(RenameDatasetError::NotFound(e)), Err(GetDatasetError::Internal(e)) => Err(RenameDatasetError::Internal(e)), diff --git a/src/infra/core/src/use_cases/reset_dataset_use_case_impl.rs b/src/infra/core/src/use_cases/reset_dataset_use_case_impl.rs new file mode 100644 index 0000000000..e275a2570e --- /dev/null +++ b/src/infra/core/src/use_cases/reset_dataset_use_case_impl.rs @@ -0,0 +1,70 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::sync::Arc; + +use dill::{component, interface}; +use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer}; +use kamu_core::{DatasetRegistry, ResetDatasetUseCase, ResetError, ResetService}; +use opendatafabric::{DatasetHandle, Multihash}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn ResetDatasetUseCase)] +pub struct ResetDatasetUseCaseImpl { + reset_service: Arc, + dataset_registry: Arc, + dataset_action_authorizer: Arc, +} + +impl ResetDatasetUseCaseImpl { + pub fn new( + reset_service: Arc, + dataset_registry: Arc, + dataset_action_authorizer: Arc, + ) -> Self { + Self { + reset_service, + dataset_registry, + dataset_action_authorizer, + } + } +} + +#[async_trait::async_trait] +impl ResetDatasetUseCase for ResetDatasetUseCaseImpl { + #[tracing::instrument( + level = "info", + name = "ResetDatasetUseCase::execute", + skip_all, + fields(dataset_handle, ?maybe_new_head, ?maybe_old_head) + )] + async fn execute( + &self, + dataset_handle: &DatasetHandle, + maybe_new_head: Option<&Multihash>, + maybe_old_head: Option<&Multihash>, + ) -> Result { + // Permission check + self.dataset_action_authorizer + .check_action_allowed(dataset_handle, DatasetAction::Write) + .await?; + + // Resolve dataset + let resolved_dataset = self.dataset_registry.get_dataset_by_handle(dataset_handle); + + // Actual action + self.reset_service + .reset_dataset(resolved_dataset, maybe_new_head, maybe_old_head) + .await + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/use_cases/set_watermark_use_case_impl.rs b/src/infra/core/src/use_cases/set_watermark_use_case_impl.rs new file mode 100644 index 0000000000..6d6de3f1bf --- /dev/null +++ b/src/infra/core/src/use_cases/set_watermark_use_case_impl.rs @@ -0,0 +1,76 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::sync::Arc; + +use chrono::{DateTime, Utc}; +use dill::{component, interface}; +use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer}; +use kamu_core::{ + DatasetRegistry, + SetWatermarkError, + SetWatermarkResult, + SetWatermarkUseCase, + WatermarkService, +}; +use opendatafabric::DatasetHandle; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn SetWatermarkUseCase)] +pub struct SetWatermarkUseCaseImpl { + watermark_service: Arc, + dataset_registry: Arc, + dataset_action_authorizer: Arc, +} + +impl SetWatermarkUseCaseImpl { + pub fn new( + watermark_service: Arc, + dataset_registry: Arc, + dataset_action_authorizer: Arc, + ) -> Self { + Self { + watermark_service, + dataset_registry, + dataset_action_authorizer, + } + } +} + +#[async_trait::async_trait] +impl SetWatermarkUseCase for SetWatermarkUseCaseImpl { + #[tracing::instrument( + level = "info", + name = "SetWatermarkUseCase::execute", + skip_all, + fields(dataset_handle, new_watermark) + )] + async fn execute( + &self, + dataset_handle: &DatasetHandle, + new_watermark: DateTime, + ) -> Result { + // Permission check + self.dataset_action_authorizer + .check_action_allowed(dataset_handle, DatasetAction::Write) + .await?; + + // Resolve dataset + let resolved_dataset = self.dataset_registry.get_dataset_by_handle(dataset_handle); + + // Actual action + self.watermark_service + .set_watermark(resolved_dataset, new_watermark) + .await + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/use_cases/verify_dataset_use_case_impl.rs b/src/infra/core/src/use_cases/verify_dataset_use_case_impl.rs new file mode 100644 index 0000000000..bc9d352837 --- /dev/null +++ b/src/infra/core/src/use_cases/verify_dataset_use_case_impl.rs @@ -0,0 +1,133 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::sync::Arc; + +use dill::{component, interface}; +use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer}; +use kamu_core::{ + DatasetRegistry, + VerificationListener, + VerificationMultiListener, + VerificationRequest, + VerificationResult, + VerificationService, + VerifyDatasetUseCase, +}; +use opendatafabric::DatasetHandle; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn VerifyDatasetUseCase)] +pub struct VerifyDatasetUseCaseImpl { + verification_service: Arc, + dataset_registry: Arc, + dataset_action_authorizer: Arc, +} + +impl VerifyDatasetUseCaseImpl { + pub fn new( + verification_service: Arc, + dataset_registry: Arc, + dataset_action_authorizer: Arc, + ) -> Self { + Self { + verification_service, + dataset_registry, + dataset_action_authorizer, + } + } +} + +#[async_trait::async_trait] +impl VerifyDatasetUseCase for VerifyDatasetUseCaseImpl { + #[tracing::instrument( + level = "info", + name = "VerifyDatasetUseCase::execute", + skip_all, + fields(?request) + )] + async fn execute( + &self, + request: VerificationRequest, + maybe_listener: Option>, + ) -> VerificationResult { + // Permission check + // TODO: verification of derived datasets requires read permission for inputs + match self + .dataset_action_authorizer + .check_action_allowed(&request.target, DatasetAction::Read) + .await + { + Ok(_) => {} + Err(e) => return VerificationResult::err(request.target.clone(), e), + }; + + // Resolve dataset + let target = self.dataset_registry.get_dataset_by_handle(&request.target); + + // Actual action + self.verification_service + .verify( + VerificationRequest { + target, + block_range: request.block_range, + options: request.options, + }, + maybe_listener, + ) + .await + } + + #[tracing::instrument( + level = "info", + name = "VerifyDatasetUseCase::execute_multi", + skip_all, + fields(?requests) + )] + async fn execute_multi( + &self, + requests: Vec>, + maybe_multi_listener: Option>, + ) -> Vec { + let mut verification_results = Vec::new(); + + // Exclude tasks, where there is no read permission for dataset + let mut authorized_requests = Vec::new(); + for request in requests { + let res = self + .dataset_action_authorizer + .check_action_allowed(&request.target, DatasetAction::Read) + .await; + match res { + Ok(_) => authorized_requests.push(VerificationRequest { + target: self.dataset_registry.get_dataset_by_handle(&request.target), + block_range: request.block_range, + options: request.options, + }), + Err(e) => { + verification_results.push(VerificationResult::err(request.target, e)); + } + } + } + + // Run verification for authorized datasets + let mut authorized_results = self + .verification_service + .verify_multi(authorized_requests, maybe_multi_listener) + .await; + + // Join results + verification_results.append(&mut authorized_results); + verification_results + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/utils/datasets_filtering.rs b/src/infra/core/src/utils/datasets_filtering.rs index befe0400d4..37650ff5af 100644 --- a/src/infra/core/src/utils/datasets_filtering.rs +++ b/src/infra/core/src/utils/datasets_filtering.rs @@ -12,7 +12,14 @@ use std::sync::Arc; use futures::{future, StreamExt, TryStreamExt}; use internal_error::InternalError; -use kamu_core::{DatasetRepository, GetDatasetError, SearchError, SearchOptions, 
SearchService}; +use kamu_core::{ + DatasetRegistry, + GetDatasetError, + SearchError, + SearchOptions, + SearchService, + TenancyConfig, +}; use opendatafabric::{ AccountName, DatasetAliasRemote, @@ -33,7 +40,7 @@ type FilteredDatasetRefAnyStream<'a> = //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub fn filter_datasets_by_local_pattern( - dataset_repo: &dyn DatasetRepository, + dataset_registry: &dyn DatasetRegistry, dataset_ref_patterns: Vec, ) -> FilteredDatasetHandleStream<'_> { // We assume here that resolving specific references one by one is more @@ -46,12 +53,12 @@ pub fn filter_datasets_by_local_pattern( Box::pin(async_stream::try_stream! { for dataset_ref_pattern in &dataset_ref_patterns { // TODO: PERF: Create a batch version of `resolve_dataset_ref` - yield dataset_repo.resolve_dataset_ref(dataset_ref_pattern.as_dataset_ref().unwrap()).await?; + yield dataset_registry.resolve_dataset_handle_by_ref(dataset_ref_pattern.as_dataset_ref().unwrap()).await?; } }) } else { - dataset_repo - .get_all_datasets() + dataset_registry + .all_dataset_handles() .try_filter(move |dataset_handle| { future::ready( dataset_ref_patterns @@ -67,26 +74,29 @@ pub fn filter_datasets_by_local_pattern( //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub fn filter_datasets_by_any_pattern<'a>( - dataset_repo: &'a dyn DatasetRepository, + dataset_registry: &'a dyn DatasetRegistry, search_svc: Arc, dataset_ref_any_patterns: Vec, current_account_name: &AccountName, + tenancy_config: TenancyConfig, ) -> FilteredDatasetRefAnyStream<'a> { - let is_multitenant_mode = dataset_repo.is_multi_tenant(); - let (all_ref_patterns, static_refs): (Vec<_>, Vec<_>) = dataset_ref_any_patterns .into_iter() .partition(DatasetRefAnyPattern::is_pattern); - let (remote_ref_patterns, local_ref_patterns): (Vec<_>, Vec<_>) = all_ref_patterns - .into_iter() - .partition(|pattern| pattern.is_remote_pattern(is_multitenant_mode)); + let (remote_ref_patterns, local_ref_patterns): (Vec<_>, Vec<_>) = + all_ref_patterns.into_iter().partition(|pattern| { + pattern.is_remote_pattern(tenancy_config == TenancyConfig::MultiTenant) + }); let static_datasets_stream = get_static_datasets_stream(static_refs); - let remote_patterns_stream = - get_remote_datasets_stream(search_svc, remote_ref_patterns, is_multitenant_mode); + let remote_patterns_stream = get_remote_datasets_stream( + search_svc, + remote_ref_patterns, + tenancy_config == TenancyConfig::MultiTenant, + ); let local_patterns_stream = - get_local_datasets_stream(dataset_repo, local_ref_patterns, current_account_name); + get_local_datasets_stream(dataset_registry, local_ref_patterns, current_account_name); static_datasets_stream .chain(remote_patterns_stream) @@ -165,12 +175,12 @@ pub fn matches_remote_ref_pattern( //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub fn get_local_datasets_stream<'a>( - dataset_repo: &'a dyn DatasetRepository, + dataset_registry: &'a dyn DatasetRegistry, dataset_ref_patterns: Vec, current_account_name: &AccountName, ) -> impl Stream> + 'a { - dataset_repo - .get_datasets_by_owner(current_account_name) + dataset_registry + .all_dataset_handles_by_owner(current_account_name) .try_filter(move |dataset_handle| { future::ready(dataset_ref_patterns.iter().any(|dataset_ref_pattern| { matches_local_ref_pattern(dataset_ref_pattern, dataset_handle) 
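Editor's note: the new use-case implementations earlier in this patch (compact, reset, set watermark, verify, and the revised commit/delete/rename flows) all follow the same three-step shape: check permissions through `DatasetActionAuthorizer`, resolve the target through `DatasetRegistry::get_dataset_by_handle`, then hand the `ResolvedDataset` to the corresponding service. The sketch below restates that shape for a hypothetical `ExampleDatasetUseCase`; the trait, the `ExampleService` dependency, the error handling via `int_err()`, and the fully spelled-out generic parameters (which this rendering of the diff does not preserve) are illustrative assumptions rather than part of the patch.

use std::sync::Arc;

use dill::{component, interface};
use internal_error::{InternalError, ResultIntoInternal};
use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer};
use kamu_core::{DatasetRegistry, ResolvedDataset};
use opendatafabric::DatasetHandle;

/// Hypothetical use-case trait, named here only to illustrate the pattern.
#[async_trait::async_trait]
pub trait ExampleDatasetUseCase: Send + Sync {
    async fn execute(&self, dataset_handle: &DatasetHandle) -> Result<(), InternalError>;
}

/// Hypothetical service that performs the actual work on a resolved dataset.
#[async_trait::async_trait]
pub trait ExampleService: Send + Sync {
    async fn run(&self, target: ResolvedDataset) -> Result<(), InternalError>;
}

#[component(pub)]
#[interface(dyn ExampleDatasetUseCase)]
pub struct ExampleDatasetUseCaseImpl {
    example_service: Arc<dyn ExampleService>,
    dataset_registry: Arc<dyn DatasetRegistry>,
    dataset_action_authorizer: Arc<dyn DatasetActionAuthorizer>,
}

impl ExampleDatasetUseCaseImpl {
    pub fn new(
        example_service: Arc<dyn ExampleService>,
        dataset_registry: Arc<dyn DatasetRegistry>,
        dataset_action_authorizer: Arc<dyn DatasetActionAuthorizer>,
    ) -> Self {
        Self {
            example_service,
            dataset_registry,
            dataset_action_authorizer,
        }
    }
}

#[async_trait::async_trait]
impl ExampleDatasetUseCase for ExampleDatasetUseCaseImpl {
    async fn execute(&self, dataset_handle: &DatasetHandle) -> Result<(), InternalError> {
        // Permission check: reject the request before doing any work.
        // Real use cases map the authorization error into their own error type;
        // here everything is collapsed into InternalError for brevity.
        self.dataset_action_authorizer
            .check_action_allowed(dataset_handle, DatasetAction::Write)
            .await
            .int_err()?;

        // Resolve dataset: the registry hands back a ResolvedDataset aware of its identity
        let resolved_dataset = self.dataset_registry.get_dataset_by_handle(dataset_handle);

        // Actual action: delegate to the (hypothetical) service
        self.example_service.run(resolved_dataset).await
    }
}

This keeps access control and dataset lookup at the use-case boundary, so the underlying services only ever see datasets that are already resolved and already authorized.
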
diff --git a/src/infra/core/src/utils/simple_transfer_protocol.rs b/src/infra/core/src/utils/simple_transfer_protocol.rs index 0aa70c06e1..16996917b6 100644 --- a/src/infra/core/src/utils/simple_transfer_protocol.rs +++ b/src/infra/core/src/utils/simple_transfer_protocol.rs @@ -10,12 +10,14 @@ use std::pin::Pin; use std::sync::{Arc, Mutex}; +use dill::{component, Catalog}; use futures::{stream, Future, StreamExt, TryStreamExt}; -use internal_error::ErrorIntoInternal; +use internal_error::{ErrorIntoInternal, ResultIntoInternal}; use kamu_core::sync_service::DatasetNotFoundError; use kamu_core::utils::metadata_chain_comparator::*; use kamu_core::*; -use opendatafabric::*; +use odf::{AsTypedBlock, IntoDataStreamBlock}; +use opendatafabric as odf; use crate::*; @@ -27,16 +29,10 @@ const DEFAULT_SIMPLE_PROTOCOL_MAX_PARALLEL_TRANSFERS: usize = 10; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -type BoxedCreateDatasetFuture = std::pin::Pin< - Box> + Send>, ->; - -pub type DatasetFactoryFn = - Box) -> BoxedCreateDatasetFuture + Send>; - #[derive(Debug, Eq, PartialEq)] pub struct SimpleProtocolTransferOptions { pub max_parallel_transfers: usize, + pub visibility_for_created_dataset: DatasetVisibility, } impl Default for SimpleProtocolTransferOptions { @@ -50,6 +46,7 @@ impl Default for SimpleProtocolTransferOptions { }; Self { max_parallel_transfers, + visibility_for_created_dataset: DatasetVisibility::Private, } } } @@ -57,20 +54,28 @@ impl Default for SimpleProtocolTransferOptions { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// Implements "Simple Transfer Protocol" as described in ODF spec -pub struct SimpleTransferProtocol; +pub struct SimpleTransferProtocol { + catalog: Catalog, +} //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[component(pub)] impl SimpleTransferProtocol { + pub fn new(catalog: Catalog) -> Self { + Self { catalog } + } + pub async fn sync( &self, - src_ref: &DatasetRefAny, + src_ref: &odf::DatasetRefAny, src: Arc, maybe_dst: Option>, - dst_factory: Option, + dst_alias: Option<&odf::DatasetAlias>, validation: AppendValidation, trust_source_hashes: bool, force: bool, + transfer_options: SimpleProtocolTransferOptions, listener: Arc, ) -> Result { listener.begin(); @@ -158,14 +163,29 @@ impl SimpleTransferProtocol { let (dst, dst_head) = if let Some(dst) = maybe_dst { (dst, dst_head) } else { - let (_, first_block) = blocks.pop().unwrap(); + let (first_hash, first_block) = blocks.pop().unwrap(); let seed_block = first_block .into_typed() .ok_or_else(|| CorruptedSourceError { message: "First metadata block is not Seed".to_owned(), source: None, })?; - let create_result = dst_factory.unwrap()(seed_block).await?; + + let create_dataset_use_case = + self.catalog.get_one::().unwrap(); + let alias = + dst_alias.ok_or_else(|| "Destination dataset alias is unknown".int_err())?; + let create_result = create_dataset_use_case + .execute( + alias, + seed_block, + CreateDatasetUseCaseOptions { + dataset_visibility: transfer_options.visibility_for_created_dataset, + }, + ) + .await + .int_err()?; + assert_eq!(first_hash, create_result.head); (create_result.dataset, Some(create_result.head)) }; @@ -178,6 +198,7 @@ impl SimpleTransferProtocol { validation, trust_source_hashes, listener, + transfer_options, listener_adapter.into_status(), ) .await?; @@ -191,9 +212,9 @@ 
impl SimpleTransferProtocol { async fn get_src_head( &self, - src_ref: &DatasetRefAny, + src_ref: &odf::DatasetRefAny, src_chain: &dyn MetadataChain, - ) -> Result { + ) -> Result { match src_chain.resolve_ref(&BlockRef::Head).await { Ok(head) => Ok(head), Err(GetRefError::NotFound(_)) => Err(DatasetNotFoundError { @@ -208,7 +229,7 @@ impl SimpleTransferProtocol { async fn get_dest_head( &self, dst_chain: &dyn MetadataChain, - ) -> Result, SyncError> { + ) -> Result, SyncError> { match dst_chain.resolve_ref(&BlockRef::Head).await { Ok(h) => Ok(Some(h)), Err(GetRefError::NotFound(_)) => Ok(None), @@ -245,7 +266,7 @@ impl SimpleTransferProtocol { &'a self, src: &'a dyn Dataset, dst: &'a dyn Dataset, - data_slice: &DataSlice, + data_slice: &odf::DataSlice, trust_source_hashes: bool, listener: Arc, arc_stats: Arc>, @@ -316,7 +337,7 @@ impl SimpleTransferProtocol { &'a self, src: &'a dyn Dataset, dst: &'a dyn Dataset, - checkpoint: &Checkpoint, + checkpoint: &odf::Checkpoint, trust_source_hashes: bool, listener: Arc, arc_stats: Arc>, @@ -389,11 +410,12 @@ impl SimpleTransferProtocol { blocks: Vec, src: &'a dyn Dataset, dst: &'a dyn Dataset, - src_head: &'a Multihash, - dst_head: Option<&'a Multihash>, + src_head: &'a odf::Multihash, + dst_head: Option<&'a odf::Multihash>, validation: AppendValidation, trust_source_hashes: bool, listener: Arc, + transfer_options: SimpleProtocolTransferOptions, mut stats: SyncStats, ) -> Result<(), SyncError> { // Update stats estimates based on metadata @@ -454,10 +476,7 @@ impl SimpleTransferProtocol { stream::iter(block_download_tasks) .map(Ok) - .try_for_each_concurrent( - SimpleProtocolTransferOptions::default().max_parallel_transfers, - |future| future, - ) + .try_for_each_concurrent(transfer_options.max_parallel_transfers, |future| future) .await?; // Commit blocks diff --git a/src/infra/core/src/utils/smart_transfer_protocol.rs b/src/infra/core/src/utils/smart_transfer_protocol.rs index a67d7065e9..be1cfc4c91 100644 --- a/src/infra/core/src/utils/smart_transfer_protocol.rs +++ b/src/infra/core/src/utils/smart_transfer_protocol.rs @@ -10,11 +10,9 @@ use std::sync::Arc; use kamu_core::{Dataset, DatasetVisibility, SyncError, SyncListener, SyncResult}; -use opendatafabric::Multihash; +use opendatafabric as odf; use url::Url; -pub use super::simple_transfer_protocol::DatasetFactoryFn; - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[derive(Debug, Eq, PartialEq)] @@ -49,7 +47,7 @@ pub trait SmartTransferProtocolClient: Sync + Send { &self, http_src_url: &Url, dst: Option>, - dst_factory: Option, + dst_alias: Option<&odf::DatasetAlias>, listener: Arc, transfer_options: TransferOptions, ) -> Result; @@ -58,7 +56,7 @@ pub trait SmartTransferProtocolClient: Sync + Send { &self, src: Arc, http_dst_url: &Url, - dst_head: Option<&Multihash>, + dst_head: Option<&odf::Multihash>, listener: Arc, transfer_options: TransferOptions, ) -> Result; diff --git a/src/infra/core/src/verification_service_impl.rs b/src/infra/core/src/verification_service_impl.rs index ef4460ca60..ffd9051cc5 100644 --- a/src/infra/core/src/verification_service_impl.rs +++ b/src/infra/core/src/verification_service_impl.rs @@ -18,39 +18,36 @@ use opendatafabric::*; use crate::utils::cached_object::CachedObject; use crate::*; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + pub struct VerificationServiceImpl { - dataset_repo: Arc, - dataset_authorizer: 
Arc, - transform_service: Arc, + transform_request_planner: Arc, + transform_execution_svc: Arc, } #[component(pub)] #[interface(dyn VerificationService)] impl VerificationServiceImpl { pub fn new( - dataset_repo: Arc, - dataset_authorizer: Arc, - transform_service: Arc, + transform_request_planner: Arc, + transform_execution_svc: Arc, ) -> Self { Self { - dataset_repo, - dataset_authorizer, - transform_service, + transform_request_planner, + transform_execution_svc, } } #[tracing::instrument(level = "info", skip_all)] - async fn check_data_integrity<'a>( - &'a self, - dataset_handle: &'a DatasetHandle, + async fn check_data_integrity( + &self, + resolved_dataset: &ResolvedDataset, dataset_kind: DatasetKind, block_range: (Option, Option), check_logical_hashes: bool, listener: Arc, ) -> Result<(), VerificationError> { - let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); - - let chain = dataset.as_metadata_chain(); + let chain = resolved_dataset.as_metadata_chain(); let head = match block_range.1 { None => chain.resolve_ref(&BlockRef::Head).await?, @@ -81,7 +78,7 @@ impl VerificationServiceImpl { if let Some(output_slice) = &block.event.new_data { // Check size first - let size_actual = dataset + let size_actual = resolved_dataset .as_data_repo() .get_size(&output_slice.physical_hash) .await @@ -99,8 +96,11 @@ impl VerificationServiceImpl { )); } - let data_hashing_helper = - CachedObject::from(&output_slice.physical_hash, dataset.as_data_repo()).await?; + let data_hashing_helper = CachedObject::from( + &output_slice.physical_hash, + resolved_dataset.as_data_repo(), + ) + .await?; // Do a fast pass using physical hash let physical_hash_actual = data_hashing_helper.physical_hash().await.int_err()?; @@ -139,7 +139,7 @@ impl VerificationServiceImpl { if let Some(checkpoint) = block.event.new_checkpoint { // Check size - let size_actual = dataset + let size_actual = resolved_dataset .as_checkpoint_repo() .get_size(&checkpoint.physical_hash) .await @@ -158,9 +158,11 @@ impl VerificationServiceImpl { } // Check physical hash - let checkpoint_hashing_helper = - CachedObject::from(&checkpoint.physical_hash, dataset.as_checkpoint_repo()) - .await?; + let checkpoint_hashing_helper = CachedObject::from( + &checkpoint.physical_hash, + resolved_dataset.as_checkpoint_repo(), + ) + .await?; let physical_hash_actual = checkpoint_hashing_helper.physical_hash().await.int_err()?; @@ -193,15 +195,13 @@ impl VerificationServiceImpl { } #[tracing::instrument(level = "info", skip_all)] - async fn check_metadata_integrity<'a>( - &'a self, - dataset_handle: &'a DatasetHandle, + async fn check_metadata_integrity( + &self, + resolved_dataset: &ResolvedDataset, block_range: (Option, Option), listener: Arc, ) -> Result<(), VerificationError> { - let dataset = self.dataset_repo.get_dataset_by_handle(dataset_handle); - - let chain = dataset.as_metadata_chain(); + let chain = resolved_dataset.as_metadata_chain(); let head = match block_range.1 { None => chain.resolve_ref(&BlockRef::Head).await?, @@ -211,7 +211,7 @@ impl VerificationServiceImpl { listener.begin_phase(VerificationPhase::MetadataIntegrity); - let blocks: Vec<_> = dataset + let blocks: Vec<_> = resolved_dataset .as_metadata_chain() .iter_blocks_interval(&head, tail.as_ref(), false) .try_collect() @@ -248,75 +248,78 @@ impl VerificationServiceImpl { } } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[async_trait::async_trait] impl VerificationService for 
VerificationServiceImpl { - #[tracing::instrument(level = "info", skip_all, fields(%dataset_ref, ?block_range))] + #[tracing::instrument( + level = "info", + skip_all, + fields( + target_alias=%request.target.get_alias(), + block_range=?request.block_range + ) + )] async fn verify( &self, - dataset_ref: &DatasetRef, - block_range: (Option, Option), - options: VerificationOptions, + request: VerificationRequest, maybe_listener: Option>, ) -> VerificationResult { - let dataset_handle = match self.dataset_repo.resolve_dataset_ref(dataset_ref).await { - Ok(v) => v, - Err(e) => return VerificationResult::err_no_handle(e), - }; - - match self - .dataset_authorizer - .check_action_allowed(&dataset_handle, domain::auth::DatasetAction::Read) - .await - { - Ok(_) => {} - Err(e) => return VerificationResult::err(dataset_handle, e), - }; - - let dataset = self.dataset_repo.get_dataset_by_handle(&dataset_handle); - - let dataset_kind = match dataset.get_summary(GetSummaryOpts::default()).await { + let dataset_kind = match request.target.get_summary(GetSummaryOpts::default()).await { Ok(summary) => summary.kind, - Err(e) => return VerificationResult::err(dataset_handle, e.int_err()), + Err(e) => return VerificationResult::err(request.target.take_handle(), e.int_err()), }; let listener = maybe_listener.unwrap_or(Arc::new(NullVerificationListener {})); listener.begin(); let outcome = try { - if options.check_integrity { + if request.options.check_integrity { self.check_metadata_integrity( - &dataset_handle, - block_range.clone(), + &request.target, + request.block_range.clone(), listener.clone(), ) .await?; self.check_data_integrity( - &dataset_handle, + &request.target, dataset_kind, - block_range.clone(), - options.check_logical_hashes, + request.block_range.clone(), + request.options.check_logical_hashes, listener.clone(), ) .await?; } - if dataset_kind == DatasetKind::Derivative && options.replay_transformations { - self.transform_service - .verify_transform( - &dataset_handle.as_local_ref(), - block_range.clone(), - Some(listener.clone()), + if dataset_kind == DatasetKind::Derivative && request.options.replay_transformations { + let plan = self + .transform_request_planner + .build_transform_verification_plan( + request.target.clone(), + request.block_range.clone(), ) - .await?; + .await + .map_err(|e| { + VerificationError::VerifyTransform(VerifyTransformError::Plan(e)) + })?; + + self.transform_execution_svc + .execute_verify_transform(request.target.clone(), plan, Some(listener.clone())) + .await + .map_err(|e| { + VerificationError::VerifyTransform(VerifyTransformError::Execute(e)) + })?; } }; let result = VerificationResult { - dataset_handle: Some(dataset_handle), + dataset_handle: Some(request.target.take_handle()), outcome, }; + tracing::debug!(result = ?result, "Dataset verification finished"); + match &result.outcome { Ok(_) => listener.success(&result), Err(error) => listener.error(error), @@ -325,36 +328,24 @@ impl VerificationService for VerificationServiceImpl { result } + #[tracing::instrument(level = "info", skip_all)] async fn verify_multi( &self, - requests: Vec, - options: VerificationOptions, - maybe_listener: Option>, + requests: Vec>, + maybe_multi_listener: Option>, ) -> Vec { - let listener = maybe_listener.unwrap_or(Arc::new(NullVerificationMultiListener {})); + let multi_listener = + maybe_multi_listener.unwrap_or(Arc::new(NullVerificationMultiListener {})); let mut results = Vec::new(); for request in requests { - let res = match self - .dataset_repo - 
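// Illustrative sketch (not from the patched sources): replaying transformations now
// runs in two phases, as in the hunk above — a planning step on the injected
// TransformRequestPlanner (transactional) followed by an execution step on
// TransformExecutionService (non-transactional). `planner`, `executor`, `target`
// (a ResolvedDataset), `block_range`, and `listener` are assumed to be in scope.

let plan = planner
    .build_transform_verification_plan(target.clone(), block_range.clone())
    .await
    .map_err(|e| VerificationError::VerifyTransform(VerifyTransformError::Plan(e)))?;

executor
    .execute_verify_transform(target.clone(), plan, Some(listener.clone()))
    .await
    .map_err(|e| VerificationError::VerifyTransform(VerifyTransformError::Execute(e)))?;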
.resolve_dataset_ref(&request.dataset_ref) - .await - { - Ok(dataset_handle) => { - self.verify( - &request.dataset_ref, - request.block_range, - options.clone(), - listener.begin_verify(&dataset_handle), - ) - .await - } - Err(e) => VerificationResult::err_no_handle(e), - }; - - results.push(res); + let dataset_handle = request.target.get_handle(); + let listener = multi_listener.begin_verify(dataset_handle); + results.push(self.verify(request, listener).await); } results } } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/watermark_service_impl.rs b/src/infra/core/src/watermark_service_impl.rs new file mode 100644 index 0000000000..ae1fb5cdfb --- /dev/null +++ b/src/infra/core/src/watermark_service_impl.rs @@ -0,0 +1,152 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::sync::Arc; + +use chrono::{DateTime, Utc}; +use dill::*; +use internal_error::{ErrorIntoInternal, ResultIntoInternal}; +use kamu_core::{ + AppendError, + AppendValidationError, + BlockRef, + CommitError, + DataWriter, + GetAliasesError, + GetSummaryOpts, + GetWatermarkError, + MetadataChainExt, + RemoteAliasKind, + RemoteAliasesRegistry, + ResolvedDataset, + SearchAddDataVisitor, + SetWatermarkError, + SetWatermarkResult, + WatermarkService, + WriteWatermarkError, + WriteWatermarkOpts, +}; +use kamu_ingest_datafusion::DataWriterDataFusion; +use opendatafabric::DatasetKind; +use time_source::SystemTimeSource; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct WatermarkServiceImpl { + remote_alias_reg: Arc, + system_time_source: Arc, +} + +#[component(pub)] +#[interface(dyn WatermarkService)] +impl WatermarkServiceImpl { + pub fn new( + remote_alias_reg: Arc, + system_time_source: Arc, + ) -> Self { + Self { + remote_alias_reg, + system_time_source, + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl WatermarkService for WatermarkServiceImpl { + /// Attempt reading watermark that is currently associated with a dataset + #[tracing::instrument(level = "info", skip_all)] + async fn try_get_current_watermark( + &self, + resolved_dataset: ResolvedDataset, + ) -> Result>, GetWatermarkError> { + let head = resolved_dataset + .as_metadata_chain() + .resolve_ref(&BlockRef::Head) + .await + .int_err()?; + + let mut add_data_visitor = SearchAddDataVisitor::new(); + + resolved_dataset + .as_metadata_chain() + .accept_by_hash(&mut [&mut add_data_visitor], &head) + .await + .int_err()?; + + let current_watermark = add_data_visitor.into_event().and_then(|e| e.new_watermark); + + Ok(current_watermark) + } + + /// Manually advances the watermark of a root dataset + #[tracing::instrument(level = "info", skip_all, fields(%new_watermark))] + async fn set_watermark( + &self, + target: ResolvedDataset, + new_watermark: DateTime, + ) -> Result { + let aliases = match self + .remote_alias_reg + .get_remote_aliases(target.get_handle()) + .await + { + Ok(v) => Ok(v), + Err(GetAliasesError::Internal(e)) => 
Err(SetWatermarkError::Internal(e)), + }?; + + if !aliases.is_empty(RemoteAliasKind::Pull) { + return Err(SetWatermarkError::IsRemote); + } + + let summary = target + .get_summary(GetSummaryOpts::default()) + .await + .int_err()?; + + if summary.kind != DatasetKind::Root { + return Err(SetWatermarkError::IsDerivative); + } + + let mut writer = DataWriterDataFusion::builder( + (*target).clone(), + datafusion::prelude::SessionContext::new(), + ) + .with_metadata_state_scanned(None) + .await + .int_err()? + .build(); + + match writer + .write_watermark( + new_watermark, + WriteWatermarkOpts { + system_time: self.system_time_source.now(), + new_source_state: None, + }, + ) + .await + { + Ok(res) => Ok(SetWatermarkResult::Updated { + old_head: Some(res.old_head), + new_head: res.new_head, + }), + Err( + WriteWatermarkError::EmptyCommit(_) + | WriteWatermarkError::CommitError(CommitError::MetadataAppendError( + AppendError::InvalidBlock(AppendValidationError::WatermarkIsNotMonotonic), + )), + ) => Ok(SetWatermarkResult::UpToDate), + Err(e) => Err(e.int_err().into()), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/benches/parallel_simple_transfer_protocol.rs b/src/infra/core/tests/benches/parallel_simple_transfer_protocol.rs index be0d7cee6b..4f9d95fa50 100644 --- a/src/infra/core/tests/benches/parallel_simple_transfer_protocol.rs +++ b/src/infra/core/tests/benches/parallel_simple_transfer_protocol.rs @@ -24,16 +24,19 @@ use kamu::utils::ipfs_wrapper::IpfsClient; use kamu::utils::simple_transfer_protocol::ENV_VAR_SIMPLE_PROTOCOL_MAX_PARALLEL_TRANSFERS; use kamu::{ DatasetFactoryImpl, + DatasetRegistryRepoBridge, DatasetRepositoryLocalFs, DatasetRepositoryWriter, DependencyGraphServiceInMemory, IpfsGateway, RemoteReposDir, RemoteRepositoryRegistryImpl, + SyncRequestBuilder, SyncServiceImpl, }; use kamu_accounts::CurrentAccountSubject; use opendatafabric::*; +use time_source::SystemTimeSourceDefault; use url::Url; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -47,31 +50,46 @@ async fn setup_dataset( tmp_workspace_dir: &Path, dataset_alias: &DatasetAlias, ipfs: Option<(IpfsGateway, IpfsClient)>, -) -> (Arc, Arc) { +) -> ( + Arc, + Arc, + Arc, +) { let (ipfs_gateway, ipfs_client) = ipfs.unwrap_or_default(); + let datasets_dir = tmp_workspace_dir.join("datasets"); + std::fs::create_dir(&datasets_dir).unwrap(); + + let repos_dir = tmp_workspace_dir.join("repos"); + std::fs::create_dir(&repos_dir).unwrap(); + let catalog = dill::CatalogBuilder::new() .add::() .add_value(ipfs_gateway) .add_value(ipfs_client) .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(tmp_workspace_dir.join("datasets")) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() - .add_value(RemoteReposDir::new(tmp_workspace_dir.join("repos"))) + .bind::() + .add::() + .add_value(RemoteReposDir::new(repos_dir)) .add::() .add::() .add::() .add::() + .add::() + .add::() .add::() .add::() .build(); + init_on_startup::run_startup_jobs(&catalog).await.unwrap(); + let sync_svc = catalog.get_one::().unwrap(); - let dataset_repo = catalog.get_one::().unwrap(); + let sync_request_builder = catalog.get_one::().unwrap(); + let dataset_registry = catalog.get_one::().unwrap(); + let 
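// Illustrative sketch (not from the patched sources): using the WatermarkService
// introduced above. `watermark_svc` and `target: ResolvedDataset` are assumed to be
// in scope (e.g. resolved through the DatasetRegistry); result and error generics are
// inferred from the impl above, so treat the exact types as an assumption.

match watermark_svc.set_watermark(target.clone(), Utc::now()).await {
    Ok(SetWatermarkResult::Updated { old_head, new_head }) => {
        tracing::info!(?old_head, ?new_head, "Watermark advanced");
    }
    Ok(SetWatermarkResult::UpToDate) => {
        // Includes the non-monotonic watermark case rejected during commit.
    }
    Err(SetWatermarkError::IsDerivative | SetWatermarkError::IsRemote) => {
        // Only local root datasets accept a manually set watermark.
    }
    Err(_e) => { /* surface internal errors to the caller */ }
}

if let Ok(Some(current)) = watermark_svc.try_get_current_watermark(target).await {
    tracing::info!(%current, "Current watermark");
}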
dataset_repo_writer = catalog.get_one::().unwrap(); // Add dataset let snapshot = MetadataFactory::dataset_snapshot() @@ -80,7 +98,7 @@ async fn setup_dataset( .push_event(MetadataFactory::set_data_schema().build()) .build(); - let _ = dataset_repo + let _ = dataset_repo_writer .create_dataset_from_snapshot(snapshot) .await .unwrap() @@ -89,37 +107,43 @@ async fn setup_dataset( append_data_to_dataset( AMOUNT_OF_BLOCKS_TO_APPEND, - dataset_repo.as_ref(), + dataset_registry.as_ref(), dataset_alias, ) .await; - (sync_svc, dataset_repo) + (sync_svc, sync_request_builder, dataset_registry) } async fn append_data_to_dataset( block_amount: usize, - dataset_repo: &dyn DatasetRepository, + dataset_registry: &dyn DatasetRegistry, dataset_ref: &DatasetAlias, ) { for _ in 1..block_amount { - let _ = - DatasetTestHelper::append_random_data(dataset_repo, dataset_ref, FILE_DATA_ARRAY_SIZE) - .await; + let _ = DatasetTestHelper::append_random_data( + dataset_registry, + dataset_ref, + FILE_DATA_ARRAY_SIZE, + ) + .await; } } async fn do_test_sync( sync_svc: Arc, + sync_request_builder: Arc, dataset_alias: &DatasetAlias, pull_repo_url: &DatasetRefRemote, push_repo_url: &DatasetRefRemote, - dataset_repo: Arc, + dataset_registry: Arc, ) { let _push_res = sync_svc .sync( - &dataset_alias.as_any_ref(), - &push_repo_url.as_any_ref(), + sync_request_builder + .build_sync_request(dataset_alias.as_any_ref(), push_repo_url.as_any_ref(), true) + .await + .unwrap(), SyncOptions::default(), None, ) @@ -127,8 +151,10 @@ async fn do_test_sync( let _pull_res = sync_svc .sync( - &pull_repo_url.as_any_ref(), - &dataset_alias.as_any_ref(), + sync_request_builder + .build_sync_request(pull_repo_url.as_any_ref(), dataset_alias.as_any_ref(), true) + .await + .unwrap(), SyncOptions::default(), None, ) @@ -138,7 +164,7 @@ async fn do_test_sync( // the same as previous append_data_to_dataset( AMOUNT_OF_BLOCKS_TO_APPEND, - dataset_repo.as_ref(), + dataset_registry.as_ref(), dataset_alias, ) .await; @@ -170,7 +196,7 @@ fn bench_with_1_parallel(c: &mut Criterion) { let (dataset_alias, pull_repo_url, push_repo_url) = rt.block_on(build_temp_dirs(&rt)); - let (sync_service_impl, dataset_repo) = rt.block_on(setup_dataset( + let (sync_service_impl, sync_request_builder, dataset_registry) = rt.block_on(setup_dataset( tmp_workspace_dir.path(), &dataset_alias, None, @@ -183,10 +209,11 @@ fn bench_with_1_parallel(c: &mut Criterion) { b.iter(|| { rt.block_on(do_test_sync( sync_service_impl.clone(), + sync_request_builder.clone(), &dataset_alias, &DatasetRefRemote::from(&pull_repo_url), &DatasetRefRemote::from(&push_repo_url), - dataset_repo.clone(), + dataset_registry.clone(), )); }); }); @@ -200,7 +227,7 @@ fn bench_with_10_parallels(c: &mut Criterion) { let (dataset_alias, pull_repo_url, push_repo_url) = rt.block_on(build_temp_dirs(&rt)); - let (sync_service_impl, dataset_repo) = rt.block_on(setup_dataset( + let (sync_service_impl, sync_request_builder, dataset_repo) = rt.block_on(setup_dataset( tmp_workspace_dir.path(), &dataset_alias, None, @@ -213,6 +240,7 @@ fn bench_with_10_parallels(c: &mut Criterion) { b.iter(|| { rt.block_on(do_test_sync( sync_service_impl.clone(), + sync_request_builder.clone(), &dataset_alias, &DatasetRefRemote::from(&pull_repo_url), &DatasetRefRemote::from(&push_repo_url), diff --git a/src/infra/core/tests/tests/engine/test_engine_io.rs b/src/infra/core/tests/tests/engine/test_engine_io.rs index 77e0332bfb..c5466c360f 100644 --- a/src/infra/core/tests/tests/engine/test_engine_io.rs +++ 
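// Illustrative sketch (not from the patched sources): SyncService::sync() now takes a
// prepared request instead of raw src/dst references; SyncRequestBuilder resolves both
// sides up front (planning), and sync() executes the transfer. `sync_svc`,
// `sync_request_builder`, `local_alias`, and `remote_ref` are assumed to be in scope;
// the trailing `true` mirrors the benchmark call above (its exact meaning is not
// visible in this hunk).

let request = sync_request_builder
    .build_sync_request(local_alias.as_any_ref(), remote_ref.as_any_ref(), true)
    .await
    .unwrap();

let sync_result = sync_svc
    .sync(request, SyncOptions::default(), None)
    .await
    .unwrap();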
b/src/infra/core/tests/tests/engine/test_engine_io.rs @@ -22,6 +22,10 @@ use kamu_datasets_services::DatasetKeyValueServiceSysEnv; use opendatafabric::*; use time_source::SystemTimeSourceDefault; +use crate::TransformTestHelper; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + async fn test_engine_io_common< TDatasetRepo: DatasetRepository + DatasetRepositoryWriter + 'static, >( @@ -37,18 +41,14 @@ async fn test_engine_io_common< let engine_provisioner = Arc::new(EngineProvisionerLocal::new( EngineProvisionerLocalConfig::default(), Arc::new(ContainerRuntime::default()), - dataset_repo.clone(), run_info_dir.clone(), )); - let dataset_action_authorizer = Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()); let object_store_registry = Arc::new(ObjectStoreRegistryImpl::new(object_stores)); let time_source = Arc::new(SystemTimeSourceDefault); let dataset_env_var_sys_env = Arc::new(DatasetKeyValueServiceSysEnv::new()); let ingest_svc = PollingIngestServiceImpl::new( - dataset_repo.clone(), - dataset_action_authorizer.clone(), Arc::new(FetchService::new( Arc::new(ContainerRuntime::default()), None, @@ -66,18 +66,15 @@ async fn test_engine_io_common< time_source.clone(), ); - let transform_svc = TransformServiceImpl::new( - dataset_repo.clone(), - dataset_action_authorizer.clone(), - engine_provisioner.clone(), - Arc::new(SystemTimeSourceDefault), + let transform_helper = TransformTestHelper::build( + Arc::new(DatasetRegistryRepoBridge::new(dataset_repo.clone())), + time_source.clone(), Arc::new(CompactionServiceImpl::new( - dataset_action_authorizer.clone(), - dataset_repo.clone(), object_store_registry.clone(), time_source.clone(), run_info_dir.clone(), )), + engine_provisioner.clone(), ); /////////////////////////////////////////////////////////////////////////// @@ -122,14 +119,15 @@ async fn test_engine_io_common< let root_alias = root_snapshot.name.clone(); - dataset_repo + let root_created = dataset_repo .create_dataset_from_snapshot(root_snapshot) .await - .unwrap(); + .unwrap() + .create_dataset_result; ingest_svc .ingest( - &root_alias.as_local_ref(), + ResolvedDataset::from(&root_created), PollingIngestOptions::default(), None, ) @@ -151,29 +149,19 @@ async fn test_engine_io_common< ) .build(); - let deriv_alias = deriv_snapshot.name.clone(); - - let dataset_deriv = dataset_repo + let deriv_created = dataset_repo .create_dataset_from_snapshot(deriv_snapshot) .await .unwrap() - .create_dataset_result - .dataset; + .create_dataset_result; - let block_hash = match transform_svc - .transform( - &deriv_alias.as_local_ref(), - TransformOptions::default(), - None, - ) - .await - .unwrap() - { + let block_hash = match transform_helper.transform_dataset(&deriv_created).await { TransformResult::Updated { new_head, .. } => new_head, v => panic!("Unexpected result: {v:?}"), }; - let block = dataset_deriv + let block = deriv_created + .dataset .as_metadata_chain() .get_block(&block_hash) .await @@ -207,27 +195,20 @@ async fn test_engine_io_common< ingest_svc .ingest( - &root_alias.as_local_ref(), + ResolvedDataset::from(&root_created), PollingIngestOptions::default(), None, ) .await .unwrap(); - let block_hash = match transform_svc - .transform( - &deriv_alias.as_local_ref(), - TransformOptions::default(), - None, - ) - .await - .unwrap() - { + let block_hash = match transform_helper.transform_dataset(&deriv_created).await { TransformResult::Updated { new_head, .. 
} => new_head, v => panic!("Unexpected result: {v:?}"), }; - let block = dataset_deriv + let block = deriv_created + .dataset .as_metadata_chain() .get_block(&block_hash) .await @@ -244,13 +225,12 @@ async fn test_engine_io_common< // Verify /////////////////////////////////////////////////////////////////////////// - let verify_result = transform_svc - .verify_transform(&deriv_alias.as_local_ref(), (None, None), None) - .await; - + let verify_result = transform_helper.verify_transform(&deriv_created).await; assert_matches!(verify_result, Ok(())); } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[test_group::group(containerized, engine, transform, datafusion)] #[test_log::test(tokio::test)] async fn test_engine_io_local_file_mount() { @@ -267,11 +247,8 @@ async fn test_engine_io_local_file_mount() { .add::() .add::() .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .build(); @@ -292,6 +269,8 @@ async fn test_engine_io_local_file_mount() { .await; } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[test_group::group(containerized, engine, transform, datafusion)] #[test_log::test(tokio::test)] async fn test_engine_io_s3_to_local_file_mount_proxy() { @@ -308,11 +287,8 @@ async fn test_engine_io_s3_to_local_file_mount_proxy() { .add::() .add::() .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryS3::builder() - .with_s3_context(s3_context.clone()) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryS3::builder().with_s3_context(s3_context.clone())) .bind::() .build(); @@ -337,3 +313,5 @@ async fn test_engine_io_s3_to_local_file_mount_proxy() { ) .await; } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/engine/test_engine_transform.rs b/src/infra/core/tests/tests/engine/test_engine_transform.rs index 4c46efbcab..001ee3bb35 100644 --- a/src/infra/core/tests/tests/engine/test_engine_transform.rs +++ b/src/infra/core/tests/tests/engine/test_engine_transform.rs @@ -26,6 +26,8 @@ use kamu_datasets_services::DatasetKeyValueServiceSysEnv; use opendatafabric::*; use time_source::{SystemTimeSource, SystemTimeSourceStub}; +use crate::TransformTestHelper; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct DatasetHelper { @@ -215,10 +217,10 @@ impl DatasetHelper { struct TestHarness { tempdir: tempfile::TempDir, - dataset_repo: Arc, + dataset_repo_writer: Arc, ingest_svc: Arc, push_ingest_svc: Arc, - transform_svc: Arc, + transform_helper: TransformTestHelper, time_source: Arc, } @@ -239,12 +241,11 @@ impl TestHarness { .add::() .add::() .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() + .bind::() + .add::() .add_value(EngineProvisionerLocalConfig::default()) .add::() 
.add_value(ObjectStoreRegistryImpl::new(vec![Arc::new( @@ -255,7 +256,9 @@ impl TestHarness { .add::() .add::() .add::() - .add::() + .add::() + .add::() + .add::() .add::() .add::() .add_value(SystemTimeSourceStub::new_set( @@ -264,13 +267,15 @@ impl TestHarness { .bind::() .build(); + let transform_helper = TransformTestHelper::from_catalog(&catalog); + Self { tempdir, - dataset_repo: catalog.get_one().unwrap(), + dataset_repo_writer: catalog.get_one().unwrap(), ingest_svc: catalog.get_one().unwrap(), push_ingest_svc: catalog.get_one().unwrap(), - transform_svc: catalog.get_one().unwrap(), time_source: catalog.get_one().unwrap(), + transform_helper, } } } @@ -325,16 +330,17 @@ async fn test_transform_common(transform: Transform, test_retractions: bool) { let root_alias = root_snapshot.name.clone(); - harness - .dataset_repo + let root_created = harness + .dataset_repo_writer .create_dataset_from_snapshot(root_snapshot) .await - .unwrap(); + .unwrap() + .create_dataset_result; harness .ingest_svc .ingest( - &root_alias.as_local_ref(), + ResolvedDataset::from(&root_created), PollingIngestOptions::default(), None, ) @@ -356,32 +362,24 @@ async fn test_transform_common(transform: Transform, test_retractions: bool) { ) .build(); - let deriv_alias = deriv_snapshot.name.clone(); - - let dataset = harness - .dataset_repo + let deriv_created = harness + .dataset_repo_writer .create_dataset_from_snapshot(deriv_snapshot) .await .unwrap() - .create_dataset_result - .dataset; + .create_dataset_result; - let deriv_helper = DatasetHelper::new(dataset.clone(), harness.tempdir.path()); - let deriv_data_helper = DatasetDataHelper::new(dataset); + let deriv_helper = DatasetHelper::new(deriv_created.dataset.clone(), harness.tempdir.path()); + let deriv_data_helper = DatasetDataHelper::new(deriv_created.dataset.clone()); harness .time_source .set(Utc.with_ymd_and_hms(2050, 1, 2, 12, 0, 0).unwrap()); let res = harness - .transform_svc - .transform( - &deriv_alias.as_local_ref(), - TransformOptions::default(), - None, - ) - .await - .unwrap(); + .transform_helper + .transform_dataset(&deriv_created) + .await; assert_matches!(res, TransformResult::Updated { .. }); // First transform writes two blocks: SetDataSchema, ExecuteTransform @@ -444,7 +442,7 @@ async fn test_transform_common(transform: Transform, test_retractions: bool) { harness .ingest_svc .ingest( - &root_alias.as_local_ref(), + ResolvedDataset::from(&root_created), PollingIngestOptions::default(), None, ) @@ -456,14 +454,9 @@ async fn test_transform_common(transform: Transform, test_retractions: bool) { .set(Utc.with_ymd_and_hms(2050, 1, 3, 12, 0, 0).unwrap()); let res = harness - .transform_svc - .transform( - &deriv_alias.as_local_ref(), - TransformOptions::default(), - None, - ) - .await - .unwrap(); + .transform_helper + .transform_dataset(&deriv_created) + .await; assert_matches!(res, TransformResult::Updated { .. 
}); // Only one block written this time @@ -512,8 +505,8 @@ async fn test_transform_common(transform: Transform, test_retractions: bool) { .await; let verify_result = harness - .transform_svc - .verify_transform(&deriv_alias.as_local_ref(), (None, None), None) + .transform_helper + .verify_transform(&deriv_created) .await; assert_matches!(verify_result, Ok(())); @@ -527,13 +520,15 @@ async fn test_transform_common(transform: Transform, test_retractions: bool) { .await; let verify_result = harness - .transform_svc - .verify_transform(&deriv_alias.as_local_ref(), (None, None), None) + .transform_helper + .verify_transform(&deriv_created) .await; assert_matches!( verify_result, - Err(VerificationError::DataNotReproducible(_)) + Err(VerifyTransformError::Execute( + VerifyTransformExecuteError::DataNotReproducible(_) + )) ); } @@ -687,7 +682,7 @@ async fn test_transform_empty_inputs() { /////////////////////////////////////////////////////////////////////////// let root = harness - .dataset_repo + .dataset_repo_writer .create_dataset_from_snapshot( MetadataFactory::dataset_snapshot() .name("root") @@ -719,7 +714,7 @@ async fn test_transform_empty_inputs() { .build(); let deriv = harness - .dataset_repo + .dataset_repo_writer .create_dataset_from_snapshot(deriv_snapshot) .await .unwrap() @@ -733,16 +728,7 @@ async fn test_transform_empty_inputs() { // 1: Input doesn't have schema yet - skip update completely /////////////////////////////////////////////////////////////////////////// - let res = harness - .transform_svc - .transform( - &deriv.dataset_handle.as_local_ref(), - TransformOptions::default(), - None, - ) - .await - .unwrap(); - + let res = harness.transform_helper.transform_dataset(&deriv).await; assert_matches!(res, TransformResult::UpToDate); /////////////////////////////////////////////////////////////////////////// @@ -772,7 +758,7 @@ async fn test_transform_empty_inputs() { let ingest_result = harness .push_ingest_svc .ingest_from_file_stream( - &root.dataset_handle.as_local_ref(), + ResolvedDataset::from(&root), None, Box::new(tokio::io::BufReader::new(std::io::Cursor::new(b""))), PushIngestOpts::default(), @@ -783,16 +769,7 @@ async fn test_transform_empty_inputs() { assert_matches!(ingest_result, PushIngestResult::Updated { .. }); - let res = harness - .transform_svc - .transform( - &deriv.dataset_handle.as_local_ref(), - TransformOptions::default(), - None, - ) - .await - .unwrap(); - + let res = harness.transform_helper.transform_dataset(&deriv).await; assert_matches!(res, TransformResult::Updated { .. }); let deriv_helper = DatasetDataHelper::new(deriv.dataset.clone()); @@ -822,7 +799,7 @@ async fn test_transform_empty_inputs() { let ingest_result = harness .push_ingest_svc .ingest_from_file_stream( - &root.dataset_handle.as_local_ref(), + ResolvedDataset::from(&root), None, Box::new(tokio::io::BufReader::new(std::io::Cursor::new( br#"{"city": "A", "population": 100}"#, @@ -835,16 +812,7 @@ async fn test_transform_empty_inputs() { assert_matches!(ingest_result, PushIngestResult::Updated { .. }); - let res = harness - .transform_svc - .transform( - &deriv.dataset_handle.as_local_ref(), - TransformOptions::default(), - None, - ) - .await - .unwrap(); - + let res = harness.transform_helper.transform_dataset(&deriv).await; assert_matches!(res, TransformResult::Updated { .. 
}); deriv_helper diff --git a/src/infra/core/tests/tests/ingest/test_polling_ingest.rs b/src/infra/core/tests/tests/ingest/test_polling_ingest.rs index ceb0f56f96..90e5478ebc 100644 --- a/src/infra/core/tests/tests/ingest/test_polling_ingest.rs +++ b/src/infra/core/tests/tests/ingest/test_polling_ingest.rs @@ -97,7 +97,7 @@ async fn test_ingest_polling_snapshot() { let dataset_alias = dataset_snapshot.name.clone(); - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; // Round 1 @@ -114,7 +114,7 @@ async fn test_ingest_polling_snapshot() { ) .unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); data_helper .assert_last_data_eq( @@ -172,7 +172,7 @@ async fn test_ingest_polling_snapshot() { .time_source .set(Utc.with_ymd_and_hms(2050, 2, 1, 12, 0, 0).unwrap()); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); data_helper .assert_last_data_records_eq(indoc!( @@ -215,7 +215,7 @@ async fn test_ingest_polling_snapshot() { .time_source .set(Utc.with_ymd_and_hms(2050, 2, 1, 12, 0, 0).unwrap()); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); let event = data_helper.get_last_block_typed::().await.event; assert_eq!(event.new_data, None); @@ -271,7 +271,7 @@ async fn test_ingest_polling_ledger() { let dataset_alias = dataset_snapshot.name.clone(); - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; // Round 1 @@ -288,7 +288,7 @@ async fn test_ingest_polling_ledger() { ) .unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); data_helper .assert_last_data_eq( indoc!( @@ -341,7 +341,7 @@ async fn test_ingest_polling_ledger() { ) .unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); data_helper .assert_last_data_records_eq(indoc!( @@ -377,7 +377,7 @@ async fn test_ingest_polling_ledger() { ) .unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); data_helper .assert_last_data_records_eq(indoc!( @@ -413,7 +413,7 @@ async fn test_ingest_polling_ledger() { ) .unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); let event = data_helper.get_last_block_typed::().await.event; assert_eq!(event.new_data, None); @@ -426,7 +426,7 @@ async fn test_ingest_polling_ledger() { // Round 5 (empty data, commit only updates the source state) std::fs::write(&src_path, "").unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); let event = data_helper.get_last_block_typed::().await.event; assert_eq!(event.new_data, None); @@ -484,11 +484,11 @@ async fn test_ingest_polling_empty_data() { let dataset_alias = dataset_snapshot.name.clone(); - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; std::fs::write(&src_path, "").unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); // Should only contain source state let event = data_helper.get_last_block_typed::().await.event; @@ -543,7 +543,7 @@ async fn 
test_ingest_polling_event_time_as_date() { let dataset_alias = dataset_snapshot.name.clone(); - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; std::fs::write( @@ -559,7 +559,7 @@ async fn test_ingest_polling_event_time_as_date() { ) .unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); data_helper .assert_last_data_eq( @@ -639,9 +639,7 @@ async fn test_ingest_polling_event_time_of_invalid_type() { }) .build(); - let dataset_alias = dataset_snapshot.name.clone(); - - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; std::fs::write( &src_path, @@ -656,7 +654,7 @@ async fn test_ingest_polling_event_time_of_invalid_type() { ) .unwrap(); - let res = harness.ingest(&dataset_alias).await; + let res = harness.ingest(&created).await; assert_matches!(res, Err(PollingIngestError::BadInputSchema(_))); } @@ -700,7 +698,7 @@ async fn test_ingest_polling_bad_column_names_preserve() { let dataset_alias = dataset_snapshot.name.clone(); - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; std::fs::write( @@ -715,7 +713,7 @@ async fn test_ingest_polling_bad_column_names_preserve() { ) .unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); data_helper .assert_last_data_eq( indoc!( @@ -792,7 +790,7 @@ async fn test_ingest_polling_bad_column_names_rename() { let dataset_alias = dataset_snapshot.name.clone(); - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; std::fs::write( @@ -807,7 +805,7 @@ async fn test_ingest_polling_bad_column_names_rename() { ) .unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); data_helper .assert_last_data_eq( @@ -885,7 +883,7 @@ async fn test_ingest_polling_schema_case_sensitivity() { let dataset_alias = dataset_snapshot.name.clone(); - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; // Round 1 @@ -902,7 +900,7 @@ async fn test_ingest_polling_schema_case_sensitivity() { ) .unwrap(); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); data_helper .assert_last_data_eq( @@ -961,7 +959,7 @@ async fn test_ingest_polling_schema_case_sensitivity() { .time_source .set(Utc.with_ymd_and_hms(2050, 1, 2, 12, 0, 0).unwrap()); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); data_helper .assert_last_data_records_eq(indoc!( @@ -1004,7 +1002,7 @@ async fn test_ingest_polling_schema_case_sensitivity() { .time_source .set(Utc.with_ymd_and_hms(2050, 1, 3, 12, 0, 0).unwrap()); - harness.ingest(&dataset_alias).await.unwrap(); + harness.ingest(&created).await.unwrap(); let event = data_helper.get_last_block_typed::().await.event; assert_eq!(event.new_data, None); @@ -1073,8 +1071,8 @@ async fn test_ingest_polling_preprocess_with_spark() { let dataset_alias = dataset_snapshot.name.clone(); - harness.create_dataset(dataset_snapshot).await; - harness.ingest(&dataset_alias).await.unwrap(); + let created 
= harness.create_dataset(dataset_snapshot).await; + harness.ingest(&created).await.unwrap(); let data_helper = harness.dataset_data_helper(&dataset_alias).await; @@ -1165,8 +1163,8 @@ async fn test_ingest_polling_preprocess_with_flink() { let dataset_alias = dataset_snapshot.name.clone(); - harness.create_dataset(dataset_snapshot).await; - harness.ingest(&dataset_alias).await.unwrap(); + let created = harness.create_dataset(dataset_snapshot).await; + harness.ingest(&created).await.unwrap(); let data_helper = harness.dataset_data_helper(&dataset_alias).await; @@ -1201,54 +1199,10 @@ async fn test_ingest_polling_preprocess_with_flink() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[test_group::group(engine, ingest, datafusion)] -#[test_log::test(tokio::test)] -async fn test_ingest_checks_auth() { - let harness = IngestTestHarness::new_with_authorizer( - MockDatasetActionAuthorizer::new().expect_check_write_dataset( - &DatasetAlias::new(None, DatasetName::new_unchecked("foo.bar")), - 1, - true, - ), - ); - let src_path = harness.temp_dir.path().join("data.json"); - - let dataset_snapshot = MetadataFactory::dataset_snapshot() - .name("foo.bar") - .kind(DatasetKind::Root) - .push_event( - MetadataFactory::set_polling_source() - .fetch_file(&src_path) - .read(ReadStepNdJson { - schema: Some(vec![ - "event_time TIMESTAMP".to_string(), - "city STRING".to_string(), - "population BIGINT".to_string(), - ]), - ..ReadStepNdJson::default() - }) - .build(), - ) - .build(); - - let dataset_alias = dataset_snapshot.name.clone(); - - harness.create_dataset(dataset_snapshot).await; - - std::fs::write( - &src_path, - r#"{"event_time": "2020-01-01T12:00:00", "city": "A", "population": 1000}"#, - ) - .unwrap(); - - harness.ingest(&dataset_alias).await.unwrap(); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - struct IngestTestHarness { temp_dir: TempDir, - dataset_repo: Arc, + dataset_registry: Arc, + dataset_repo_writer: Arc, ingest_svc: Arc, time_source: Arc, ctx: SessionContext, @@ -1256,12 +1210,6 @@ struct IngestTestHarness { impl IngestTestHarness { fn new() -> Self { - Self::new_with_authorizer(kamu_core::auth::AlwaysHappyDatasetActionAuthorizer::new()) - } - - fn new_with_authorizer( - dataset_action_authorizer: TDatasetAuthorizer, - ) -> Self { let temp_dir = tempfile::tempdir().unwrap(); let run_info_dir = temp_dir.path().join("run"); let cache_dir = temp_dir.path().join("cache"); @@ -1278,14 +1226,11 @@ impl IngestTestHarness { .add::() .add::() .add_value(CurrentAccountSubject::new_test()) - .add_value(dataset_action_authorizer) - .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() + .bind::() + .add::() .add_value(EngineProvisionerLocalConfig::default()) .add::() .add_value(SystemTimeSourceStub::new_set( @@ -1298,33 +1243,36 @@ impl IngestTestHarness { .add::() .build(); - let dataset_repo = catalog.get_one::().unwrap(); + let dataset_registry = catalog.get_one::().unwrap(); + let dataset_repo_writer = catalog.get_one::().unwrap(); let ingest_svc = catalog.get_one::().unwrap(); let time_source = catalog.get_one::().unwrap(); Self { temp_dir, - dataset_repo, + dataset_registry, + dataset_repo_writer, ingest_svc, time_source, ctx: 
SessionContext::new_with_config(SessionConfig::new().with_target_partitions(1)), } } - async fn create_dataset(&self, dataset_snapshot: DatasetSnapshot) { - self.dataset_repo + async fn create_dataset(&self, dataset_snapshot: DatasetSnapshot) -> CreateDatasetResult { + self.dataset_repo_writer .create_dataset_from_snapshot(dataset_snapshot) .await - .unwrap(); + .unwrap() + .create_dataset_result } async fn ingest( &self, - dataset_alias: &DatasetAlias, + created: &CreateDatasetResult, ) -> Result { self.ingest_svc .ingest( - &dataset_alias.as_local_ref(), + ResolvedDataset::from(created), PollingIngestOptions::default(), None, ) @@ -1332,12 +1280,12 @@ impl IngestTestHarness { } async fn dataset_data_helper(&self, dataset_alias: &DatasetAlias) -> DatasetDataHelper { - let dataset = self - .dataset_repo - .find_dataset_by_ref(&dataset_alias.as_local_ref()) + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(&dataset_alias.as_local_ref()) .await .unwrap(); - DatasetDataHelper::new_with_context(dataset, self.ctx.clone()) + DatasetDataHelper::new_with_context((*resolved_dataset).clone(), self.ctx.clone()) } } diff --git a/src/infra/core/tests/tests/ingest/test_push_ingest.rs b/src/infra/core/tests/tests/ingest/test_push_ingest.rs index a4540187b0..846693ddc9 100644 --- a/src/infra/core/tests/tests/ingest/test_push_ingest.rs +++ b/src/infra/core/tests/tests/ingest/test_push_ingest.rs @@ -55,9 +55,7 @@ async fn test_ingest_push_url_stream() { .build(); let dataset_alias = dataset_snapshot.name.clone(); - let dataset_ref = dataset_alias.as_local_ref(); - - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; // Round 1: Push from URL @@ -78,7 +76,7 @@ async fn test_ingest_push_url_stream() { harness .push_ingest_svc .ingest_from_url( - &dataset_ref, + ResolvedDataset::from(&created), None, url::Url::from_file_path(&src_path).unwrap(), PushIngestOpts::default(), @@ -138,7 +136,7 @@ async fn test_ingest_push_url_stream() { harness .push_ingest_svc .ingest_from_file_stream( - &dataset_ref, + ResolvedDataset::from(&created), None, Box::new(data), PushIngestOpts::default(), @@ -203,9 +201,7 @@ async fn test_ingest_push_media_type_override() { .build(); let dataset_alias = dataset_snapshot.name.clone(); - let dataset_ref = dataset_alias.as_local_ref(); - - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; // Push CSV conversion @@ -223,7 +219,7 @@ async fn test_ingest_push_media_type_override() { harness .push_ingest_svc .ingest_from_url( - &dataset_ref, + ResolvedDataset::from(&created), None, url::Url::from_file_path(&src_path).unwrap(), PushIngestOpts { @@ -276,7 +272,7 @@ async fn test_ingest_push_media_type_override() { harness .push_ingest_svc .ingest_from_url( - &dataset_ref, + ResolvedDataset::from(&created), None, url::Url::from_file_path(&src_path).unwrap(), PushIngestOpts { @@ -331,7 +327,7 @@ async fn test_ingest_push_media_type_override() { harness .push_ingest_svc .ingest_from_url( - &dataset_ref, + ResolvedDataset::from(&created), None, url::Url::from_file_path(&src_path).unwrap(), PushIngestOpts { @@ -398,9 +394,7 @@ async fn test_ingest_push_schema_stability() { .build(); let dataset_alias = dataset_snapshot.name.clone(); - let dataset_ref = dataset_alias.as_local_ref(); - - 
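// Illustrative sketch (not from the patched sources): ingest services now receive a
// ResolvedDataset instead of resolving a DatasetRef themselves, so tests keep the
// CreateDatasetResult from dataset creation and convert from it, as the harness above
// does. `dataset_repo_writer`, `ingest_svc`, and `snapshot` are assumed to be in scope.

let created = dataset_repo_writer
    .create_dataset_from_snapshot(snapshot)
    .await
    .unwrap()
    .create_dataset_result;

ingest_svc
    .ingest(
        ResolvedDataset::from(&created),
        PollingIngestOptions::default(),
        None,
    )
    .await
    .unwrap();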
harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; // Round 1: Push from URL @@ -421,7 +415,7 @@ async fn test_ingest_push_schema_stability() { harness .push_ingest_svc .ingest_from_url( - &dataset_ref, + ResolvedDataset::from(&created), None, url::Url::from_file_path(&src_path).unwrap(), PushIngestOpts::default(), @@ -489,9 +483,7 @@ async fn test_ingest_inference_automatic_coercion_of_event_time_from_string() { .build(); let dataset_alias = dataset_snapshot.name.clone(); - let dataset_ref = dataset_alias.as_local_ref(); - - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; let src_path = harness.temp_dir.path().join("data.ndjson"); @@ -508,7 +500,7 @@ async fn test_ingest_inference_automatic_coercion_of_event_time_from_string() { harness .push_ingest_svc .ingest_from_url( - &dataset_ref, + ResolvedDataset::from(&created), None, url::Url::from_file_path(&src_path).unwrap(), PushIngestOpts { @@ -569,9 +561,7 @@ async fn test_ingest_inference_automatic_coercion_of_event_time_from_unixtime() .build(); let dataset_alias = dataset_snapshot.name.clone(); - let dataset_ref = dataset_alias.as_local_ref(); - - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; let src_path = harness.temp_dir.path().join("data.ndjson"); @@ -588,7 +578,7 @@ async fn test_ingest_inference_automatic_coercion_of_event_time_from_unixtime() harness .push_ingest_svc .ingest_from_url( - &dataset_ref, + ResolvedDataset::from(&created), None, url::Url::from_file_path(&src_path).unwrap(), PushIngestOpts { @@ -649,9 +639,7 @@ async fn test_ingest_inference_automatic_renaming_of_conflicting_columns() { .build(); let dataset_alias = dataset_snapshot.name.clone(); - let dataset_ref = dataset_alias.as_local_ref(); - - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; let src_path = harness.temp_dir.path().join("data.ndjson"); @@ -668,7 +656,7 @@ async fn test_ingest_inference_automatic_renaming_of_conflicting_columns() { harness .push_ingest_svc .ingest_from_url( - &dataset_ref, + ResolvedDataset::from(&created), None, url::Url::from_file_path(&src_path).unwrap(), PushIngestOpts { @@ -750,9 +738,7 @@ async fn test_ingest_sql_case_sensitivity() { .build(); let dataset_alias = dataset_snapshot.name.clone(); - let dataset_ref = dataset_alias.as_local_ref(); - - harness.create_dataset(dataset_snapshot).await; + let created = harness.create_dataset(dataset_snapshot).await; let data_helper = harness.dataset_data_helper(&dataset_alias).await; let src_path = harness.temp_dir.path().join("data.ndjson"); @@ -769,7 +755,7 @@ async fn test_ingest_sql_case_sensitivity() { harness .push_ingest_svc .ingest_from_url( - &dataset_ref, + ResolvedDataset::from(&created), None, url::Url::from_file_path(&src_path).unwrap(), PushIngestOpts::default(), @@ -812,7 +798,8 @@ async fn test_ingest_sql_case_sensitivity() { struct IngestTestHarness { temp_dir: TempDir, - dataset_repo: Arc, + dataset_registry: Arc, + dataset_repo_writer: Arc, push_ingest_svc: Arc, ctx: SessionContext, } @@ -832,12 +819,11 @@ impl IngestTestHarness { 
.add_value(CacheDir::new(cache_dir)) .add_value(CurrentAccountSubject::new_test()) .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() + .bind::() + .add::() .add_value(SystemTimeSourceStub::new_set( Utc.with_ymd_and_hms(2050, 1, 1, 12, 0, 0).unwrap(), )) @@ -851,26 +837,28 @@ impl IngestTestHarness { Self { temp_dir, - dataset_repo: catalog.get_one().unwrap(), + dataset_registry: catalog.get_one().unwrap(), + dataset_repo_writer: catalog.get_one().unwrap(), push_ingest_svc: catalog.get_one().unwrap(), ctx: SessionContext::new_with_config(SessionConfig::new().with_target_partitions(1)), } } - async fn create_dataset(&self, dataset_snapshot: DatasetSnapshot) { - self.dataset_repo + async fn create_dataset(&self, dataset_snapshot: DatasetSnapshot) -> CreateDatasetResult { + self.dataset_repo_writer .create_dataset_from_snapshot(dataset_snapshot) .await - .unwrap(); + .unwrap() + .create_dataset_result } async fn dataset_data_helper(&self, dataset_alias: &DatasetAlias) -> DatasetDataHelper { - let dataset = self - .dataset_repo - .find_dataset_by_ref(&dataset_alias.as_local_ref()) + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(&dataset_alias.as_local_ref()) .await .unwrap(); - DatasetDataHelper::new_with_context(dataset, self.ctx.clone()) + DatasetDataHelper::new_with_context((*resolved_dataset).clone(), self.ctx.clone()) } } diff --git a/src/infra/core/tests/tests/ingest/test_writer.rs b/src/infra/core/tests/tests/ingest/test_writer.rs index 4e5124cc56..45e1238e88 100644 --- a/src/infra/core/tests/tests/ingest/test_writer.rs +++ b/src/infra/core/tests/tests/ingest/test_writer.rs @@ -952,11 +952,8 @@ impl Harness { let catalog = dill::CatalogBuilder::new() .add::() .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .build(); diff --git a/src/infra/core/tests/tests/mod.rs b/src/infra/core/tests/tests/mod.rs index a1ea4bd2dc..748aa4745c 100644 --- a/src/infra/core/tests/tests/mod.rs +++ b/src/infra/core/tests/tests/mod.rs @@ -16,7 +16,8 @@ mod test_dataset_ownership_service_inmem; mod test_datasets_filtering; mod test_dependency_graph_inmem; mod test_metadata_chain_comparator; -mod test_pull_service_impl; +mod test_pull_request_planner_impl; +mod test_push_request_planner_impl; mod test_query_service_impl; mod test_reset_service_impl; mod test_resource_loader_impl; @@ -27,4 +28,5 @@ mod test_setup; mod test_sync_service_impl; mod test_transform_service_impl; mod test_verification_service_impl; +mod test_watermark_service_impl; mod use_cases; diff --git a/src/infra/core/tests/tests/repos/test_dataset_repository_local_fs.rs b/src/infra/core/tests/tests/repos/test_dataset_repository_local_fs.rs index 1dbef4b09d..30922dc34c 100644 --- a/src/infra/core/tests/tests/repos/test_dataset_repository_local_fs.rs +++ b/src/infra/core/tests/tests/repos/test_dataset_repository_local_fs.rs @@ -13,7 +13,7 @@ use dill::Component; use domain::DatasetRepository; use kamu::*; use kamu_accounts::{CurrentAccountSubject, DEFAULT_ACCOUNT_NAME}; -use kamu_core::CreateDatasetFromSnapshotUseCase; +use kamu_core::{CreateDatasetFromSnapshotUseCase, TenancyConfig}; use 
messaging_outbox::{Outbox, OutboxImmediateImpl}; use tempfile::TempDir; use time_source::SystemTimeSourceDefault; @@ -29,7 +29,7 @@ struct LocalFsRepoHarness { } impl LocalFsRepoHarness { - pub fn create(tempdir: &TempDir, multi_tenant: bool) -> Self { + pub fn create(tempdir: &TempDir, tenancy_config: TenancyConfig) -> Self { let datasets_dir = tempdir.path().join("datasets"); std::fs::create_dir(&datasets_dir).unwrap(); @@ -41,11 +41,8 @@ impl LocalFsRepoHarness { ) .bind::() .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(multi_tenant), - ) + .add_value(tenancy_config) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() .add::(); @@ -69,7 +66,7 @@ impl LocalFsRepoHarness { #[tokio::test] async fn test_create_dataset() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, false); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::SingleTenant); test_dataset_repository_shared::test_create_dataset(harness.dataset_repo.as_ref(), None).await; } @@ -79,7 +76,7 @@ async fn test_create_dataset() { #[tokio::test] async fn test_create_dataset_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, true); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::MultiTenant); test_dataset_repository_shared::test_create_dataset( harness.dataset_repo.as_ref(), @@ -93,7 +90,7 @@ async fn test_create_dataset_multi_tenant() { #[tokio::test] async fn test_create_dataset_same_name_multiple_tenants() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, true); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::MultiTenant); test_dataset_repository_shared::test_create_dataset_same_name_multiple_tenants( harness.dataset_repo.as_ref(), @@ -106,7 +103,7 @@ async fn test_create_dataset_same_name_multiple_tenants() { #[tokio::test] async fn test_create_dataset_from_snapshot() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, false); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::SingleTenant); test_dataset_repository_shared::test_create_dataset_from_snapshot( harness.dataset_repo.as_ref(), @@ -120,7 +117,7 @@ async fn test_create_dataset_from_snapshot() { #[tokio::test] async fn test_create_dataset_from_snapshot_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, true); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::MultiTenant); test_dataset_repository_shared::test_create_dataset_from_snapshot( harness.dataset_repo.as_ref(), @@ -134,7 +131,7 @@ async fn test_create_dataset_from_snapshot_multi_tenant() { #[tokio::test] async fn test_rename_dataset() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, false); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::SingleTenant); test_dataset_repository_shared::test_rename_dataset(harness.dataset_repo.as_ref(), None).await; } @@ -144,7 +141,7 @@ async fn test_rename_dataset() { #[tokio::test] async fn test_rename_dataset_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, true); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::MultiTenant); 
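// Illustrative sketch (not from the patched sources): tenancy is now injected as a
// TenancyConfig enum value rather than a `with_multi_tenant(bool)` builder flag, as in
// the harness above. `datasets_dir` is assumed to be in scope; the bind::<..>() type
// arguments are omitted because this diff rendering strips generic parameters.

let catalog = dill::CatalogBuilder::new()
    .add_value(CurrentAccountSubject::new_test())
    .add_value(TenancyConfig::MultiTenant) // previously: .with_multi_tenant(true)
    .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir))
    // ... same bind::<..>() and add::<..>() registrations as in the harness above ...
    .build();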
test_dataset_repository_shared::test_rename_dataset( harness.dataset_repo.as_ref(), @@ -158,7 +155,7 @@ async fn test_rename_dataset_multi_tenant() { #[tokio::test] async fn test_rename_dataset_same_name_multiple_tenants() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, true); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::MultiTenant); test_dataset_repository_shared::test_rename_dataset_same_name_multiple_tenants( harness.dataset_repo.as_ref(), @@ -171,7 +168,7 @@ async fn test_rename_dataset_same_name_multiple_tenants() { #[tokio::test] async fn test_delete_dataset() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, false); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::SingleTenant); test_dataset_repository_shared::test_delete_dataset( harness.dataset_repo.as_ref(), @@ -186,7 +183,7 @@ async fn test_delete_dataset() { #[tokio::test] async fn test_delete_dataset_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, true); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::MultiTenant); test_dataset_repository_shared::test_delete_dataset( harness.dataset_repo.as_ref(), @@ -201,7 +198,7 @@ async fn test_delete_dataset_multi_tenant() { #[tokio::test] async fn test_iterate_datasets() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, false); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::SingleTenant); test_dataset_repository_shared::test_iterate_datasets(harness.dataset_repo.as_ref()).await; } @@ -211,7 +208,7 @@ async fn test_iterate_datasets() { #[tokio::test] async fn test_iterate_datasets_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, true); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::MultiTenant); test_dataset_repository_shared::test_iterate_datasets_multi_tenant( harness.dataset_repo.as_ref(), @@ -224,7 +221,7 @@ async fn test_iterate_datasets_multi_tenant() { #[tokio::test] async fn test_create_and_get_case_insensetive_dataset() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, false); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::SingleTenant); test_dataset_repository_shared::test_create_and_get_case_insensetive_dataset( harness.dataset_repo.as_ref(), @@ -238,7 +235,7 @@ async fn test_create_and_get_case_insensetive_dataset() { #[tokio::test] async fn test_create_and_get_case_insensetive_dataset_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, true); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::MultiTenant); test_dataset_repository_shared::test_create_and_get_case_insensetive_dataset( harness.dataset_repo.as_ref(), @@ -252,7 +249,7 @@ async fn test_create_and_get_case_insensetive_dataset_multi_tenant() { #[tokio::test] async fn test_create_multiple_datasets_with_same_id() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, false); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::SingleTenant); test_dataset_repository_shared::test_create_multiple_datasets_with_same_id( harness.dataset_repo.as_ref(), @@ -266,7 +263,7 @@ async fn test_create_multiple_datasets_with_same_id() { #[tokio::test] 
async fn test_create_multiple_datasets_with_same_id_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let harness = LocalFsRepoHarness::create(&tempdir, true); + let harness = LocalFsRepoHarness::create(&tempdir, TenancyConfig::MultiTenant); test_dataset_repository_shared::test_create_multiple_datasets_with_same_id( harness.dataset_repo.as_ref(), diff --git a/src/infra/core/tests/tests/repos/test_dataset_repository_s3.rs b/src/infra/core/tests/tests/repos/test_dataset_repository_s3.rs index ca06b98ffb..6809ebbd06 100644 --- a/src/infra/core/tests/tests/repos/test_dataset_repository_s3.rs +++ b/src/infra/core/tests/tests/repos/test_dataset_repository_s3.rs @@ -19,7 +19,7 @@ use kamu::{ S3RegistryCache, }; use kamu_accounts::{CurrentAccountSubject, DEFAULT_ACCOUNT_NAME}; -use kamu_core::{CreateDatasetFromSnapshotUseCase, DatasetRepository}; +use kamu_core::{CreateDatasetFromSnapshotUseCase, DatasetRepository, TenancyConfig}; use messaging_outbox::{Outbox, OutboxImmediateImpl}; use time_source::SystemTimeSourceDefault; @@ -34,7 +34,11 @@ struct S3RepoHarness { } impl S3RepoHarness { - pub async fn create(s3: &LocalS3Server, multi_tenant: bool, registry_caching: bool) -> Self { + pub async fn create( + s3: &LocalS3Server, + tenancy_config: TenancyConfig, + registry_caching: bool, + ) -> Self { let s3_context = S3Context::from_url(&s3.url).await; let mut b = dill::CatalogBuilder::new(); @@ -46,11 +50,8 @@ impl S3RepoHarness { ) .bind::() .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryS3::builder() - .with_s3_context(s3_context) - .with_multi_tenant(multi_tenant), - ) + .add_value(tenancy_config) + .add_builder(DatasetRepositoryS3::builder().with_s3_context(s3_context)) .bind::() .bind::() .add::(); @@ -79,7 +80,7 @@ impl S3RepoHarness { #[tokio::test] async fn test_create_dataset() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, false, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::SingleTenant, false).await; test_dataset_repository_shared::test_create_dataset(harness.dataset_repo.as_ref(), None).await; } @@ -90,7 +91,7 @@ async fn test_create_dataset() { #[tokio::test] async fn test_create_dataset_multi_tenant() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, true, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::MultiTenant, false).await; test_dataset_repository_shared::test_create_dataset( harness.dataset_repo.as_ref(), @@ -105,7 +106,7 @@ async fn test_create_dataset_multi_tenant() { #[tokio::test] async fn test_create_dataset_multi_tenant_with_caching() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, true, true).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::MultiTenant, true).await; test_dataset_repository_shared::test_create_dataset( harness.dataset_repo.as_ref(), @@ -120,7 +121,7 @@ async fn test_create_dataset_multi_tenant_with_caching() { #[tokio::test] async fn test_create_dataset_same_name_multiple_tenants() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, true, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::MultiTenant, false).await; test_dataset_repository_shared::test_create_dataset_same_name_multiple_tenants( harness.dataset_repo.as_ref(), @@ -135,7 +136,7 @@ async fn test_create_dataset_same_name_multiple_tenants() { #[tokio::test] async fn test_create_dataset_from_snapshot() { let s3 = 
LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, false, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::SingleTenant, false).await; test_dataset_repository_shared::test_create_dataset_from_snapshot( harness.dataset_repo.as_ref(), @@ -150,7 +151,7 @@ async fn test_create_dataset_from_snapshot() { #[tokio::test] async fn test_create_dataset_from_snapshot_multi_tenant() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, true, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::MultiTenant, false).await; test_dataset_repository_shared::test_create_dataset_from_snapshot( harness.dataset_repo.as_ref(), @@ -165,7 +166,7 @@ async fn test_create_dataset_from_snapshot_multi_tenant() { #[tokio::test] async fn test_rename_dataset() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, false, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::SingleTenant, false).await; test_dataset_repository_shared::test_rename_dataset(harness.dataset_repo.as_ref(), None).await; } @@ -176,7 +177,7 @@ async fn test_rename_dataset() { #[tokio::test] async fn test_rename_dataset_multi_tenant() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, true, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::MultiTenant, false).await; test_dataset_repository_shared::test_rename_dataset( harness.dataset_repo.as_ref(), @@ -191,7 +192,7 @@ async fn test_rename_dataset_multi_tenant() { #[tokio::test] async fn test_rename_dataset_multi_tenant_with_caching() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, true, true).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::MultiTenant, true).await; test_dataset_repository_shared::test_rename_dataset( harness.dataset_repo.as_ref(), @@ -206,7 +207,7 @@ async fn test_rename_dataset_multi_tenant_with_caching() { #[tokio::test] async fn test_rename_dataset_same_name_multiple_tenants() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, true, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::MultiTenant, false).await; test_dataset_repository_shared::test_rename_dataset_same_name_multiple_tenants( harness.dataset_repo.as_ref(), @@ -220,7 +221,7 @@ async fn test_rename_dataset_same_name_multiple_tenants() { #[tokio::test] async fn test_delete_dataset() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, false, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::SingleTenant, false).await; test_dataset_repository_shared::test_delete_dataset( harness.dataset_repo.as_ref(), @@ -236,7 +237,7 @@ async fn test_delete_dataset() { #[tokio::test] async fn test_delete_dataset_multi_tenant() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, true, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::MultiTenant, false).await; test_dataset_repository_shared::test_delete_dataset( harness.dataset_repo.as_ref(), @@ -252,7 +253,7 @@ async fn test_delete_dataset_multi_tenant() { #[tokio::test] async fn test_iterate_datasets() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, false, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::SingleTenant, false).await; test_dataset_repository_shared::test_iterate_datasets(harness.dataset_repo.as_ref()).await; } @@ -263,7 +264,7 @@ 
async fn test_iterate_datasets() { #[tokio::test] async fn test_iterate_datasets_multi_tenant() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, true, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::MultiTenant, false).await; test_dataset_repository_shared::test_iterate_datasets_multi_tenant( harness.dataset_repo.as_ref(), @@ -277,7 +278,7 @@ async fn test_iterate_datasets_multi_tenant() { #[tokio::test] async fn test_create_and_get_case_insensetive_dataset() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, false, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::SingleTenant, false).await; test_dataset_repository_shared::test_create_and_get_case_insensetive_dataset( harness.dataset_repo.as_ref(), @@ -292,7 +293,7 @@ async fn test_create_and_get_case_insensetive_dataset() { #[tokio::test] async fn test_create_and_get_case_insensetive_dataset_multi_tenant() { let s3 = LocalS3Server::new().await; - let harness = S3RepoHarness::create(&s3, true, false).await; + let harness = S3RepoHarness::create(&s3, TenancyConfig::MultiTenant, false).await; test_dataset_repository_shared::test_create_and_get_case_insensetive_dataset( harness.dataset_repo.as_ref(), diff --git a/src/infra/core/tests/tests/repos/test_dataset_repository_shared.rs b/src/infra/core/tests/tests/repos/test_dataset_repository_shared.rs index 0c7adfcbea..3be678d05e 100644 --- a/src/infra/core/tests/tests/repos/test_dataset_repository_shared.rs +++ b/src/infra/core/tests/tests/repos/test_dataset_repository_shared.rs @@ -27,7 +27,7 @@ pub async fn test_create_dataset< let dataset_alias = DatasetAlias::new(account_name, DatasetName::new_unchecked("foo")); assert_matches!( - repo.find_dataset_by_ref(&dataset_alias.as_local_ref()) + repo.resolve_dataset_handle_by_ref(&dataset_alias.as_local_ref()) .await .err() .unwrap(), @@ -47,7 +47,7 @@ pub async fn test_create_dataset< // We should see the dataset assert!(repo - .find_dataset_by_ref(&dataset_alias.as_local_ref()) + .resolve_dataset_handle_by_ref(&dataset_alias.as_local_ref()) .await .is_ok()); @@ -78,7 +78,7 @@ pub async fn test_create_and_get_case_insensetive_dataset< DatasetAlias::new(account_name.clone(), DatasetName::new_unchecked("Foo")); assert_matches!( - repo.find_dataset_by_ref(&dataset_alias_to_create.as_local_ref()) + repo.resolve_dataset_handle_by_ref(&dataset_alias_to_create.as_local_ref()) .await .err() .unwrap(), @@ -105,7 +105,7 @@ pub async fn test_create_and_get_case_insensetive_dataset< // We should see the dataset assert!(repo - .find_dataset_by_ref(&dataset_alias_in_another_registry.as_local_ref()) + .resolve_dataset_handle_by_ref(&dataset_alias_in_another_registry.as_local_ref()) .await .is_ok()); @@ -172,7 +172,7 @@ pub async fn test_create_dataset_same_name_multiple_tenants< ); assert_matches!( - repo.find_dataset_by_ref(&dataset_alias_my.as_local_ref()) + repo.resolve_dataset_handle_by_ref(&dataset_alias_my.as_local_ref()) .await .err() .unwrap(), @@ -180,7 +180,7 @@ pub async fn test_create_dataset_same_name_multiple_tenants< ); assert_matches!( - repo.find_dataset_by_ref(&dataset_alias_her.as_local_ref()) + repo.resolve_dataset_handle_by_ref(&dataset_alias_her.as_local_ref()) .await .err() .unwrap(), @@ -217,12 +217,12 @@ pub async fn test_create_dataset_same_name_multiple_tenants< // We should see the datasets assert!(repo - .find_dataset_by_ref(&dataset_alias_my.as_local_ref()) + .resolve_dataset_handle_by_ref(&dataset_alias_my.as_local_ref()) 
.await .is_ok()); assert!(repo - .find_dataset_by_ref(&dataset_alias_her.as_local_ref()) + .resolve_dataset_handle_by_ref(&dataset_alias_her.as_local_ref()) .await .is_ok()); @@ -265,7 +265,7 @@ pub async fn test_create_dataset_from_snapshot< let dataset_alias = DatasetAlias::new(account_name.clone(), DatasetName::new_unchecked("foo")); assert_matches!( - repo.find_dataset_by_ref(&dataset_alias.as_local_ref()) + repo.resolve_dataset_handle_by_ref(&dataset_alias.as_local_ref()) .await .err() .unwrap(), @@ -284,10 +284,11 @@ pub async fn test_create_dataset_from_snapshot< .unwrap() .create_dataset_result; - let dataset = repo - .find_dataset_by_ref(&create_result.dataset_handle.into()) + let hdl = repo + .resolve_dataset_handle_by_ref(&create_result.dataset_handle.into()) .await .unwrap(); + let dataset = repo.get_dataset_by_handle(&hdl); let actual_head = dataset .as_metadata_chain() @@ -355,10 +356,11 @@ pub async fn test_rename_dataset< .await .unwrap(); - let baz = repo - .find_dataset_by_ref(&alias_baz.as_local_ref()) + let baz_hdl = repo + .resolve_dataset_handle_by_ref(&alias_baz.as_local_ref()) .await .unwrap(); + let baz = repo.get_dataset_by_handle(&baz_hdl); use futures::StreamExt; assert_eq!(baz.as_metadata_chain().iter_blocks().count().await, 2); @@ -423,15 +425,17 @@ pub async fn test_rename_dataset_same_name_multiple_tenants< .await .unwrap(); - let my_bar = repo - .find_dataset_by_ref(&DatasetRef::try_from("my/bar").unwrap()) + let my_bar_hdl = repo + .resolve_dataset_handle_by_ref(&DatasetRef::try_from("my/bar").unwrap()) .await .unwrap(); + let my_bar = repo.get_dataset_by_handle(&my_bar_hdl); - let her_bar = repo - .find_dataset_by_ref(&DatasetRef::try_from("her/bar").unwrap()) + let her_bar_hdl = repo + .resolve_dataset_handle_by_ref(&DatasetRef::try_from("her/bar").unwrap()) .await .unwrap(); + let her_bar = repo.get_dataset_by_handle(&her_bar_hdl); assert_eq!( my_bar @@ -483,7 +487,7 @@ pub async fn test_delete_dataset< .unwrap(); assert!(repo - .find_dataset_by_ref(&alias_foo.as_local_ref()) + .resolve_dataset_handle_by_ref(&alias_foo.as_local_ref()) .await .is_ok()); @@ -492,7 +496,7 @@ pub async fn test_delete_dataset< .unwrap(); assert_matches!( - repo.find_dataset_by_ref(&alias_foo.as_local_ref()) + repo.resolve_dataset_handle_by_ref(&alias_foo.as_local_ref()) .await .err() .unwrap(), @@ -536,21 +540,21 @@ pub async fn test_iterate_datasets< // All check_expected_datasets( vec![alias_bar.clone(), alias_foo.clone()], - repo.get_all_datasets(), + repo.all_dataset_handles(), ) .await; // Default account check_expected_datasets( vec![alias_bar, alias_foo], - repo.get_datasets_by_owner(&DEFAULT_ACCOUNT_NAME), + repo.all_dataset_handles_by_owner(&DEFAULT_ACCOUNT_NAME), ) .await; // Random account check_expected_datasets( vec![], - repo.get_datasets_by_owner(&AccountName::new_unchecked("unknown-account")), + repo.all_dataset_handles_by_owner(&AccountName::new_unchecked("unknown-account")), ) .await; } @@ -625,26 +629,26 @@ pub async fn test_iterate_datasets_multi_tenant< alias_my_baz.clone(), alias_my_foo.clone(), ], - repo.get_all_datasets(), + repo.all_dataset_handles(), ) .await; check_expected_datasets( vec![alias_my_baz, alias_my_foo], - repo.get_datasets_by_owner(&account_my), + repo.all_dataset_handles_by_owner(&account_my), ) .await; check_expected_datasets( vec![alias_her_bar, alias_her_foo], - repo.get_datasets_by_owner(&account_her), + repo.all_dataset_handles_by_owner(&account_her), ) .await; // Random account check_expected_datasets( vec![], - 
repo.get_datasets_by_owner(&AccountName::new_unchecked("unknown-account")), + repo.all_dataset_handles_by_owner(&AccountName::new_unchecked("unknown-account")), ) .await; } @@ -679,7 +683,7 @@ pub async fn test_create_multiple_datasets_with_same_id< let dataset_alias = DatasetAlias::new(account_name.clone(), DatasetName::new_unchecked("foo")); assert_matches!( - repo.find_dataset_by_ref(&dataset_alias.as_local_ref()) + repo.resolve_dataset_handle_by_ref(&dataset_alias.as_local_ref()) .await .err() .unwrap(), @@ -698,7 +702,7 @@ pub async fn test_create_multiple_datasets_with_same_id< // We should see the dataset assert!(repo - .find_dataset_by_ref(&dataset_alias.as_local_ref()) + .resolve_dataset_handle_by_ref(&dataset_alias.as_local_ref()) .await .is_ok()); diff --git a/src/infra/core/tests/tests/test_compact_service_impl.rs b/src/infra/core/tests/tests/test_compact_service_impl.rs index 0f37034365..227b4a4ca2 100644 --- a/src/infra/core/tests/tests/test_compact_service_impl.rs +++ b/src/infra/core/tests/tests/test_compact_service_impl.rs @@ -8,7 +8,7 @@ // by the Apache License, Version 2.0. use std::assert_matches::assert_matches; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use chrono::{DateTime, NaiveDate, TimeDelta, TimeZone, Utc}; use datafusion::execution::config::SessionConfig; @@ -26,8 +26,7 @@ use kamu_core::auth; use opendatafabric::*; use time_source::{SystemTimeSource, SystemTimeSourceStub}; -use super::test_pull_service_impl::TestTransformService; -use crate::mock_engine_provisioner; +use crate::{mock_engine_provisioner, TransformTestHelper}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -54,9 +53,7 @@ async fn test_dataset_compact() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let prev_head = created .dataset @@ -67,12 +64,7 @@ async fn test_dataset_compact() { assert_matches!( harness - .compaction_svc - .compact_dataset( - &created.dataset_handle, - CompactionOptions::default(), - Some(Arc::new(NullCompactionListener {})) - ) + .compact_dataset(&created, CompactionOptions::default()) .await, Ok(CompactionResult::NothingToDo) ); @@ -103,20 +95,13 @@ async fn test_dataset_compact() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let old_blocks = harness.get_dataset_blocks(&dataset_ref).await; assert_matches!( harness - .compaction_svc - .compact_dataset( - &created.dataset_handle, - CompactionOptions::default(), - Some(Arc::new(NullCompactionListener {})) - ) + .compact_dataset(&created, CompactionOptions::default()) .await, Ok(CompactionResult::Success { new_head, @@ -126,7 +111,7 @@ async fn test_dataset_compact() { }) if new_head != old_head, ); - assert!(harness.verify_dataset(&dataset_ref).await); + assert!(harness.verify_dataset(&created).await); data_helper .assert_last_data_eq( @@ -196,9 +181,7 @@ async fn test_dataset_compact_s3() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let prev_head = created .dataset @@ -209,12 +192,7 @@ async fn test_dataset_compact_s3() { assert_matches!( harness - .compaction_svc - .compact_dataset( - &created.dataset_handle, - CompactionOptions::default(), - Some(Arc::new(NullCompactionListener {})) - ) + .compact_dataset(&created, CompactionOptions::default()) .await, 
Ok(CompactionResult::NothingToDo) ); @@ -245,21 +223,14 @@ async fn test_dataset_compact_s3() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let old_blocks = harness.get_dataset_blocks(&dataset_ref).await; assert_matches!( harness - .compaction_svc - .compact_dataset( - &created.dataset_handle, - CompactionOptions::default(), - Some(Arc::new(NullCompactionListener {})) - ) - .await, + .compact_dataset(&created, CompactionOptions::default()) + .await, Ok(CompactionResult::Success { new_head, old_head, @@ -268,7 +239,7 @@ async fn test_dataset_compact_s3() { }) if new_head != old_head, ); - assert!(harness.verify_dataset(&dataset_ref).await); + assert!(harness.verify_dataset(&created).await); let new_blocks = harness.get_dataset_blocks(&dataset_ref).await; @@ -306,9 +277,7 @@ async fn test_dataset_compaction_watermark_only_blocks() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; created .dataset @@ -339,9 +308,7 @@ async fn test_dataset_compaction_watermark_only_blocks() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; created .dataset @@ -371,12 +338,7 @@ async fn test_dataset_compaction_watermark_only_blocks() { // After: ... <- add_data(6 records, wm2, src2) let res = harness - .compaction_svc - .compact_dataset( - &created.dataset_handle, - CompactionOptions::default(), - Some(Arc::new(NullCompactionListener {})), - ) + .compact_dataset(&created, CompactionOptions::default()) .await .unwrap(); @@ -474,9 +436,7 @@ async fn test_dataset_compaction_limits() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let data_str = indoc!( " @@ -487,9 +447,7 @@ async fn test_dataset_compaction_limits() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let data_str = indoc!( " @@ -503,9 +461,7 @@ async fn test_dataset_compaction_limits() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let data_str = indoc!( " @@ -516,9 +472,7 @@ async fn test_dataset_compaction_limits() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let data_str = indoc!( " @@ -527,9 +481,7 @@ async fn test_dataset_compaction_limits() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; // Initial state: // seed <- add_push_source <- set_vocab <- set_schema <- add_data(3r) <- @@ -538,14 +490,12 @@ async fn test_dataset_compaction_limits() { assert_matches!( harness - .compaction_svc .compact_dataset( - &created.dataset_handle, + &created, CompactionOptions { max_slice_records: Some(6), ..CompactionOptions::default() }, - Some(Arc::new(NullCompactionListener {})) ) .await, Ok(CompactionResult::Success { @@ -555,7 +505,7 @@ async fn test_dataset_compaction_limits() { old_num_blocks: 9 }) if new_head != old_head, ); - assert!(harness.verify_dataset(&dataset_ref).await); + assert!(harness.verify_dataset(&created).await); data_helper .assert_last_data_eq( @@ -642,9 +592,7 @@ async fn test_dataset_compaction_keep_all_non_data_blocks() 
{ " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let data_str = indoc!( " @@ -655,9 +603,7 @@ async fn test_dataset_compaction_keep_all_non_data_blocks() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let current_head = harness .get_dataset_head(&created.dataset_handle.as_local_ref()) @@ -675,9 +621,7 @@ async fn test_dataset_compaction_keep_all_non_data_blocks() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; let data_str = indoc!( " @@ -688,9 +632,7 @@ async fn test_dataset_compaction_keep_all_non_data_blocks() { " ); - harness - .ingest_data(data_str.to_string(), &dataset_ref) - .await; + harness.ingest_data(data_str.to_string(), &created).await; // seed <- add_push_source <- set_vocab <- set_schema <- add_data(3r) <- // add_data(3r) <- set_licence <- add_data(3r) <- add_data(3r) @@ -698,11 +640,9 @@ async fn test_dataset_compaction_keep_all_non_data_blocks() { assert_matches!( harness - .compaction_svc .compact_dataset( - &created.dataset_handle, + &created, CompactionOptions::default(), - Some(Arc::new(NullCompactionListener {})) ) .await, Ok(CompactionResult::Success { @@ -712,7 +652,7 @@ async fn test_dataset_compaction_keep_all_non_data_blocks() { old_num_blocks: 9 }) if new_head != old_head, ); - assert!(harness.verify_dataset(&dataset_ref).await); + assert!(harness.verify_dataset(&created).await); data_helper .assert_last_data_eq( @@ -785,12 +725,7 @@ async fn test_dataset_compaction_derive_error() { assert_matches!( harness - .compaction_svc - .compact_dataset( - &created.dataset_handle, - CompactionOptions::default(), - Some(Arc::new(NullCompactionListener {})) - ) + .compact_dataset(&created, CompactionOptions::default(),) .await, Err(CompactionError::InvalidDatasetKind(_)), ); @@ -806,7 +741,7 @@ async fn test_large_dataset_compact() { let created = harness.create_test_root_dataset().await; let dataset_ref = created.dataset_handle.as_local_ref(); - harness.ingest_multiple_blocks(&dataset_ref, 100).await; + harness.ingest_multiple_blocks(&created, 100).await; let data_helper = harness.dataset_data_helper(&dataset_ref).await; @@ -844,15 +779,13 @@ async fn test_large_dataset_compact() { assert_matches!( harness - .compaction_svc .compact_dataset( - &created.dataset_handle, + &created, CompactionOptions { max_slice_records: Some(10), max_slice_size: None, ..CompactionOptions::default() }, - Some(Arc::new(NullCompactionListener {})) ) .await, Ok(CompactionResult::Success { @@ -862,7 +795,7 @@ async fn test_large_dataset_compact() { old_num_blocks: 104 }) if new_head != old_head, ); - assert!(harness.verify_dataset(&dataset_ref).await); + assert!(harness.verify_dataset(&created).await); let new_blocks = harness.get_dataset_blocks(&dataset_ref).await; @@ -917,7 +850,6 @@ async fn test_dataset_keep_metadata_only_compact() { let created_root = harness.create_test_root_dataset().await; let created_derived = harness.create_test_derived_dataset().await; - let root_dataset_ref = created_root.dataset_handle.as_local_ref(); let derived_dataset_ref = created_derived.dataset_handle.as_local_ref(); let data_str = indoc!( @@ -930,7 +862,7 @@ async fn test_dataset_keep_metadata_only_compact() { ); harness - .ingest_data(data_str.to_string(), &root_dataset_ref) + .ingest_data(data_str.to_string(), &created_root) .await; let 
prev_head = created_derived @@ -942,14 +874,12 @@ async fn test_dataset_keep_metadata_only_compact() { assert_matches!( harness - .compaction_svc .compact_dataset( - &created_derived.dataset_handle, + &created_derived, CompactionOptions { keep_metadata_only: true, ..CompactionOptions::default() }, - Some(Arc::new(NullCompactionListener {})) ) .await, Ok(CompactionResult::NothingToDo) @@ -971,22 +901,19 @@ async fn test_dataset_keep_metadata_only_compact() { // // After: seed <- set_transform let res = harness - .transform_svc - .transform(&derived_dataset_ref, TransformOptions::default(), None) - .await - .unwrap(); + .transform_helper + .transform_dataset(&created_derived) + .await; assert_matches!(res, TransformResult::Updated { .. }); assert_matches!( harness - .compaction_svc .compact_dataset( - &created_derived.dataset_handle, + &created_derived, CompactionOptions { keep_metadata_only: true, ..CompactionOptions::default() }, - Some(Arc::new(NullCompactionListener {})) ) .await, Ok(CompactionResult::Success { @@ -997,7 +924,7 @@ async fn test_dataset_keep_metadata_only_compact() { }) if new_head != old_head, ); - assert!(harness.verify_dataset(&derived_dataset_ref).await); + assert!(harness.verify_dataset(&created_derived).await); assert!( !harness .check_is_data_slices_exist(&derived_dataset_ref) @@ -1021,19 +948,17 @@ async fn test_dataset_keep_metadata_only_compact() { ); harness - .ingest_data(data_str.to_string(), &root_dataset_ref) + .ingest_data(data_str.to_string(), &created_root) .await; assert_matches!( harness - .compaction_svc .compact_dataset( - &created_root.dataset_handle, + &created_root, CompactionOptions { keep_metadata_only: true, ..CompactionOptions::default() }, - Some(Arc::new(NullCompactionListener {})) ) .await, Ok(CompactionResult::Success { @@ -1044,7 +969,7 @@ async fn test_dataset_keep_metadata_only_compact() { }) if new_head != old_head, ); - assert!(harness.verify_dataset(&derived_dataset_ref).await); + assert!(harness.verify_dataset(&created_root).await); assert!( !harness .check_is_data_slices_exist(&derived_dataset_ref) @@ -1056,12 +981,12 @@ async fn test_dataset_keep_metadata_only_compact() { struct CompactTestHarness { _temp_dir: tempfile::TempDir, - dataset_repo: Arc, + dataset_registry: Arc, dataset_repo_writer: Arc, compaction_svc: Arc, push_ingest_svc: Arc, + transform_helper: TransformTestHelper, verification_svc: Arc, - transform_svc: Arc, current_date_time: DateTime, ctx: SessionContext, } @@ -1082,13 +1007,11 @@ impl CompactTestHarness { let catalog = dill::CatalogBuilder::new() .add_value(RunInfoDir::new(run_info_dir)) .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .add_value(SystemTimeSourceStub::new_set(current_date_time)) .bind::() @@ -1097,28 +1020,31 @@ impl CompactTestHarness { .add::() .add::() .add::() + .add::() + .add::() + .add::() .add_value( mock_engine_provisioner::MockEngineProvisioner::new().stub_provision_engine(), ) .bind::() - .add::() .add::() .build(); - let dataset_repo = catalog.get_one::().unwrap(); + let dataset_registry = catalog.get_one::().unwrap(); let dataset_repo_writer = catalog.get_one::().unwrap(); let compaction_svc = catalog.get_one::().unwrap(); let push_ingest_svc = catalog.get_one::().unwrap(); - let transform_svc = 
catalog.get_one::().unwrap(); let verification_svc = catalog.get_one::().unwrap(); + let transform_helper = TransformTestHelper::from_catalog(&catalog); + Self { _temp_dir: temp_dir, - dataset_repo, + dataset_registry, dataset_repo_writer, compaction_svc, push_ingest_svc, - transform_svc, + transform_helper, verification_svc, current_date_time, ctx: SessionContext::new_with_config(SessionConfig::new().with_target_partitions(1)), @@ -1134,13 +1060,11 @@ impl CompactTestHarness { let catalog = dill::CatalogBuilder::new() .add_builder(run_info_dir.clone()) - .add_builder( - DatasetRepositoryS3::builder() - .with_s3_context(s3_context.clone()) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryS3::builder().with_s3_context(s3_context.clone())) .bind::() .bind::() + .add::() .add::() .add_value(SystemTimeSourceStub::new_set(current_date_time)) .bind::() @@ -1149,10 +1073,11 @@ impl CompactTestHarness { .add::() .add_value(ObjectStoreBuilderS3::new(s3_context.clone(), true)) .bind::() - .add_value(TestTransformService::new(Arc::new(Mutex::new(Vec::new())))) - .bind::() .add::() .add::() + .add::() + .add::() + .add::() .add::() .add::() .add_value(CurrentAccountSubject::new_test()) @@ -1160,27 +1085,29 @@ impl CompactTestHarness { let ctx = new_session_context(catalog.get_one().unwrap()); + let transform_helper = TransformTestHelper::from_catalog(&catalog); + Self { _temp_dir: temp_dir, - dataset_repo: catalog.get_one().unwrap(), + dataset_registry: catalog.get_one().unwrap(), dataset_repo_writer: catalog.get_one().unwrap(), compaction_svc: catalog.get_one().unwrap(), push_ingest_svc: catalog.get_one().unwrap(), + transform_helper, verification_svc: catalog.get_one().unwrap(), - transform_svc: catalog.get_one().unwrap(), current_date_time, ctx, } } async fn get_dataset_head(&self, dataset_ref: &DatasetRef) -> Multihash { - let dataset = self - .dataset_repo - .find_dataset_by_ref(dataset_ref) + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(dataset_ref) .await .unwrap(); - dataset + resolved_dataset .as_metadata_chain() .resolve_ref(&BlockRef::Head) .await @@ -1188,14 +1115,14 @@ impl CompactTestHarness { } async fn get_dataset_blocks(&self, dataset_ref: &DatasetRef) -> Vec { - let dataset = self - .dataset_repo - .find_dataset_by_ref(dataset_ref) + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(dataset_ref) .await .unwrap(); let head = self.get_dataset_head(dataset_ref).await; - dataset + resolved_dataset .as_metadata_chain() .iter_blocks_interval(&head, None, false) .map_ok(|(_, b)| b) @@ -1272,16 +1199,16 @@ impl CompactTestHarness { } async fn dataset_data_helper(&self, dataset_ref: &DatasetRef) -> DatasetDataHelper { - let dataset = self - .dataset_repo - .find_dataset_by_ref(dataset_ref) + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(dataset_ref) .await .unwrap(); - DatasetDataHelper::new_with_context(dataset, self.ctx.clone()) + DatasetDataHelper::new_with_context((*resolved_dataset).clone(), self.ctx.clone()) } - async fn ingest_multiple_blocks(&self, dataset_ref: &DatasetRef, amount: i64) { + async fn ingest_multiple_blocks(&self, dataset_created: &CreateDatasetResult, amount: i64) { let start_date = NaiveDate::parse_from_str("2020-01-01", "%Y-%m-%d").unwrap(); for i in 0..amount { @@ -1297,16 +1224,16 @@ impl CompactTestHarness { a_date.to_string(), b_date.to_string() ); - self.ingest_data(start_date_str, dataset_ref).await; + self.ingest_data(start_date_str, 
dataset_created).await; } } - async fn ingest_data(&self, data_str: String, dataset_ref: &DatasetRef) { + async fn ingest_data(&self, data_str: String, dataset_created: &CreateDatasetResult) { let data = std::io::Cursor::new(data_str); self.push_ingest_svc .ingest_from_file_stream( - dataset_ref, + ResolvedDataset::from(dataset_created), None, Box::new(data), PushIngestOpts::default(), @@ -1317,9 +1244,9 @@ impl CompactTestHarness { } async fn commit_set_licence_block(&self, dataset_ref: &DatasetRef, head: &Multihash) { - let dataset = self - .dataset_repo - .find_dataset_by_ref(dataset_ref) + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(dataset_ref) .await .unwrap(); let event = SetLicense { @@ -1329,7 +1256,7 @@ impl CompactTestHarness { website_url: "http://set-license.com".to_owned(), }; - dataset + resolved_dataset .commit_event( event.into(), CommitOpts { @@ -1388,17 +1315,33 @@ impl CompactTestHarness { assert_eq!(data.offset_interval, *expected); } - async fn verify_dataset(&self, dataset_ref: &DatasetRef) -> bool { + async fn verify_dataset(&self, dataset_create_result: &CreateDatasetResult) -> bool { let result = self .verification_svc .verify( - dataset_ref, - (None, None), - VerificationOptions::default(), + VerificationRequest { + target: ResolvedDataset::from(dataset_create_result), + block_range: (None, None), + options: VerificationOptions::default(), + }, None, ) .await; result.outcome.is_ok() } + + async fn compact_dataset( + &self, + dataset_create_result: &CreateDatasetResult, + compaction_options: CompactionOptions, + ) -> Result { + self.compaction_svc + .compact_dataset( + ResolvedDataset::from(dataset_create_result), + compaction_options, + Some(Arc::new(NullCompactionListener {})), + ) + .await + } } diff --git a/src/infra/core/tests/tests/test_dataset_changes_service_impl.rs b/src/infra/core/tests/tests/test_dataset_changes_service_impl.rs index 1da97c75cc..85288efe1d 100644 --- a/src/infra/core/tests/tests/test_dataset_changes_service_impl.rs +++ b/src/infra/core/tests/tests/test_dataset_changes_service_impl.rs @@ -10,41 +10,28 @@ use std::sync::Arc; use chrono::Utc; -use dill::Component; use kamu::testing::MetadataFactory; -use kamu::{DatasetChangesServiceImpl, DatasetRepositoryLocalFs, DatasetRepositoryWriter}; -use kamu_accounts::CurrentAccountSubject; -use kamu_core::{ - CommitOpts, - CreateDatasetResult, - DatasetChangesService, - DatasetIntervalIncrement, - DatasetRepository, -}; -use opendatafabric::{ - Checkpoint, - DatasetAlias, - DatasetID, - DatasetKind, - DatasetName, - MetadataEvent, - Multihash, -}; -use tempfile::TempDir; -use time_source::SystemTimeSourceDefault; +use kamu::DatasetChangesServiceImpl; +use kamu_core::{CommitOpts, DatasetChangesService, DatasetIntervalIncrement, TenancyConfig}; +use opendatafabric::{Checkpoint, DatasetAlias, DatasetID, DatasetName, MetadataEvent, Multihash}; + +use crate::BaseRepoHarness; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] async fn test_initial_increment() { let harness = DatasetChangesHarness::new(); - let foo_result = harness.create_root_dataset("foo").await; + + let foo = harness + .create_root_dataset(&DatasetAlias::new(None, DatasetName::new_unchecked("foo"))) + .await; // "foo" initially has Seed and SetPollingSource events let increment_between = harness .dataset_changes_service - .get_increment_between(&foo_result.dataset_handle.id, None, &foo_result.head) + 
.get_increment_between(&foo.dataset_handle.id, None, &foo.head) .await .unwrap(); assert_eq!( @@ -58,7 +45,7 @@ async fn test_initial_increment() { let increment_since = harness .dataset_changes_service - .get_increment_since(&foo_result.dataset_handle.id, None) + .get_increment_since(&foo.dataset_handle.id, None) .await .unwrap(); assert_eq!( @@ -76,22 +63,21 @@ async fn test_initial_increment() { #[test_log::test(tokio::test)] async fn test_no_changes_with_same_bounds() { let harness = DatasetChangesHarness::new(); - let foo_result = harness.create_root_dataset("foo").await; + + let foo = harness + .create_root_dataset(&DatasetAlias::new(None, DatasetName::new_unchecked("foo"))) + .await; let increment_between = harness .dataset_changes_service - .get_increment_between( - &foo_result.dataset_handle.id, - Some(&foo_result.head), - &foo_result.head, - ) + .get_increment_between(&foo.dataset_handle.id, Some(&foo.head), &foo.head) .await .unwrap(); assert_eq!(increment_between, DatasetIntervalIncrement::default()); let increment_since = harness .dataset_changes_service - .get_increment_since(&foo_result.dataset_handle.id, Some(&foo_result.head)) + .get_increment_since(&foo.dataset_handle.id, Some(&foo.head)) .await .unwrap(); assert_eq!(increment_since, DatasetIntervalIncrement::default()); @@ -102,15 +88,15 @@ async fn test_no_changes_with_same_bounds() { #[test_log::test(tokio::test)] async fn test_add_data_differences() { let harness = DatasetChangesHarness::new(); - let foo_result = harness.create_root_dataset("foo").await; - let dataset = harness - .dataset_repo - .get_dataset_by_handle(&foo_result.dataset_handle); + let foo = harness + .create_root_dataset(&DatasetAlias::new(None, DatasetName::new_unchecked("foo"))) + .await; // Commit SetDataSchema and 2 data nodes - let commit_result_1 = dataset + let commit_result_1 = foo + .dataset .commit_event( MetadataEvent::SetDataSchema(MetadataFactory::set_data_schema().build()), CommitOpts::default(), @@ -120,7 +106,8 @@ async fn test_add_data_differences() { let new_watermark_time = Utc::now(); - let commit_result_2 = dataset + let commit_result_2 = foo + .dataset .commit_event( MetadataEvent::AddData( MetadataFactory::add_data() @@ -140,7 +127,8 @@ async fn test_add_data_differences() { .await .unwrap(); - let commit_result_3 = dataset + let commit_result_3 = foo + .dataset .commit_event( MetadataEvent::AddData( MetadataFactory::add_data() @@ -236,7 +224,7 @@ async fn test_add_data_differences() { ]; harness - .check_between_cases(&foo_result.dataset_handle.id, &between_cases) + .check_between_cases(&foo.dataset_handle.id, &between_cases) .await; let since_cases = [ @@ -279,7 +267,7 @@ async fn test_add_data_differences() { ]; harness - .check_since_cases(&foo_result.dataset_handle.id, &since_cases) + .check_since_cases(&foo.dataset_handle.id, &since_cases) .await; } @@ -288,18 +276,23 @@ async fn test_add_data_differences() { #[test_log::test(tokio::test)] async fn test_execute_transform_differences() { let harness = DatasetChangesHarness::new(); - harness.create_root_dataset("foo").await; - let bar_result = harness.create_derived_dataset("bar", vec!["foo"]).await; - let bar_dataset = harness - .dataset_repo - .get_dataset_by_handle(&bar_result.dataset_handle); + let foo = harness + .create_root_dataset(&DatasetAlias::new(None, DatasetName::new_unchecked("foo"))) + .await; + let bar = harness + .create_derived_dataset( + &DatasetAlias::new(None, DatasetName::new_unchecked("bar")), + vec![foo.dataset_handle.as_local_ref()], + ) + .await; 
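// The `create_root_dataset()` / `create_derived_dataset()` calls above come from the
// shared `BaseRepoHarness` used throughout this patch. As a rough sketch, the root-dataset
// helper wraps what each harness previously hand-rolled through the snapshot factory; the
// signature and body below are inferred from the removed per-harness code further down in
// this patch, so treat them as an approximation rather than the actual implementation:

async fn create_root_dataset_sketch(
    dataset_repo_writer: &dyn DatasetRepositoryWriter,
    alias: &DatasetAlias,
) -> CreateDatasetResult {
    dataset_repo_writer
        .create_dataset_from_snapshot(
            MetadataFactory::dataset_snapshot()
                .name(alias.clone())
                .kind(DatasetKind::Root)
                .push_event(MetadataFactory::set_polling_source().build())
                .build(),
        )
        .await
        .unwrap()
        .create_dataset_result
}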
// Commit SetDataSchema and 2 trasnform data nodes let new_watermark_time = Utc::now(); - let commit_result_1 = bar_dataset + let commit_result_1 = bar + .dataset .commit_event( MetadataEvent::SetDataSchema(MetadataFactory::set_data_schema().build()), CommitOpts::default(), @@ -307,11 +300,12 @@ async fn test_execute_transform_differences() { .await .unwrap(); - let commit_result_2 = bar_dataset + let commit_result_2 = bar + .dataset .commit_event( MetadataEvent::ExecuteTransform( MetadataFactory::execute_transform() - .empty_query_inputs_from_seeded_ids(["foo"]) + .empty_query_inputs_from_particular_ids([foo.dataset_handle.id.clone()]) .some_new_data_with_offset(0, 14) .new_checkpoint(Some(Checkpoint { physical_hash: Multihash::from_digest_sha3_256(b"checkpoint-1"), @@ -327,11 +321,12 @@ async fn test_execute_transform_differences() { .await .unwrap(); - let commit_result_3 = bar_dataset + let commit_result_3 = bar + .dataset .commit_event( MetadataEvent::ExecuteTransform( MetadataFactory::execute_transform() - .empty_query_inputs_from_seeded_ids(["foo"]) + .empty_query_inputs_from_particular_ids([foo.dataset_handle.id.clone()]) .some_new_data_with_offset(15, 19) .prev_checkpoint(Some(Multihash::from_digest_sha3_256(b"checkpoint-1"))) .new_checkpoint(Some(Checkpoint { @@ -423,7 +418,7 @@ async fn test_execute_transform_differences() { ]; harness - .check_between_cases(&bar_result.dataset_handle.id, &between_cases) + .check_between_cases(&bar.dataset_handle.id, &between_cases) .await; let since_cases = [ @@ -466,7 +461,7 @@ async fn test_execute_transform_differences() { ]; harness - .check_since_cases(&bar_result.dataset_handle.id, &since_cases) + .check_since_cases(&bar.dataset_handle.id, &since_cases) .await; } @@ -475,15 +470,15 @@ async fn test_execute_transform_differences() { #[test_log::test(tokio::test)] async fn test_multiple_watermarks_within_interval() { let harness = DatasetChangesHarness::new(); - let foo_result = harness.create_root_dataset("foo").await; - let dataset = harness - .dataset_repo - .get_dataset_by_handle(&foo_result.dataset_handle); + let foo = harness + .create_root_dataset(&DatasetAlias::new(None, DatasetName::new_unchecked("foo"))) + .await; // Commit SetDataSchema and 2 data nodes each having a watermark - let commit_result_1 = dataset + let commit_result_1 = foo + .dataset .commit_event( MetadataEvent::SetDataSchema(MetadataFactory::set_data_schema().build()), CommitOpts::default(), @@ -493,7 +488,8 @@ async fn test_multiple_watermarks_within_interval() { let watermark_1_time = Utc::now(); - let commit_result_2 = dataset + let commit_result_2 = foo + .dataset .commit_event( MetadataEvent::AddData( MetadataFactory::add_data() @@ -516,7 +512,8 @@ async fn test_multiple_watermarks_within_interval() { let watermark_2_time = Utc::now(); - let commit_result_3 = dataset + let commit_result_3 = foo + .dataset .commit_event( MetadataEvent::AddData( MetadataFactory::add_data() @@ -582,7 +579,7 @@ async fn test_multiple_watermarks_within_interval() { ]; harness - .check_between_cases(&foo_result.dataset_handle.id, &between_cases) + .check_between_cases(&foo.dataset_handle.id, &between_cases) .await; let since_cases = [ @@ -607,7 +604,7 @@ async fn test_multiple_watermarks_within_interval() { ]; harness - .check_since_cases(&foo_result.dataset_handle.id, &since_cases) + .check_since_cases(&foo.dataset_handle.id, &since_cases) .await; } @@ -616,15 +613,14 @@ async fn test_multiple_watermarks_within_interval() { #[test_log::test(tokio::test)] async fn 
test_older_watermark_before_interval() { let harness = DatasetChangesHarness::new(); - let foo_result = harness.create_root_dataset("foo").await; - let dataset = harness - .dataset_repo - .get_dataset_by_handle(&foo_result.dataset_handle); + let foo = harness + .create_root_dataset(&DatasetAlias::new(None, DatasetName::new_unchecked("foo"))) + .await; // Commit SetDataSchema and 3 data nodes, with #1,3 containing watermark - dataset + foo.dataset .commit_event( MetadataEvent::SetDataSchema(MetadataFactory::set_data_schema().build()), CommitOpts::default(), @@ -634,7 +630,8 @@ async fn test_older_watermark_before_interval() { let watermark_1_time = Utc::now(); - let commit_result_2 = dataset + let commit_result_2 = foo + .dataset .commit_event( MetadataEvent::AddData( MetadataFactory::add_data() @@ -655,7 +652,8 @@ async fn test_older_watermark_before_interval() { .await .unwrap(); - let commit_result_3 = dataset + let commit_result_3 = foo + .dataset .commit_event( MetadataEvent::AddData( MetadataFactory::add_data() @@ -679,7 +677,8 @@ async fn test_older_watermark_before_interval() { let watermark_2_time = Utc::now(); - let commit_result_4 = dataset + let commit_result_4 = foo + .dataset .commit_event( MetadataEvent::AddData( MetadataFactory::add_data() @@ -755,7 +754,7 @@ async fn test_older_watermark_before_interval() { ]; harness - .check_between_cases(&foo_result.dataset_handle.id, &between_cases) + .check_between_cases(&foo.dataset_handle.id, &between_cases) .await; let since_cases = [ @@ -780,110 +779,34 @@ async fn test_older_watermark_before_interval() { ]; harness - .check_since_cases(&foo_result.dataset_handle.id, &since_cases) + .check_since_cases(&foo.dataset_handle.id, &since_cases) .await; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[oop::extend(BaseRepoHarness, base_repo_harness)] struct DatasetChangesHarness { - _workdir: TempDir, - _catalog: dill::Catalog, - dataset_repo: Arc, - dataset_repo_writer: Arc, + base_repo_harness: BaseRepoHarness, dataset_changes_service: Arc, } impl DatasetChangesHarness { fn new() -> Self { - let workdir = tempfile::tempdir().unwrap(); - let datasets_dir = workdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); - - let catalog = dill::CatalogBuilder::new() - .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) - .bind::() - .bind::() - .add_value(CurrentAccountSubject::new_test()) + let base_repo_harness = BaseRepoHarness::new(TenancyConfig::SingleTenant); + + let catalog = dill::CatalogBuilder::new_chained(base_repo_harness.catalog()) .add::() .build(); - let dataset_repo = catalog.get_one::().unwrap(); - let dataset_repo_writer = catalog.get_one::().unwrap(); - let dataset_changes_service = catalog.get_one::().unwrap(); Self { - _workdir: workdir, - _catalog: catalog, - dataset_repo, - dataset_repo_writer, + base_repo_harness, dataset_changes_service, } } - async fn create_root_dataset(&self, dataset_name: &str) -> CreateDatasetResult { - let alias = DatasetAlias::new(None, DatasetName::new_unchecked(dataset_name)); - let create_result = self - .dataset_repo_writer - .create_dataset( - &alias, - MetadataFactory::metadata_block( - MetadataFactory::seed(DatasetKind::Root) - .id_from(alias.dataset_name.as_str()) - .build(), - ) - .build_typed(), - ) - .await - .unwrap(); - - let commit_result = create_result - .dataset - .commit_event( - 
MetadataEvent::SetPollingSource(MetadataFactory::set_polling_source().build()), - CommitOpts::default(), - ) - .await - .unwrap(); - - CreateDatasetResult { - dataset_handle: create_result.dataset_handle, - dataset: create_result.dataset, - head: commit_result.new_head, - } - } - - async fn create_derived_dataset( - &self, - dataset_name: &str, - input_dataset_names: Vec<&str>, - ) -> CreateDatasetResult { - self.dataset_repo_writer - .create_dataset_from_snapshot( - MetadataFactory::dataset_snapshot() - .name(DatasetAlias::new( - None, - DatasetName::new_unchecked(dataset_name), - )) - .kind(DatasetKind::Derivative) - .push_event( - MetadataFactory::set_transform() - .inputs_from_aliases_and_seeded_ids(input_dataset_names) - .build(), - ) - .build(), - ) - .await - .unwrap() - .create_dataset_result - } - async fn check_between_cases( &self, dataset_id: &DatasetID, diff --git a/src/infra/core/tests/tests/test_dataset_ownership_service_inmem.rs b/src/infra/core/tests/tests/test_dataset_ownership_service_inmem.rs index 4d1f94118c..fccf03d58e 100644 --- a/src/infra/core/tests/tests/test_dataset_ownership_service_inmem.rs +++ b/src/infra/core/tests/tests/test_dataset_ownership_service_inmem.rs @@ -11,18 +11,10 @@ use std::collections::HashMap; use std::sync::Arc; use database_common::{DatabaseTransactionRunner, NoOpDatabasePlugin}; -use dill::Component; -use kamu::testing::MetadataFactory; -use kamu::{ - DatasetOwnershipServiceInMemory, - DatasetOwnershipServiceInMemoryStateInitializer, - DatasetRepositoryLocalFs, - DatasetRepositoryWriter, -}; +use kamu::{DatasetOwnershipServiceInMemory, DatasetOwnershipServiceInMemoryStateInitializer}; use kamu_accounts::{ AccountConfig, AuthenticationService, - CurrentAccountSubject, JwtAuthenticationConfig, PredefinedAccountsConfig, DEFAULT_ACCOUNT_ID, @@ -34,16 +26,16 @@ use kamu_accounts_services::{ LoginPasswordAuthProvider, PredefinedAccountsRegistrator, }; -use kamu_core::{DatasetOwnershipService, DatasetRepository}; -use opendatafabric::{AccountID, AccountName, DatasetAlias, DatasetID, DatasetKind, DatasetName}; -use tempfile::TempDir; -use time_source::SystemTimeSourceDefault; +use kamu_core::{DatasetOwnershipService, TenancyConfig}; +use opendatafabric::{AccountID, AccountName, DatasetAlias, DatasetID, DatasetName}; + +use crate::BaseRepoHarness; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] async fn test_multi_tenant_dataset_owners() { - let mut harness = DatasetOwnershipHarness::new(true).await; + let mut harness = DatasetOwnershipHarness::new(TenancyConfig::MultiTenant).await; harness.create_multi_tenant_datasets().await; harness.eager_initialization().await; @@ -84,20 +76,21 @@ async fn test_multi_tenant_dataset_owners() { } } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[oop::extend(BaseRepoHarness, base_repo_harness)] struct DatasetOwnershipHarness { - _workdir: TempDir, + base_repo_harness: BaseRepoHarness, catalog: dill::Catalog, - dataset_repo_writer: Arc, dataset_ownership_service: Arc, auth_svc: Arc, account_datasets: HashMap>, } impl DatasetOwnershipHarness { - async fn new(multi_tenant: bool) -> Self { - let workdir = tempfile::tempdir().unwrap(); - let datasets_dir = workdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); + async fn new(tenancy_config: TenancyConfig) -> Self { + let base_repo_harness = 
BaseRepoHarness::new(tenancy_config); + let predefined_accounts = [ AccountName::new_unchecked("alice"), AccountName::new_unchecked("bob"), @@ -111,18 +104,9 @@ impl DatasetOwnershipHarness { } let base_catalog = { - let mut b = dill::CatalogBuilder::new(); - - b.add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(multi_tenant), - ) - .bind::() - .bind::() - .add_value(CurrentAccountSubject::new_test()) - .add::() + let mut b = dill::CatalogBuilder::new_chained(base_repo_harness.catalog()); + + b.add::() .add::() .add_value(predefined_accounts_config.clone()) .add_value(JwtAuthenticationConfig::default()) @@ -150,15 +134,12 @@ impl DatasetOwnershipHarness { b.build() }; - let dataset_repo_writer = catalog.get_one::().unwrap(); - let dataset_ownership_service = catalog.get_one::().unwrap(); let auth_svc = catalog.get_one::().unwrap(); Self { - _workdir: workdir, + base_repo_harness, catalog, - dataset_repo_writer, dataset_ownership_service, auth_svc, account_datasets: HashMap::new(), @@ -242,20 +223,12 @@ impl DatasetOwnershipHarness { .unwrap(); let created_dataset = self - .dataset_repo_writer - .create_dataset_from_snapshot( - MetadataFactory::dataset_snapshot() - .name(DatasetAlias::new( - account_name, - DatasetName::new_unchecked(dataset_name), - )) - .kind(DatasetKind::Root) - .push_event(MetadataFactory::set_polling_source().build()) - .build(), - ) - .await - .unwrap() - .create_dataset_result; + ._super() + .create_root_dataset(&DatasetAlias::new( + account_name, + DatasetName::new_unchecked(dataset_name), + )) + .await; self.account_datasets .entry(account_id.clone()) @@ -279,24 +252,15 @@ impl DatasetOwnershipHarness { .unwrap(); let created_dataset = self - .dataset_repo_writer - .create_dataset_from_snapshot( - MetadataFactory::dataset_snapshot() - .name(DatasetAlias::new( - account_name, - DatasetName::new_unchecked(dataset_name), - )) - .kind(DatasetKind::Derivative) - .push_event( - MetadataFactory::set_transform() - .inputs_from_refs(input_aliases) - .build(), - ) - .build(), + ._super() + .create_derived_dataset( + &DatasetAlias::new(account_name, DatasetName::new_unchecked(dataset_name)), + input_aliases + .iter() + .map(DatasetAlias::as_local_ref) + .collect(), ) - .await - .unwrap() - .create_dataset_result; + .await; self.account_datasets .entry(account_id.clone()) diff --git a/src/infra/core/tests/tests/test_datasets_filtering.rs b/src/infra/core/tests/tests/test_datasets_filtering.rs index 0e3ae07378..4c9a34186a 100644 --- a/src/infra/core/tests/tests/test_datasets_filtering.rs +++ b/src/infra/core/tests/tests/test_datasets_filtering.rs @@ -8,33 +8,18 @@ // by the Apache License, Version 2.0. 
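// Every reworked test harness in this patch follows the same shape: a thin struct that
// extends the shared `BaseRepoHarness` via the `oop` macro, takes tenancy as the
// `TenancyConfig` enum rather than a boolean, and chains only its own services onto the
// base catalog. A condensed sketch of that pattern (the harness name is illustrative;
// the service registration mirrors the `DatasetChangesHarness` shown earlier):

#[oop::extend(BaseRepoHarness, base_repo_harness)]
struct ExampleHarness {
    base_repo_harness: BaseRepoHarness,
    dataset_changes_service: Arc<dyn DatasetChangesService>,
}

impl ExampleHarness {
    fn new(tenancy_config: TenancyConfig) -> Self {
        let base_repo_harness = BaseRepoHarness::new(tenancy_config);

        // Reuse the base catalog and register only the extra services this harness needs.
        let catalog = dill::CatalogBuilder::new_chained(base_repo_harness.catalog())
            .add::<DatasetChangesServiceImpl>()
            .build();

        Self {
            base_repo_harness,
            dataset_changes_service: catalog.get_one().unwrap(),
        }
    }
}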
use std::str::FromStr; -use std::sync::Arc; -use dill::Component; use futures::TryStreamExt; -use kamu::testing::MetadataFactory; use kamu::utils::datasets_filtering::{ get_local_datasets_stream, matches_local_ref_pattern, matches_remote_ref_pattern, }; -use kamu::{DatasetRepositoryLocalFs, DatasetRepositoryWriter}; -use kamu_accounts::{CurrentAccountSubject, DEFAULT_ACCOUNT_NAME}; -use kamu_core::DatasetRepository; -use opendatafabric::{ - AccountName, - DatasetAlias, - DatasetAliasRemote, - DatasetHandle, - DatasetID, - DatasetKind, - DatasetName, - DatasetRefAny, - DatasetRefAnyPattern, - RepoName, -}; -use tempfile::TempDir; -use time_source::SystemTimeSourceDefault; +use kamu_accounts::DEFAULT_ACCOUNT_NAME; +use kamu_core::TenancyConfig; +use opendatafabric::*; + +use crate::BaseRepoHarness; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -154,20 +139,19 @@ fn test_matches_remote_ref_pattern() { #[test_log::test(tokio::test)] async fn test_get_local_datasets_stream_single_tenant() { - let dataset_filtering_harness = DatasetFilteringHarness::new(false); - let foo_handle = dataset_filtering_harness - .create_root_dataset(None, "foo") - .await; - let bar_handle = dataset_filtering_harness - .create_root_dataset(None, "bar") - .await; - let baz_handle = dataset_filtering_harness - .create_root_dataset(None, "baz") - .await; + let harness = DatasetFilteringHarness::new(TenancyConfig::SingleTenant); + + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + let alias_baz = DatasetAlias::new(None, DatasetName::new_unchecked("baz")); + + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness.create_root_dataset(&alias_bar).await; + let baz = harness.create_root_dataset(&alias_baz).await; let pattern = DatasetRefAnyPattern::from_str("f%").unwrap(); let res: Vec<_> = get_local_datasets_stream( - dataset_filtering_harness.dataset_repo.as_ref(), + harness.dataset_registry(), vec![pattern], &DEFAULT_ACCOUNT_NAME, ) @@ -175,11 +159,11 @@ async fn test_get_local_datasets_stream_single_tenant() { .await .unwrap(); - assert_eq!(res, vec![foo_handle.as_any_ref()]); + assert_eq!(res, vec![foo.dataset_handle.as_any_ref()]); let pattern = DatasetRefAnyPattern::from_str("b%").unwrap(); let mut res: Vec<_> = get_local_datasets_stream( - dataset_filtering_harness.dataset_repo.as_ref(), + harness.dataset_registry(), vec![pattern], &DEFAULT_ACCOUNT_NAME, ) @@ -188,11 +172,17 @@ async fn test_get_local_datasets_stream_single_tenant() { .unwrap(); DatasetFilteringHarness::sort_datasets_by_dataset_name(&mut res); - assert_eq!(res, vec![bar_handle.as_any_ref(), baz_handle.as_any_ref()]); + assert_eq!( + res, + vec![ + bar.dataset_handle.as_any_ref(), + baz.dataset_handle.as_any_ref() + ] + ); let pattern = DatasetRefAnyPattern::from_str("s%").unwrap(); let res: Vec<_> = get_local_datasets_stream( - dataset_filtering_harness.dataset_repo.as_ref(), + harness.dataset_registry(), vec![pattern.clone()], &DEFAULT_ACCOUNT_NAME, ) @@ -207,115 +197,58 @@ async fn test_get_local_datasets_stream_single_tenant() { #[test_log::test(tokio::test)] async fn test_get_local_datasets_stream_multi_tenant() { - let dataset_filtering_harness = DatasetFilteringHarness::new(true); + let harness = DatasetFilteringHarness::new(TenancyConfig::MultiTenant); + let account_1 = AccountName::new_unchecked("account1"); let account_2 = 
AccountName::new_unchecked("account2"); - let foo_handle = dataset_filtering_harness - .create_root_dataset(Some(account_1.clone()), "foo") - .await; - let bar_handle = dataset_filtering_harness - .create_root_dataset(Some(account_2.clone()), "bar") - .await; - let baz_handle = dataset_filtering_harness - .create_root_dataset(Some(account_1.clone()), "baz") - .await; + let alias_foo = DatasetAlias::new(Some(account_1.clone()), DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(Some(account_2.clone()), DatasetName::new_unchecked("bar")); + let alias_baz = DatasetAlias::new(Some(account_1.clone()), DatasetName::new_unchecked("baz")); + + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness.create_root_dataset(&alias_bar).await; + let baz = harness.create_root_dataset(&alias_baz).await; let pattern = DatasetRefAnyPattern::from_str("account1/f%").unwrap(); - let res: Vec<_> = get_local_datasets_stream( - dataset_filtering_harness.dataset_repo.as_ref(), - vec![pattern], - &account_1, - ) - .try_collect() - .await - .unwrap(); + let res: Vec<_> = + get_local_datasets_stream(harness.dataset_registry(), vec![pattern], &account_1) + .try_collect() + .await + .unwrap(); - assert_eq!(res, vec![foo_handle.as_any_ref()]); + assert_eq!(res, vec![foo.dataset_handle.as_any_ref()]); let pattern = DatasetRefAnyPattern::from_str("account2/b%").unwrap(); - let res: Vec<_> = get_local_datasets_stream( - dataset_filtering_harness.dataset_repo.as_ref(), - vec![pattern], - &account_2, - ) - .try_collect() - .await - .unwrap(); + let res: Vec<_> = + get_local_datasets_stream(harness.dataset_registry(), vec![pattern], &account_2) + .try_collect() + .await + .unwrap(); - assert_eq!(res, vec![bar_handle.as_any_ref()]); + assert_eq!(res, vec![bar.dataset_handle.as_any_ref()]); let pattern = DatasetRefAnyPattern::from_str("account1/b%").unwrap(); - let res: Vec<_> = get_local_datasets_stream( - dataset_filtering_harness.dataset_repo.as_ref(), - vec![pattern], - &account_1, - ) - .try_collect() - .await - .unwrap(); + let res: Vec<_> = + get_local_datasets_stream(harness.dataset_registry(), vec![pattern], &account_1) + .try_collect() + .await + .unwrap(); - assert_eq!(res, vec![baz_handle.as_any_ref()]); + assert_eq!(res, vec![baz.dataset_handle.as_any_ref()]); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[oop::extend(BaseRepoHarness, base_repo_harness)] struct DatasetFilteringHarness { - _workdir: TempDir, - _catalog: dill::Catalog, - dataset_repo: Arc, - dataset_repo_writer: Arc, + base_repo_harness: BaseRepoHarness, } impl DatasetFilteringHarness { - fn new(is_multi_tenant: bool) -> Self { - let workdir = tempfile::tempdir().unwrap(); - let datasets_dir = workdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); - - let catalog = dill::CatalogBuilder::new() - .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(is_multi_tenant), - ) - .bind::() - .bind::() - .add_value(CurrentAccountSubject::new_test()) - .build(); - - let dataset_repo = catalog.get_one::().unwrap(); - let dataset_repo_writer = catalog.get_one::().unwrap(); - - Self { - _workdir: workdir, - _catalog: catalog, - dataset_repo, - dataset_repo_writer, - } - } - - async fn create_root_dataset( - &self, - account_name: Option, - dataset_name: &str, - ) -> DatasetHandle { - self.dataset_repo_writer - .create_dataset_from_snapshot( - 
MetadataFactory::dataset_snapshot() - .name(DatasetAlias::new( - account_name, - DatasetName::new_unchecked(dataset_name), - )) - .kind(DatasetKind::Root) - .push_event(MetadataFactory::set_polling_source().build()) - .build(), - ) - .await - .unwrap() - .create_dataset_result - .dataset_handle + fn new(tenancy_config: TenancyConfig) -> Self { + let base_repo_harness = BaseRepoHarness::new(tenancy_config); + Self { base_repo_harness } } fn sort_datasets_by_dataset_name(datasets: &mut [DatasetRefAny]) { @@ -335,3 +268,5 @@ impl DatasetFilteringHarness { }); } } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/test_dependency_graph_inmem.rs b/src/infra/core/tests/tests/test_dependency_graph_inmem.rs index e31fd62445..601d6a1d5c 100644 --- a/src/infra/core/tests/tests/test_dependency_graph_inmem.rs +++ b/src/infra/core/tests/tests/test_dependency_graph_inmem.rs @@ -15,18 +15,17 @@ use futures::{future, StreamExt, TryStreamExt}; use internal_error::ResultIntoInternal; use kamu::testing::MetadataFactory; use kamu::*; -use kamu_accounts::CurrentAccountSubject; use kamu_core::*; use messaging_outbox::{register_message_dispatcher, Outbox, OutboxImmediateImpl}; use opendatafabric::*; -use tempfile::TempDir; -use time_source::SystemTimeSourceDefault; + +use crate::BaseRepoHarness; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] async fn test_single_tenant_repository() { - let harness = DependencyGraphHarness::new(false); + let harness = DependencyGraphHarness::new(TenancyConfig::SingleTenant); let all_dependencies: Vec<_> = harness.list_all_dependencies().await; assert_eq!( @@ -55,7 +54,7 @@ async fn test_single_tenant_repository() { #[test_log::test(tokio::test)] async fn test_multi_tenant_repository() { - let harness = DependencyGraphHarness::new(true); + let harness = DependencyGraphHarness::new(TenancyConfig::MultiTenant); let all_dependencies: Vec<_> = harness.list_all_dependencies().await; assert_eq!( @@ -84,7 +83,7 @@ async fn test_multi_tenant_repository() { #[test_log::test(tokio::test)] async fn test_service_queries() { - let harness = DependencyGraphHarness::new(false); + let harness = DependencyGraphHarness::new(TenancyConfig::SingleTenant); harness.create_single_tenant_graph().await; harness.eager_initialization().await; @@ -123,7 +122,7 @@ async fn test_service_queries() { #[test_log::test(tokio::test)] async fn test_service_new_datasets() { - let harness = DependencyGraphHarness::new(false); + let harness = DependencyGraphHarness::new(TenancyConfig::SingleTenant); harness.create_single_tenant_graph().await; harness.eager_initialization().await; @@ -167,7 +166,7 @@ async fn test_service_new_datasets() { #[test_log::test(tokio::test)] async fn test_service_derived_dataset_modifies_links() { - let harness = DependencyGraphHarness::new(false); + let harness = DependencyGraphHarness::new(TenancyConfig::SingleTenant); harness.create_single_tenant_graph().await; harness.eager_initialization().await; @@ -259,7 +258,7 @@ async fn test_service_derived_dataset_modifies_links() { #[test_log::test(tokio::test)] async fn test_service_dataset_deleted() { - let harness = DependencyGraphHarness::new(false); + let harness = DependencyGraphHarness::new(TenancyConfig::SingleTenant); harness.create_single_tenant_graph().await; harness.eager_initialization().await; @@ -604,40 +603,29 @@ 
async fn test_in_dependency_order() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[oop::extend(BaseRepoHarness, base_repo_harness)] struct DependencyGraphHarness { - _workdir: TempDir, + base_repo_harness: BaseRepoHarness, catalog: dill::Catalog, - dataset_repo: Arc, dependency_graph_service: Arc, dependency_graph_repository: Arc, } impl DependencyGraphHarness { - fn new(multi_tenant: bool) -> Self { - let workdir = tempfile::tempdir().unwrap(); - let datasets_dir = workdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); - - let mut b = dill::CatalogBuilder::new(); - b.add::() - .add_builder( - messaging_outbox::OutboxImmediateImpl::builder() - .with_consumer_filter(messaging_outbox::ConsumerFilter::AllConsumers), - ) - .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(multi_tenant), - ) - .bind::() - .bind::() - .add_value(CurrentAccountSubject::new_test()) - .add::() - .add::() - .add::() - .add::() - .add::(); + fn new(tenancy_config: TenancyConfig) -> Self { + let base_repo_harness = BaseRepoHarness::new(tenancy_config); + + let mut b = dill::CatalogBuilder::new_chained(base_repo_harness.catalog()); + b.add_builder( + messaging_outbox::OutboxImmediateImpl::builder() + .with_consumer_filter(messaging_outbox::ConsumerFilter::AllConsumers), + ) + .bind::() + .add::() + .add::() + .add::() + .add::() + .add::(); register_message_dispatcher::( &mut b, @@ -655,9 +643,8 @@ impl DependencyGraphHarness { Arc::new(DependencyGraphRepositoryInMemory::new(dataset_repo.clone())); Self { - _workdir: workdir, + base_repo_harness, catalog, - dataset_repo, dependency_graph_service, dependency_graph_repository, } @@ -679,15 +666,15 @@ impl DependencyGraphHarness { } = dataset_dependencies; let downstream_hdl = self - .dataset_repo - .resolve_dataset_ref(&downstream_dataset_id.as_local_ref()) + .dataset_registry() + .resolve_dataset_handle_by_ref(&downstream_dataset_id.as_local_ref()) .await .unwrap(); for upstream_dataset_id in upstream_dataset_ids { let upstream_hdl = self - .dataset_repo - .resolve_dataset_ref(&upstream_dataset_id.as_local_ref()) + .dataset_registry() + .resolve_dataset_handle_by_ref(&upstream_dataset_id.as_local_ref()) .await .unwrap(); @@ -856,8 +843,8 @@ impl DependencyGraphHarness { async fn dataset_id_by_name(&self, dataset_name: &str) -> DatasetID { let dataset_alias = DatasetAlias::try_from(dataset_name).unwrap(); let dataset_hdl = self - .dataset_repo - .resolve_dataset_ref(&dataset_alias.as_local_ref()) + .dataset_registry() + .resolve_dataset_handle_by_ref(&dataset_alias.as_local_ref()) .await .unwrap(); dataset_hdl.id @@ -866,8 +853,8 @@ impl DependencyGraphHarness { async fn dataset_alias_by_id(&self, dataset_id: &DatasetID) -> DatasetAlias { let dataset_ref = dataset_id.as_local_ref(); let dataset_hdl = self - .dataset_repo - .resolve_dataset_ref(&dataset_ref) + .dataset_registry() + .resolve_dataset_handle_by_ref(&dataset_ref) .await .unwrap(); dataset_hdl.alias @@ -999,8 +986,8 @@ impl DependencyGraphHarness { DatasetAlias::new(account_name, DatasetName::new_unchecked(dataset_name)); let dataset_handle = self - .dataset_repo - .resolve_dataset_ref(&dataset_alias.as_local_ref()) + .dataset_registry() + .resolve_dataset_handle_by_ref(&dataset_alias.as_local_ref()) .await .unwrap(); @@ -1035,7 +1022,7 @@ impl DependencyGraphHarness { 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// async fn create_large_dataset_graph() -> DependencyGraphHarness { - let dependency_harness = DependencyGraphHarness::new(false); + let dependency_harness = DependencyGraphHarness::new(TenancyConfig::SingleTenant); dependency_harness.create_single_tenant_graph().await; dependency_harness.eager_initialization().await; diff --git a/src/infra/core/tests/tests/test_pull_request_planner_impl.rs b/src/infra/core/tests/tests/test_pull_request_planner_impl.rs new file mode 100644 index 0000000000..2ad4bc2e77 --- /dev/null +++ b/src/infra/core/tests/tests/test_pull_request_planner_impl.rs @@ -0,0 +1,849 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::assert_matches::assert_matches; +use std::convert::TryFrom; +use std::sync::{Arc, Mutex}; +use std::time::Duration; + +use kamu::domain::*; +use kamu::testing::*; +use kamu::utils::ipfs_wrapper::IpfsClient; +use kamu::utils::simple_transfer_protocol::SimpleTransferProtocol; +use kamu::*; +use kamu_accounts::CurrentAccountSubject; +use messaging_outbox::DummyOutboxImpl; +use opendatafabric::*; +use time_source::SystemTimeSourceDefault; + +use crate::BaseRepoHarness; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +macro_rules! n { + ($s:expr) => { + DatasetAlias::new(None, DatasetName::try_from($s).unwrap()) + }; +} + +macro_rules! mn { + ($s:expr) => { + DatasetAlias::try_from($s).unwrap() + }; +} + +macro_rules! rl { + ($s:expr) => { + DatasetRef::Alias(DatasetAlias::new(None, DatasetName::try_from($s).unwrap())) + }; +} + +macro_rules! rr { + ($s:expr) => { + DatasetRefRemote::try_from($s).unwrap() + }; +} + +macro_rules! ar { + ($s:expr) => { + DatasetRefAny::try_from($s).unwrap() + }; +} + +macro_rules! names { + [] => { + vec![] + }; + [$x:expr] => { + vec![n!($x)] + }; + [$x:expr, $($y:expr),+] => { + vec![n!($x), $(n!($y)),+] + }; +} + +macro_rules! mnames { + [] => { + vec![] + }; + [$x:expr] => { + vec![mn!($x)] + }; + [$x:expr, $($y:expr),+] => { + vec![mn!($x), $(mn!($y)),+] + }; +} + +macro_rules! refs { + [] => { + vec![] + }; + [$x:expr] => { + vec![ar!($x)] + }; + [$x:expr, $($y:expr),+] => { + vec![ar!($x), $(ar!($y)),+] + }; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +async fn create_graph( + dataset_repo_writer: &dyn DatasetRepositoryWriter, + datasets: Vec<(DatasetAlias, Vec)>, +) { + for (dataset_alias, deps) in datasets { + dataset_repo_writer + .create_dataset_from_snapshot( + MetadataFactory::dataset_snapshot() + .name(dataset_alias) + .kind(if deps.is_empty() { + DatasetKind::Root + } else { + DatasetKind::Derivative + }) + .push_event::(if deps.is_empty() { + MetadataFactory::set_polling_source().build().into() + } else { + MetadataFactory::set_transform() + .inputs_from_refs(deps) + .build() + .into() + }) + .build(), + ) + .await + .unwrap(); + } +} + +// Adding a remote dataset is a bit of a pain. +// We cannot add a local dataset and then add a pull alias without adding all of +// its dependencies too. 
So instead we're creating a repository based on temp +// dir and syncing it into the main workspace. TODO: Add simpler way to import +// remote dataset +async fn create_graph_remote( + remote_repo_name: &str, + harness: &PullTestHarness, + datasets: Vec<(DatasetAlias, Vec)>, + to_import: Vec, +) -> tempfile::TempDir { + let tmp_repo_dir = tempfile::tempdir().unwrap(); + + let remote_dataset_repo = DatasetRepositoryLocalFs::new( + tmp_repo_dir.path().to_owned(), + Arc::new(CurrentAccountSubject::new_test()), + Arc::new(TenancyConfig::SingleTenant), + Arc::new(SystemTimeSourceDefault), + ); + + create_graph(&remote_dataset_repo, datasets).await; + + let tmp_repo_name = RepoName::new_unchecked(remote_repo_name); + + harness + .remote_repo_reg + .add_repository( + &tmp_repo_name, + url::Url::from_file_path(tmp_repo_dir.path()).unwrap(), + ) + .unwrap(); + + for import_alias in to_import { + harness + .sync_service + .sync( + harness + .sync_request_builder + .build_sync_request( + import_alias + .as_remote_alias(tmp_repo_name.clone()) + .into_any_ref(), + import_alias.into_any_ref(), + true, + ) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .unwrap(); + } + + tmp_repo_dir +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_pull_batching_chain() { + let harness = PullTestHarness::new(TenancyConfig::SingleTenant); + + // A - B - C + create_graph( + harness.dataset_repo_writer(), + vec![ + (n!("a"), names![]), + (n!("b"), names!["a"]), + (n!("c"), names!["b"]), + ], + ) + .await; + + assert_eq!( + harness + .pull(refs!["c"], PullOptions::default()) + .await + .unwrap(), + vec![vec![PullJob::Transform(ar!["c"])]] + ); + + assert_eq!( + harness + .pull(refs!["c", "a"], PullOptions::default()) + .await + .unwrap(), + vec![ + vec![PullJob::Ingest(ar!["a"])], + vec![PullJob::Transform(ar!["c"])] + ], + ); + + assert_eq!( + harness + .pull( + refs!["c"], + PullOptions { + recursive: true, + ..PullOptions::default() + } + ) + .await + .unwrap(), + vec![ + vec![PullJob::Ingest(ar!["a"])], + vec![PullJob::Transform(ar!["b"])], + vec![PullJob::Transform(ar!["c"])], + ] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_pull_batching_chain_multi_tenant() { + let harness = PullTestHarness::new(TenancyConfig::MultiTenant); + + // XA - YB - ZC + create_graph( + harness.dataset_repo_writer(), + vec![ + (mn!("x/a"), mnames![]), + (mn!("y/b"), mnames!["x/a"]), + (mn!("z/c"), mnames!["y/b"]), + ], + ) + .await; + + assert_eq!( + harness + .pull(refs!["z/c"], PullOptions::default()) + .await + .unwrap(), + vec![vec![PullJob::Transform(mn!["z/c"].as_any_ref())]] + ); + + assert_eq!( + harness + .pull(refs!["z/c", "x/a"], PullOptions::default()) + .await + .unwrap(), + vec![ + vec![PullJob::Ingest(mn!["x/a"].as_any_ref())], + vec![PullJob::Transform(mn!["z/c"].as_any_ref())], + ], + ); + + assert_eq!( + harness + .pull( + refs!["z/c"], + PullOptions { + recursive: true, + ..PullOptions::default() + } + ) + .await + .unwrap(), + vec![ + vec![PullJob::Ingest(mn!["x/a"].as_any_ref())], + vec![PullJob::Transform(mn!["y/b"].as_any_ref())], + vec![PullJob::Transform(mn!["z/c"].as_any_ref())], + ] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + 
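The pull-batching assertions in this new test file compare plans reported by the planner (which carries resolved dataset handles) against alias-based references built with the `ar!`/`mn!` macros above; that only works because of the custom `PullJob` equality defined near the end of the file, which normalizes both kinds of references before comparing. A minimal sketch of that idea, where `foo_handle` stands in for a hypothetical `DatasetHandle` resolved for local dataset "a" (not code from the patch):

// Sketch only: `foo_handle` is a hypothetical DatasetHandle for dataset "a".
// PullJob's PartialEq (see `cmp_ref`/`tuplify` further below) reduces both a
// LocalHandle-based ref and a LocalAlias-based ref to the same tuple when the
// account and dataset names match, so these two jobs compare equal.
let expected = PullJob::Ingest(ar!("a"));
let actual = PullJob::Ingest(foo_handle.as_any_ref());
assert_eq!(expected, actual);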
+#[test_log::test(tokio::test)] +async fn test_pull_batching_complex() { + let harness = PullTestHarness::new(TenancyConfig::SingleTenant); + + // / C \ + // A < > > E + // \ D / / + // / + // B - - -/ + create_graph( + harness.dataset_repo_writer(), + vec![ + (n!("a"), names![]), + (n!("b"), names![]), + (n!("c"), names!["a"]), + (n!("d"), names!["a"]), + (n!("e"), names!["c", "d", "b"]), + ], + ) + .await; + + assert_eq!( + harness + .pull(refs!["e"], PullOptions::default()) + .await + .unwrap(), + vec![vec![PullJob::Transform(ar!["e"])]] + ); + + assert_matches!( + harness + .pull(vec![ar!("z")], PullOptions::default()) + .await + .err() + .unwrap()[0], + PullResponse { + result: Err(PullError::NotFound(_)), + .. + }, + ); + + assert_eq!( + harness + .pull( + refs!["e"], + PullOptions { + recursive: true, + ..PullOptions::default() + } + ) + .await + .unwrap(), + vec![ + vec![PullJob::Ingest(ar!["a"]), PullJob::Ingest(ar!["b"])], + vec![PullJob::Transform(ar!["c"]), PullJob::Transform(ar!["d"])], + vec![PullJob::Transform(ar!["e"])], + ] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_pull_batching_complex_with_remote() { + let harness = PullTestHarness::new(TenancyConfig::SingleTenant); + + // (A) - (E) - F - G + // (B) --/ / / + // C --------/ / + // D -----------/ + let _remote_tmp_dir = create_graph_remote( + "kamu.dev", + &harness, + vec![ + (n!("a"), names![]), + (n!("b"), names![]), + (n!("e"), names!["a", "b"]), + ], + names!("e"), + ) + .await; + create_graph( + harness.dataset_repo_writer(), + vec![ + (n!("c"), names![]), + (n!("d"), names![]), + (n!("f"), names!["e", "c"]), + (n!("g"), names!["f", "d"]), + ], + ) + .await; + + // Add remote pull alias to E + harness + .get_remote_aliases(&rl!("e")) + .await + .add( + &DatasetRefRemote::try_from("kamu.dev/e").unwrap(), + RemoteAliasKind::Pull, + ) + .await + .unwrap(); + + // Pulling E results in a sync + assert_eq!( + harness + .pull( + refs!["e"], + PullOptions { + recursive: true, + ..PullOptions::default() + } + ) + .await + .unwrap(), + vec![vec![PullJob::Sync(( + rr!("kamu.dev/e").into(), + n!("e").into() + ))]], + ); + + // Explicit remote reference associates with E + assert_eq!( + harness + .pull( + refs!["kamu.dev/e"], + PullOptions { + recursive: true, + ..PullOptions::default() + } + ) + .await + .unwrap(), + vec![vec![PullJob::Sync(( + rr!("kamu.dev/e").into(), + n!("e").into() + ))]], + ); + + // Remote is recursed onto + assert_eq!( + harness + .pull( + refs!["g"], + PullOptions { + recursive: true, + ..PullOptions::default() + } + ) + .await + .unwrap(), + vec![ + vec![ + PullJob::Sync((rr!("kamu.dev/e").into(), n!("e").into())), + PullJob::Ingest(ar!("c")), + PullJob::Ingest(ar!("d")), + ], + vec![PullJob::Transform(ar!("f"))], + vec![PullJob::Transform(ar!("g"))], + ], + ); + + // Remote is recursed onto while also specified explicitly (via local ID) + assert_eq!( + harness + .pull( + refs!["g", "e"], + PullOptions { + recursive: true, + ..PullOptions::default() + } + ) + .await + .unwrap(), + vec![ + vec![ + PullJob::Sync((rr!("kamu.dev/e").into(), n!("e").into())), + PullJob::Ingest(ar!("c")), + PullJob::Ingest(ar!("d")) + ], + vec![PullJob::Transform(ar!("f"))], + vec![PullJob::Transform(ar!("g"))], + ], + ); + + // Remote is recursed onto while also specified explicitly (via remote ref) + assert_eq!( + harness + .pull( + refs!["g", "kamu.dev/e"], + PullOptions { + recursive: 
true, + ..PullOptions::default() + } + ) + .await + .unwrap(), + vec![ + vec![ + PullJob::Sync((rr!("kamu.dev/e").into(), n!("e").into())), + PullJob::Ingest(ar!("c")), + PullJob::Ingest(ar!("d")) + ], + vec![PullJob::Transform(ar!("f"))], + vec![PullJob::Transform(ar!("g"))], + ], + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_sync_from() { + let harness = PullTestHarness::new(TenancyConfig::SingleTenant); + + let _remote_tmp_dir = + create_graph_remote("kamu.dev", &harness, vec![(n!("foo"), names![])], names!()).await; + + let res = harness + .pull_with_requests( + vec![PullRequest::Remote(PullRequestRemote { + maybe_local_alias: Some(n!("bar")), + remote_ref: rr!("kamu.dev/foo"), + })], + PullOptions::default(), + ) + .await + .unwrap(); + + assert_eq!( + res, + vec![vec![PullJob::Sync(( + rr!("kamu.dev/foo").into(), + n!("bar").into() + ))]] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_sync_from_url_and_local_ref() { + let harness = PullTestHarness::new(TenancyConfig::SingleTenant); + + let _remote_tmp_dir = + create_graph_remote("kamu.dev", &harness, vec![(n!("bar"), names![])], names!()).await; + + let res = harness + .pull_with_requests( + vec![PullRequest::Remote(PullRequestRemote { + maybe_local_alias: Some(n!("bar")), + remote_ref: rr!("kamu.dev/bar"), + })], + PullOptions::default(), + ) + .await + .unwrap(); + + assert_eq!( + res, + vec![vec![PullJob::Sync(( + rr!("kamu.dev/bar").into(), + n!("bar").into() + ))]] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_sync_from_url_and_local_multi_tenant_ref() { + let harness = PullTestHarness::new(TenancyConfig::MultiTenant); + + let _remote_tmp_dir = + create_graph_remote("kamu.dev", &harness, vec![(n!("bar"), names![])], names!()).await; + + let res = harness + .pull_with_requests( + vec![PullRequest::Remote(PullRequestRemote { + maybe_local_alias: Some(mn!("x/bar")), + remote_ref: rr!("kamu.dev/bar"), + })], + PullOptions::default(), + ) + .await + .unwrap(); + + assert_eq!( + res, + vec![vec![PullJob::Sync(( + rr!("kamu.dev/bar").into(), + mn!("x/bar").into() + ))]] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_sync_from_url_only() { + let harness = PullTestHarness::new(TenancyConfig::SingleTenant); + + let _remote_tmp_dir = + create_graph_remote("kamu.dev", &harness, vec![(n!("bar"), names![])], names!()).await; + + let res = harness + .pull_with_requests( + vec![PullRequest::Remote(PullRequestRemote { + maybe_local_alias: None, + remote_ref: rr!("kamu.dev/bar"), + })], + PullOptions::default(), + ) + .await + .unwrap(); + + assert_eq!( + res, + vec![vec![PullJob::Sync(( + rr!("kamu.dev/bar").into(), + n!("bar").into() + ))]] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_sync_from_url_only_multi_tenant_case() { + let harness = PullTestHarness::new(TenancyConfig::MultiTenant); + + let _remote_tmp_dir = + create_graph_remote("kamu.dev", &harness, vec![(n!("bar"), names![])], names!()).await; + + let res = harness + .pull_with_requests( + 
vec![PullRequest::Remote(PullRequestRemote { + maybe_local_alias: None, + remote_ref: rr!("kamu.dev/bar"), + })], + PullOptions::default(), + ) + .await + .unwrap(); + + assert_eq!( + res, + vec![vec![PullJob::Sync(( + rr!("kamu.dev/bar").into(), + n!("bar").into() + ))]] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[oop::extend(BaseRepoHarness, base_repo_harness)] +struct PullTestHarness { + base_repo_harness: BaseRepoHarness, + calls: Arc>>>, + sync_service: Arc, + sync_request_builder: Arc, + remote_repo_reg: Arc, + remote_alias_reg: Arc, + pull_request_planner: Arc, + tenancy_config: TenancyConfig, +} + +impl PullTestHarness { + fn new(tenancy_config: TenancyConfig) -> Self { + let base_repo_harness = BaseRepoHarness::new(tenancy_config); + + let calls = Arc::new(Mutex::new(Vec::new())); + + let repos_dir = base_repo_harness.temp_dir_path().join("repos"); + std::fs::create_dir(&repos_dir).unwrap(); + + let catalog = dill::CatalogBuilder::new_chained(base_repo_harness.catalog()) + .add_value(RemoteRepositoryRegistryImpl::create(repos_dir).unwrap()) + .bind::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add_value(IpfsClient::default()) + .add_value(IpfsGateway::default()) + .build(); + + Self { + base_repo_harness, + calls, + sync_service: catalog.get_one().unwrap(), + sync_request_builder: catalog.get_one().unwrap(), + remote_repo_reg: catalog.get_one().unwrap(), + remote_alias_reg: catalog.get_one().unwrap(), + pull_request_planner: catalog.get_one().unwrap(), + tenancy_config, + } + } + + fn collect_calls(&self) -> Vec> { + let mut calls = Vec::new(); + std::mem::swap(self.calls.lock().unwrap().as_mut(), &mut calls); + calls + } + + async fn pull( + &self, + refs: Vec, + options: PullOptions, + ) -> Result>, Vec> { + let requests: Vec<_> = refs + .into_iter() + .map(|r| { + PullRequest::from_any_ref(&r, |_| { + self.tenancy_config == TenancyConfig::SingleTenant + }) + }) + .collect(); + self.pull_with_requests(requests, options).await + } + + async fn pull_with_requests( + &self, + requests: Vec, + options: PullOptions, + ) -> Result>, Vec> { + let (plan_iterations, errors) = self + .pull_request_planner + .build_pull_multi_plan(&requests, &options, self.tenancy_config) + .await; + if !errors.is_empty() { + return Err(errors); + } + + for iteration in plan_iterations { + let mut jobs = Vec::new(); + for job in iteration.jobs { + match job { + PullPlanIterationJob::Ingest(pii) => { + jobs.push(PullJob::Ingest(pii.target.get_handle().as_any_ref())); + } + PullPlanIterationJob::Transform(pti) => { + jobs.push(PullJob::Transform(pti.target.get_handle().as_any_ref())); + } + PullPlanIterationJob::Sync(psi) => { + jobs.push(PullJob::Sync(( + psi.sync_request.src.as_user_friendly_any_ref(), + psi.sync_request.dst.as_user_friendly_any_ref(), + ))); + } + }; + } + + self.calls.lock().unwrap().push(jobs); + } + + tokio::time::sleep(Duration::from_millis(1)).await; + + Ok(self.collect_calls()) + } + + async fn get_remote_aliases(&self, dataset_ref: &DatasetRef) -> Box { + let hdl = self + .dataset_registry() + .resolve_dataset_handle_by_ref(dataset_ref) + .await + .unwrap(); + self.remote_alias_reg + .get_remote_aliases(&hdl) + .await + .unwrap() + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Eq)] +enum 
PullJob { + Ingest(DatasetRefAny), + Transform(DatasetRefAny), + Sync((DatasetRefAny, DatasetRefAny)), +} + +impl PullJob { + fn cmp_ref(lhs: &DatasetRefAny, rhs: &DatasetRefAny) -> bool { + #[allow(clippy::type_complexity)] + fn tuplify( + v: &DatasetRefAny, + ) -> ( + Option<&DatasetID>, + Option<&url::Url>, + Option<&str>, + Option<&str>, + Option<&DatasetName>, + ) { + match v { + DatasetRefAny::ID(_, id) => (Some(id), None, None, None, None), + DatasetRefAny::Url(url) => (None, Some(url), None, None, None), + DatasetRefAny::LocalAlias(a, n) => { + (None, None, None, a.as_ref().map(AsRef::as_ref), Some(n)) + } + DatasetRefAny::RemoteAlias(r, a, n) => ( + None, + None, + Some(r.as_ref()), + a.as_ref().map(AsRef::as_ref), + Some(n), + ), + DatasetRefAny::AmbiguousAlias(ra, n) => { + (None, None, Some(ra.as_ref()), None, Some(n)) + } + DatasetRefAny::LocalHandle(h) => ( + None, + None, + None, + h.alias.account_name.as_ref().map(AccountName::as_str), + Some(&h.alias.dataset_name), + ), + DatasetRefAny::RemoteHandle(h) => ( + None, + None, + Some(h.alias.repo_name.as_str()), + h.alias.account_name.as_ref().map(AccountName::as_str), + Some(&h.alias.dataset_name), + ), + } + } + tuplify(lhs) == tuplify(rhs) + } +} + +impl std::cmp::PartialEq for PullJob { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::Transform(l), Self::Transform(r)) | (Self::Ingest(l), Self::Ingest(r)) => { + Self::cmp_ref(l, r) + } + (Self::Sync(l), Self::Sync(r)) => { + Self::cmp_ref(&l.0, &r.0) && Self::cmp_ref(&l.1, &r.1) + } + _ => false, + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/test_pull_service_impl.rs b/src/infra/core/tests/tests/test_pull_service_impl.rs deleted file mode 100644 index f8efabea8d..0000000000 --- a/src/infra/core/tests/tests/test_pull_service_impl.rs +++ /dev/null @@ -1,1252 +0,0 @@ -// Copyright Kamu Data, Inc. and contributors. All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. - -use std::assert_matches::assert_matches; -use std::convert::TryFrom; -use std::path::Path; -use std::sync::{Arc, Mutex}; - -use chrono::prelude::*; -use dill::*; -use domain::auth::AlwaysHappyDatasetActionAuthorizer; -use kamu::domain::*; -use kamu::testing::*; -use kamu::*; -use kamu_accounts::{CurrentAccountSubject, DEFAULT_ACCOUNT_NAME_STR}; -use opendatafabric::*; -use time_source::SystemTimeSourceDefault; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -macro_rules! n { - ($s:expr) => { - DatasetAlias::new(None, DatasetName::try_from($s).unwrap()) - }; -} - -macro_rules! mn { - ($s:expr) => { - DatasetAlias::try_from($s).unwrap() - }; -} - -macro_rules! rl { - ($s:expr) => { - DatasetRef::Alias(DatasetAlias::new(None, DatasetName::try_from($s).unwrap())) - }; -} - -macro_rules! mrl { - ($s:expr) => { - DatasetRef::Alias(DatasetAlias::try_from($s).unwrap()) - }; -} - -macro_rules! rr { - ($s:expr) => { - DatasetRefRemote::try_from($s).unwrap() - }; -} - -macro_rules! ar { - ($s:expr) => { - DatasetRefAny::try_from($s).unwrap() - }; -} - -macro_rules! 
names { - [] => { - vec![] - }; - [$x:expr] => { - vec![n!($x)] - }; - [$x:expr, $($y:expr),+] => { - vec![n!($x), $(n!($y)),+] - }; -} - -macro_rules! mnames { - [] => { - vec![] - }; - [$x:expr] => { - vec![mn!($x)] - }; - [$x:expr, $($y:expr),+] => { - vec![mn!($x), $(mn!($y)),+] - }; -} - -macro_rules! refs { - [] => { - vec![] - }; - [$x:expr] => { - vec![ar!($x)] - }; - [$x:expr, $($y:expr),+] => { - vec![ar!($x), $(ar!($y)),+] - }; -} - -macro_rules! refs_local { - [] => { - vec![] - }; - [$x:expr] => { - vec![mn!($x).as_any_ref()] - }; - [$x:expr, $($y:expr),+] => { - vec![mn!($x).as_any_ref(), $(mn!($y).as_any_ref()),+] - }; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -async fn create_graph( - repo: &DatasetRepositoryLocalFs, - datasets: Vec<(DatasetAlias, Vec)>, -) { - for (dataset_alias, deps) in datasets { - repo.create_dataset_from_snapshot( - MetadataFactory::dataset_snapshot() - .name(dataset_alias) - .kind(if deps.is_empty() { - DatasetKind::Root - } else { - DatasetKind::Derivative - }) - .push_event::(if deps.is_empty() { - MetadataFactory::set_polling_source().build().into() - } else { - MetadataFactory::set_transform() - .inputs_from_refs(deps) - .build() - .into() - }) - .build(), - ) - .await - .unwrap(); - } -} - -// Adding a remote dataset is a bit of a pain. -// We cannot add a local dataset and then add a pull alias without adding all of -// its dependencies too. So instead we're creating a repository based on temp -// dir and syncing it into the main workspace. TODO: Add simpler way to import -// remote dataset -async fn create_graph_remote( - dataset_repo: Arc, - dataset_repo_writer: Arc, - reg: Arc, - datasets: Vec<(DatasetAlias, Vec)>, - to_import: Vec, -) { - let tmp_repo_dir = tempfile::tempdir().unwrap(); - - let remote_dataset_repo = DatasetRepositoryLocalFs::new( - tmp_repo_dir.path().to_owned(), - Arc::new(CurrentAccountSubject::new_test()), - false, - Arc::new(SystemTimeSourceDefault), - ); - - create_graph(&remote_dataset_repo, datasets).await; - - let tmp_repo_name = RepoName::new_unchecked("tmp"); - - reg.add_repository( - &tmp_repo_name, - url::Url::from_file_path(tmp_repo_dir.path()).unwrap(), - ) - .unwrap(); - - let sync_service = SyncServiceImpl::new( - reg.clone(), - dataset_repo, - dataset_repo_writer, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - Arc::new(DatasetFactoryImpl::new( - IpfsGateway::default(), - Arc::new(auth::DummyOdfServerAccessTokenResolver::new()), - )), - Arc::new(DummySmartTransferProtocolClient::new()), - Arc::new(kamu::utils::ipfs_wrapper::IpfsClient::default()), - ); - - for import_alias in to_import { - sync_service - .sync( - &import_alias - .as_remote_alias(tmp_repo_name.clone()) - .into_any_ref(), - &import_alias.into_any_ref(), - SyncOptions::default(), - None, - ) - .await - .unwrap(); - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[test_log::test(tokio::test)] -async fn test_pull_batching_chain() { - let tmp_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new(tmp_dir.path(), false); - - // A - B - C - create_graph( - harness.dataset_repo.as_ref(), - vec![ - (n!("a"), names![]), - (n!("b"), names!["a"]), - (n!("c"), names!["b"]), - ], - ) - .await; - - assert_eq!( - harness.pull(refs!["c"], PullMultiOptions::default()).await, - vec![PullBatch::Transform(refs!["c"])] - ); - - assert_eq!( - harness - 
.pull(refs!["c", "a"], PullMultiOptions::default()) - .await, - vec![ - PullBatch::Ingest(refs!["a"]), - PullBatch::Transform(refs!["c"]), - ], - ); - - assert_eq!( - harness - .pull( - refs!["c"], - PullMultiOptions { - recursive: true, - ..PullMultiOptions::default() - } - ) - .await, - vec![ - PullBatch::Ingest(refs!["a"]), - PullBatch::Transform(refs!["b"]), - PullBatch::Transform(refs!["c"]), - ] - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[test_log::test(tokio::test)] -async fn test_pull_batching_chain_multi_tenant() { - let tmp_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new(tmp_dir.path(), true); - - // XA - YB - ZC - create_graph( - harness.dataset_repo.as_ref(), - vec![ - (mn!("x/a"), mnames![]), - (mn!("y/b"), mnames!["x/a"]), - (mn!("z/c"), mnames!["y/b"]), - ], - ) - .await; - - assert_eq!( - harness - .pull(refs!["z/c"], PullMultiOptions::default()) - .await, - vec![PullBatch::Transform(refs_local!["z/c"])] - ); - - assert_eq!( - harness - .pull(refs!["z/c", "x/a"], PullMultiOptions::default()) - .await, - vec![ - PullBatch::Ingest(refs_local!["x/a"]), - PullBatch::Transform(refs_local!["z/c"]), - ], - ); - - assert_eq!( - harness - .pull( - refs!["z/c"], - PullMultiOptions { - recursive: true, - ..PullMultiOptions::default() - } - ) - .await, - vec![ - PullBatch::Ingest(refs_local!["x/a"]), - PullBatch::Transform(refs_local!["y/b"]), - PullBatch::Transform(refs_local!["z/c"]), - ] - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[test_log::test(tokio::test)] -async fn test_pull_batching_complex() { - let tmp_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new(tmp_dir.path(), false); - - // / C \ - // A < > > E - // \ D / / - // / - // B - - -/ - create_graph( - harness.dataset_repo.as_ref(), - vec![ - (n!("a"), names![]), - (n!("b"), names![]), - (n!("c"), names!["a"]), - (n!("d"), names!["a"]), - (n!("e"), names!["c", "d", "b"]), - ], - ) - .await; - - assert_eq!( - harness.pull(refs!["e"], PullMultiOptions::default()).await, - vec![PullBatch::Transform(refs!["e"])] - ); - - assert_matches!( - harness - .pull_svc - .pull_multi(vec![ar!("z")], PullMultiOptions::default(), None) - .await - .unwrap()[0], - PullResponse { - result: Err(PullError::NotFound(_)), - .. 
- }, - ); - - assert_eq!( - harness - .pull( - refs!["e"], - PullMultiOptions { - recursive: true, - ..PullMultiOptions::default() - } - ) - .await, - vec![ - PullBatch::Ingest(refs!["a", "b"]), - PullBatch::Transform(refs!["c", "d"]), - PullBatch::Transform(refs!["e"]), - ] - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[test_log::test(tokio::test)] -async fn test_pull_batching_complex_with_remote() { - let tmp_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new(tmp_dir.path(), false); - - // (A) - (E) - F - G - // (B) --/ / / - // C --------/ / - // D -----------/ - create_graph_remote( - harness.dataset_repo.clone(), - harness.dataset_repo.clone(), - harness.remote_repo_reg.clone(), - vec![ - (n!("a"), names![]), - (n!("b"), names![]), - (n!("e"), names!["a", "b"]), - ], - names!("e"), - ) - .await; - create_graph( - harness.dataset_repo.as_ref(), - vec![ - (n!("c"), names![]), - (n!("d"), names![]), - (n!("f"), names!["e", "c"]), - (n!("g"), names!["f", "d"]), - ], - ) - .await; - - // Add remote pull alias to E - harness - .remote_alias_reg - .get_remote_aliases(&rl!("e")) - .await - .unwrap() - .add( - &DatasetRefRemote::try_from("kamu.dev/anonymous/e").unwrap(), - RemoteAliasKind::Pull, - ) - .await - .unwrap(); - - // Pulling E results in a sync - assert_eq!( - harness - .pull( - refs!["e"], - PullMultiOptions { - recursive: true, - ..PullMultiOptions::default() - } - ) - .await, - vec![PullBatch::Sync(vec![( - rr!("kamu.dev/anonymous/e").into(), - n!("e").into() - )])], - ); - - // Explicit remote reference associates with E - assert_eq!( - harness - .pull( - refs!["kamu.dev/anonymous/e"], - PullMultiOptions { - recursive: true, - ..PullMultiOptions::default() - } - ) - .await, - vec![PullBatch::Sync(vec![( - rr!("kamu.dev/anonymous/e").into(), - n!("e").into() - )])], - ); - - // Remote is recursed onto - assert_eq!( - harness - .pull( - refs!["g"], - PullMultiOptions { - recursive: true, - ..PullMultiOptions::default() - } - ) - .await, - vec![ - PullBatch::Sync(vec![(rr!("kamu.dev/anonymous/e").into(), n!("e").into())]), - PullBatch::Ingest(refs!("c", "d")), - PullBatch::Transform(refs!("f")), - PullBatch::Transform(refs!("g")), - ], - ); - - // Remote is recursed onto while also specified explicitly (via local ID) - assert_eq!( - harness - .pull( - refs!["g", "e"], - PullMultiOptions { - recursive: true, - ..PullMultiOptions::default() - } - ) - .await, - vec![ - PullBatch::Sync(vec![(rr!("kamu.dev/anonymous/e").into(), n!("e").into())]), - PullBatch::Ingest(refs!("c", "d")), - PullBatch::Transform(refs!("f")), - PullBatch::Transform(refs!("g")), - ], - ); - - // Remote is recursed onto while also specified explicitly (via remote ref) - assert_eq!( - harness - .pull( - refs!["g", "kamu.dev/anonymous/e"], - PullMultiOptions { - recursive: true, - ..PullMultiOptions::default() - } - ) - .await, - vec![ - PullBatch::Sync(vec![(rr!("kamu.dev/anonymous/e").into(), n!("e").into())]), - PullBatch::Ingest(refs!("c", "d")), - PullBatch::Transform(refs!("f")), - PullBatch::Transform(refs!("g")), - ], - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[tokio::test] -async fn test_sync_from() { - let tmp_ws_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new(tmp_ws_dir.path(), false); - - harness - .remote_repo_reg - .add_repository( - &RepoName::new_unchecked("myrepo"), - 
url::Url::parse("file:///tmp/nowhere").unwrap(), - ) - .unwrap(); - - let res = harness - .pull_svc - .pull_multi_ext( - vec![PullRequest { - local_ref: Some(n!("bar").into()), - remote_ref: Some(rr!("myrepo/foo")), - }], - PullMultiOptions::default(), - None, - ) - .await - .unwrap(); - - assert_eq!(res.len(), 1); - assert_matches!( - res[0], - PullResponse { - result: Ok(PullResult::Updated { old_head: None, .. }), - .. - } - ); - - let aliases = harness - .remote_alias_reg - .get_remote_aliases(&rl!("bar")) - .await - .unwrap(); - let pull_aliases: Vec<_> = aliases - .get_by_kind(RemoteAliasKind::Pull) - .cloned() - .collect(); - - assert_eq!( - pull_aliases, - vec![DatasetRefRemote::try_from("myrepo/foo").unwrap()] - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[tokio::test] -async fn test_sync_from_url_and_local_ref() { - let tmp_ws_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new(tmp_ws_dir.path(), false); - - let res = harness - .pull_svc - .pull_multi_ext( - vec![PullRequest { - local_ref: Some(n!("bar").into()), - remote_ref: Some(rr!("http://example.com/odf/bar")), - }], - PullMultiOptions::default(), - None, - ) - .await - .unwrap(); - - assert_eq!(res.len(), 1); - assert_matches!( - res[0], - PullResponse { - result: Ok(PullResult::Updated { old_head: None, .. }), - .. - } - ); - - let aliases = harness - .remote_alias_reg - .get_remote_aliases(&rl!("bar")) - .await - .unwrap(); - let pull_aliases: Vec<_> = aliases - .get_by_kind(RemoteAliasKind::Pull) - .cloned() - .collect(); - - assert_eq!( - pull_aliases, - vec![DatasetRefRemote::try_from("http://example.com/odf/bar").unwrap()] - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[tokio::test] -async fn test_sync_from_url_and_local_multi_tenant_ref() { - let tmp_ws_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new(tmp_ws_dir.path(), true); - - let res = harness - .pull_svc - .pull_multi_ext( - vec![PullRequest { - local_ref: Some(mn!("x/bar").into()), - remote_ref: Some(rr!("http://example.com/odf/bar")), - }], - PullMultiOptions::default(), - None, - ) - .await - .unwrap(); - - assert_eq!(res.len(), 1); - assert_matches!( - res[0], - PullResponse { - result: Ok(PullResult::Updated { old_head: None, .. }), - .. - } - ); - - let aliases = harness - .remote_alias_reg - .get_remote_aliases(&mrl!("x/bar")) - .await - .unwrap(); - let pull_aliases: Vec<_> = aliases - .get_by_kind(RemoteAliasKind::Pull) - .cloned() - .collect(); - - assert_eq!( - pull_aliases, - vec![DatasetRefRemote::try_from("http://example.com/odf/bar").unwrap()] - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[tokio::test] -async fn test_sync_from_url_only() { - let tmp_ws_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new(tmp_ws_dir.path(), false); - - let res = harness - .pull_svc - .pull_multi_ext( - vec![PullRequest { - local_ref: None, - remote_ref: Some(rr!("http://example.com/odf/bar")), - }], - PullMultiOptions::default(), - None, - ) - .await - .unwrap(); - - assert_eq!(res.len(), 1); - assert_matches!( - res[0], - PullResponse { - result: Ok(PullResult::Updated { old_head: None, .. }), - .. 
- } - ); - - let aliases = harness - .remote_alias_reg - .get_remote_aliases(&rl!("bar")) - .await - .unwrap(); - let pull_aliases: Vec<_> = aliases - .get_by_kind(RemoteAliasKind::Pull) - .cloned() - .collect(); - - assert_eq!( - pull_aliases, - vec![DatasetRefRemote::try_from("http://example.com/odf/bar").unwrap()] - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[tokio::test] -async fn test_sync_from_url_only_multi_tenant_case() { - let tmp_ws_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new(tmp_ws_dir.path(), true); - - let res = harness - .pull_svc - .pull_multi_ext( - vec![PullRequest { - local_ref: None, - remote_ref: Some(rr!("http://example.com/odf/bar")), - }], - PullMultiOptions::default(), - None, - ) - .await - .unwrap(); - - assert_eq!(res.len(), 1); - assert_matches!( - res[0], - PullResponse { - result: Ok(PullResult::Updated { old_head: None, .. }), - .. - } - ); - - let aliases = harness - .remote_alias_reg - .get_remote_aliases(&mrl!(format!("{}/{}", DEFAULT_ACCOUNT_NAME_STR, "bar"))) - .await - .unwrap(); - let pull_aliases: Vec<_> = aliases - .get_by_kind(RemoteAliasKind::Pull) - .cloned() - .collect(); - - assert_eq!( - pull_aliases, - vec![DatasetRefRemote::try_from("http://example.com/odf/bar").unwrap()] - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[tokio::test] -async fn test_set_watermark() { - let tmp_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new_with_authorizer( - tmp_dir.path(), - MockDatasetActionAuthorizer::new().expect_check_write_dataset( - &DatasetAlias::new(None, DatasetName::new_unchecked("foo")), - 4, - true, - ), - false, - ); - - let dataset_alias = n!("foo"); - harness.create_dataset(&dataset_alias).await; - - assert_eq!(harness.num_blocks(&dataset_alias).await, 1); - - assert_matches!( - harness - .pull_svc - .set_watermark( - &dataset_alias.as_local_ref(), - Utc.with_ymd_and_hms(2000, 1, 2, 0, 0, 0).unwrap() - ) - .await, - Ok(PullResult::Updated { .. }) - ); - assert_eq!(harness.num_blocks(&dataset_alias).await, 2); - - assert_matches!( - harness - .pull_svc - .set_watermark( - &dataset_alias.as_local_ref(), - Utc.with_ymd_and_hms(2000, 1, 3, 0, 0, 0).unwrap() - ) - .await, - Ok(PullResult::Updated { .. 
}) - ); - assert_eq!(harness.num_blocks(&dataset_alias).await, 3); - - assert_matches!( - harness - .pull_svc - .set_watermark( - &dataset_alias.as_local_ref(), - Utc.with_ymd_and_hms(2000, 1, 3, 0, 0, 0).unwrap() - ) - .await, - Ok(PullResult::UpToDate(_)) - ); - assert_eq!(harness.num_blocks(&dataset_alias).await, 3); - - assert_matches!( - harness - .pull_svc - .set_watermark( - &dataset_alias.as_local_ref(), - Utc.with_ymd_and_hms(2000, 1, 2, 0, 0, 0).unwrap() - ) - .await, - Ok(PullResult::UpToDate(_)) - ); - assert_eq!(harness.num_blocks(&dataset_alias).await, 3); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[tokio::test] -async fn test_set_watermark_unauthorized() { - let tmp_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new_with_authorizer( - tmp_dir.path(), - MockDatasetActionAuthorizer::denying(), - true, - ); - - let dataset_alias = n!("foo"); - harness.create_dataset(&dataset_alias).await; - - assert_matches!( - harness - .pull_svc - .set_watermark( - &dataset_alias.as_local_ref(), - Utc.with_ymd_and_hms(2000, 1, 2, 0, 0, 0).unwrap() - ) - .await, - Err(SetWatermarkError::Access(AccessError::Forbidden(_))) - ); - - assert_eq!(harness.num_blocks(&dataset_alias).await, 1); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[tokio::test] -async fn test_set_watermark_rejects_on_derivative() { - let tmp_dir = tempfile::tempdir().unwrap(); - let harness = PullTestHarness::new_with_authorizer( - tmp_dir.path(), - AlwaysHappyDatasetActionAuthorizer::new(), - true, - ); - - let dataset_alias = n!("foo"); - - harness - .dataset_repo - .create_dataset( - &dataset_alias, - MetadataFactory::metadata_block(MetadataFactory::seed(DatasetKind::Derivative).build()) - .build_typed(), - ) - .await - .unwrap(); - - assert_matches!( - harness - .pull_svc - .set_watermark( - &dataset_alias.as_local_ref(), - Utc.with_ymd_and_hms(2000, 1, 2, 0, 0, 0).unwrap() - ) - .await, - Err(SetWatermarkError::IsDerivative) - ); - - assert_eq!(harness.num_blocks(&dataset_alias).await, 1); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -struct PullTestHarness { - calls: Arc>>, - dataset_repo: Arc, - remote_repo_reg: Arc, - remote_alias_reg: Arc, - pull_svc: Arc, -} - -impl PullTestHarness { - fn new(tmp_path: &Path, multi_tenant: bool) -> Self { - Self::new_with_authorizer( - tmp_path, - auth::AlwaysHappyDatasetActionAuthorizer::new(), - multi_tenant, - ) - } - - fn new_with_authorizer( - tmp_path: &Path, - dataset_action_authorizer: TDatasetAuthorizer, - multi_tenant: bool, - ) -> Self { - let calls = Arc::new(Mutex::new(Vec::new())); - - let datasets_dir_path = tmp_path.join("datasets"); - std::fs::create_dir(&datasets_dir_path).unwrap(); - - let catalog = dill::CatalogBuilder::new() - .add::() - .add_value(CurrentAccountSubject::new_test()) - .add_value(dataset_action_authorizer) - .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir_path) - .with_multi_tenant(multi_tenant), - ) - .bind::() - .bind::() - .add_value(RemoteRepositoryRegistryImpl::create(tmp_path.join("repos")).unwrap()) - .bind::() - .add::() - .add_value(TestIngestService::new(calls.clone())) - .bind::() - .add_value(TestTransformService::new(calls.clone())) - .bind::() - .add_builder(TestSyncService::builder().with_calls(calls.clone())) - 
.bind::() - .add::() - .build(); - - Self { - calls, - dataset_repo: catalog.get_one().unwrap(), - remote_repo_reg: catalog.get_one().unwrap(), - remote_alias_reg: catalog.get_one().unwrap(), - pull_svc: catalog.get_one().unwrap(), - } - } - - async fn create_dataset(&self, dataset_alias: &DatasetAlias) { - self.dataset_repo - .create_dataset_from_snapshot( - MetadataFactory::dataset_snapshot() - .name(DatasetAlias::new(None, dataset_alias.dataset_name.clone())) - .build(), - ) - .await - .unwrap(); - } - - async fn num_blocks(&self, dataset_alias: &DatasetAlias) -> usize { - let ds = self - .dataset_repo - .find_dataset_by_ref(&dataset_alias.as_local_ref()) - .await - .unwrap(); - - use futures::StreamExt; - ds.as_metadata_chain().iter_blocks().count().await - } - - fn collect_calls(&self) -> Vec { - let mut calls = Vec::new(); - std::mem::swap(self.calls.lock().unwrap().as_mut(), &mut calls); - calls - } - - async fn pull(&self, refs: Vec, options: PullMultiOptions) -> Vec { - let results = self.pull_svc.pull_multi(refs, options, None).await.unwrap(); - - for res in results { - assert_matches!(res, PullResponse { result: Ok(_), .. }); - } - - self.collect_calls() - } -} - -#[derive(Debug, Clone, Eq)] -pub enum PullBatch { - Ingest(Vec), - Transform(Vec), - Sync(Vec<(DatasetRefAny, DatasetRefAny)>), -} - -impl PullBatch { - fn cmp_ref(lhs: &DatasetRefAny, rhs: &DatasetRefAny) -> bool { - #[allow(clippy::type_complexity)] - fn tuplify( - v: &DatasetRefAny, - ) -> ( - Option<&DatasetID>, - Option<&url::Url>, - Option<&str>, - Option<&str>, - Option<&DatasetName>, - ) { - match v { - DatasetRefAny::ID(_, id) => (Some(id), None, None, None, None), - DatasetRefAny::Url(url) => (None, Some(url), None, None, None), - DatasetRefAny::LocalAlias(a, n) => { - (None, None, None, a.as_ref().map(AsRef::as_ref), Some(n)) - } - DatasetRefAny::RemoteAlias(r, a, n) => ( - None, - None, - Some(r.as_ref()), - a.as_ref().map(AsRef::as_ref), - Some(n), - ), - DatasetRefAny::AmbiguousAlias(ra, n) => { - (None, None, Some(ra.as_ref()), None, Some(n)) - } - DatasetRefAny::LocalHandle(h) => ( - None, - None, - None, - h.alias.account_name.as_ref().map(AccountName::as_str), - Some(&h.alias.dataset_name), - ), - DatasetRefAny::RemoteHandle(h) => ( - None, - None, - Some(h.alias.repo_name.as_str()), - h.alias.account_name.as_ref().map(AccountName::as_str), - Some(&h.alias.dataset_name), - ), - } - } - tuplify(lhs) == tuplify(rhs) - } -} - -impl std::cmp::PartialEq for PullBatch { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (Self::Ingest(l), Self::Ingest(r)) => { - let mut l = l.clone(); - l.sort(); - let mut r = r.clone(); - r.sort(); - l.len() == r.len() && std::iter::zip(&l, &r).all(|(li, ri)| Self::cmp_ref(li, ri)) - } - (Self::Transform(l), Self::Transform(r)) => { - let mut l = l.clone(); - l.sort(); - let mut r = r.clone(); - r.sort(); - l.len() == r.len() && std::iter::zip(&l, &r).all(|(li, ri)| Self::cmp_ref(li, ri)) - } - (Self::Sync(l), Self::Sync(r)) => { - let mut l = l.clone(); - l.sort(); - let mut r = r.clone(); - r.sort(); - l.len() == r.len() - && std::iter::zip(&l, &r) - .all(|((l1, l2), (r1, r2))| Self::cmp_ref(l1, r1) && Self::cmp_ref(l2, r2)) - } - _ => false, - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -struct TestIngestService { - calls: Arc>>, -} - -impl TestIngestService { - fn new(calls: Arc>>) -> Self { - Self { calls } - } -} - -// TODO: Replace with a mock 
-#[async_trait::async_trait] -impl PollingIngestService for TestIngestService { - async fn get_active_polling_source( - &self, - _dataset_ref: &DatasetRef, - ) -> Result)>, GetDatasetError> { - unimplemented!() - } - - async fn ingest( - &self, - _dataset_ref: &DatasetRef, - _ingest_options: PollingIngestOptions, - _maybe_listener: Option>, - ) -> Result { - unimplemented!(); - } - - async fn ingest_multi( - &self, - dataset_refs: Vec, - _options: PollingIngestOptions, - _listener: Option>, - ) -> Vec { - let results = dataset_refs - .iter() - .map(|r| PollingIngestResponse { - dataset_ref: r.clone(), - result: Ok(PollingIngestResult::UpToDate { - no_source_defined: false, - uncacheable: false, - }), - }) - .collect(); - self.calls.lock().unwrap().push(PullBatch::Ingest( - dataset_refs.into_iter().map(Into::into).collect(), - )); - results - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -pub struct TestTransformService { - calls: Arc>>, -} - -impl TestTransformService { - pub fn new(calls: Arc>>) -> Self { - Self { calls } - } -} - -#[async_trait::async_trait] -impl TransformService for TestTransformService { - async fn get_active_transform( - &self, - _dataset_ref: &DatasetRef, - ) -> Result)>, GetDatasetError> { - unimplemented!() - } - - async fn transform( - &self, - _dataset_ref: &DatasetRef, - _options: TransformOptions, - _maybe_listener: Option>, - ) -> Result { - unimplemented!(); - } - - async fn transform_multi( - &self, - dataset_refs: Vec, - _options: TransformOptions, - _maybe_multi_listener: Option>, - ) -> Vec<(DatasetRef, Result)> { - let results = dataset_refs - .iter() - .map(|r| (r.clone(), Ok(TransformResult::UpToDate))) - .collect(); - self.calls.lock().unwrap().push(PullBatch::Transform( - dataset_refs.into_iter().map(Into::into).collect(), - )); - results - } - - async fn verify_transform( - &self, - _dataset_ref: &DatasetRef, - _block_range: (Option, Option), - _listener: Option>, - ) -> Result<(), VerificationError> { - unimplemented!() - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -struct TestSyncService { - calls: Arc>>, - dataset_repo: Arc, - dataset_repo_writer: Arc, -} - -#[dill::component(pub)] -impl TestSyncService { - fn new( - calls: Arc>>, - dataset_repo: Arc, - dataset_repo_writer: Arc, - ) -> Self { - Self { - calls, - dataset_repo, - dataset_repo_writer, - } - } -} - -#[async_trait::async_trait] -impl SyncService for TestSyncService { - async fn sync( - &self, - _src: &DatasetRefAny, - _dst: &DatasetRefAny, - _options: SyncOptions, - _listener: Option>, - ) -> Result { - unimplemented!() - } - - async fn sync_multi( - &self, - requests: Vec, - _options: SyncOptions, - _listener: Option>, - ) -> Vec { - let mut call = Vec::new(); - let mut results = Vec::new(); - for SyncRequest { src, dst } in requests { - call.push((src.clone(), dst.clone())); - - let local_ref = dst.as_local_single_tenant_ref().unwrap(); - - match self - .dataset_repo - .try_resolve_dataset_ref(&local_ref) - .await - .unwrap() - { - None => { - self.dataset_repo_writer - .create_dataset_from_snapshot( - MetadataFactory::dataset_snapshot() - .name(local_ref.alias().unwrap().clone()) - .build(), - ) - .await - .unwrap(); - } - Some(_) => (), - } - - results.push(SyncResultMulti { - src, - dst, - result: Ok(SyncResult::Updated { - old_head: None, - new_head: Multihash::from_digest_sha3_256(b"boop"), - num_blocks: 1, 
- }), - }); - } - self.calls.lock().unwrap().push(PullBatch::Sync(call)); - results - } - - async fn ipfs_add(&self, _src: &DatasetRef) -> Result { - unimplemented!() - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/test_push_request_planner_impl.rs b/src/infra/core/tests/tests/test_push_request_planner_impl.rs new file mode 100644 index 0000000000..56e135d433 --- /dev/null +++ b/src/infra/core/tests/tests/test_push_request_planner_impl.rs @@ -0,0 +1,354 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::assert_matches::assert_matches; +use std::sync::Arc; + +use auth::DummyOdfServerAccessTokenResolver; +use kamu::*; +use kamu_core::*; +use opendatafabric::{ + DatasetAlias, + DatasetAliasRemote, + DatasetName, + DatasetPushTarget, + DatasetRefRemote, + RepoName, +}; +use tempfile::TempDir; +use url::Url; + +use crate::BaseRepoHarness; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_push_repo_target() { + let harness = PushTestHarness::new(TenancyConfig::SingleTenant, true); + + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let foo = harness.create_root_dataset(&alias_foo).await; + + let remote_repo_data = harness.maybe_remote_repo_data.unwrap(); + + let (items, errors) = harness + .push_request_planner + .collect_plan( + &[foo.dataset_handle.clone()], + Some(&DatasetPushTarget::Repository( + remote_repo_data.remote_repo_name.clone(), + )), + ) + .await; + assert!(errors.is_empty()); + + assert_eq!(items.len(), 1); + assert_eq!( + *items.first().unwrap(), + PushItem { + local_handle: foo.dataset_handle, + remote_target: RemoteTarget { + url: remote_repo_data + .remote_repo_url + .join(&alias_foo.dataset_name) + .unwrap(), + repo_name: Some(remote_repo_data.remote_repo_name.clone()), + dataset_name: None, + account_name: None, + }, + push_target: Some(DatasetPushTarget::Repository( + remote_repo_data.remote_repo_name + )) + } + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_push_url_target() { + let harness = PushTestHarness::new(TenancyConfig::SingleTenant, false); + + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let foo = harness.create_root_dataset(&alias_foo).await; + + let push_url = Url::parse("http://example.com/foo").unwrap(); + + let (items, errors) = harness + .push_request_planner + .collect_plan( + &[foo.dataset_handle.clone()], + Some(&DatasetPushTarget::Url(push_url.clone())), + ) + .await; + assert!(errors.is_empty()); + + assert_eq!(items.len(), 1); + assert_eq!( + *items.first().unwrap(), + PushItem { + local_handle: foo.dataset_handle, + remote_target: RemoteTarget { + url: push_url.clone(), + repo_name: None, + dataset_name: None, + account_name: None, + }, + push_target: Some(DatasetPushTarget::Url(push_url)) + } + ); +} + 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_push_remote_alias_target() { + let harness = PushTestHarness::new(TenancyConfig::SingleTenant, true); + + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let foo = harness.create_root_dataset(&alias_foo).await; + + let remote_repo_data = harness.maybe_remote_repo_data.unwrap(); + + let remote_alias = DatasetAliasRemote { + repo_name: remote_repo_data.remote_repo_name.clone(), + dataset_name: DatasetName::new_unchecked("bar"), + account_name: None, + }; + + let (items, errors) = harness + .push_request_planner + .collect_plan( + &[foo.dataset_handle.clone()], + Some(&DatasetPushTarget::Alias(remote_alias.clone())), + ) + .await; + assert!(errors.is_empty()); + + assert_eq!(items.len(), 1); + assert_eq!( + *items.first().unwrap(), + PushItem { + local_handle: foo.dataset_handle, + remote_target: RemoteTarget { + url: remote_repo_data.remote_repo_url.join("bar").unwrap(), + repo_name: Some(remote_repo_data.remote_repo_name), + dataset_name: Some(remote_alias.dataset_name.clone()), + account_name: None, + }, + push_target: Some(DatasetPushTarget::Alias(remote_alias)) + } + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_push_remote_no_target_presaved_push_alias() { + let harness = PushTestHarness::new(TenancyConfig::SingleTenant, true); + + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let foo = harness.create_root_dataset(&alias_foo).await; + + let remote_repo_data = harness.maybe_remote_repo_data.unwrap(); + + let mut aliases = harness + .remote_aliases_registry + .get_remote_aliases(&foo.dataset_handle) + .await + .unwrap(); + aliases + .add( + &DatasetRefRemote::Url(Arc::new(remote_repo_data.remote_repo_url.clone())), + RemoteAliasKind::Push, + ) + .await + .unwrap(); + + let (items, errors) = harness + .push_request_planner + .collect_plan(&[foo.dataset_handle.clone()], None) + .await; + assert!(errors.is_empty()); + + assert_eq!(items.len(), 1); + assert_eq!( + *items.first().unwrap(), + PushItem { + local_handle: foo.dataset_handle, + remote_target: RemoteTarget { + url: remote_repo_data.remote_repo_url, + repo_name: None, + dataset_name: None, + account_name: None, + }, + push_target: None, + } + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_push_remote_no_target_no_alias() { + let harness = PushTestHarness::new(TenancyConfig::SingleTenant, true); + + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let foo = harness.create_root_dataset(&alias_foo).await; + + let remote_repo_data = harness.maybe_remote_repo_data.unwrap(); + + let (items, errors) = harness + .push_request_planner + .collect_plan(&[foo.dataset_handle.clone()], None) + .await; + assert!(errors.is_empty()); + + assert_eq!(items.len(), 1); + assert_eq!( + *items.first().unwrap(), + PushItem { + local_handle: foo.dataset_handle, + remote_target: RemoteTarget { + url: remote_repo_data + .remote_repo_url + .join(&alias_foo.dataset_name) + .unwrap(), + repo_name: Some(remote_repo_data.remote_repo_name), + dataset_name: None, + account_name: None, + }, + push_target: None, + } + ); +} + 
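Taken together, the no-target push tests here and in the next two cases outline the resolution order that `collect_plan` exercises when no explicit target is given: a presaved push alias wins, otherwise a single configured repository is used, while multiple or zero repositories yield `AmbiguousTarget` or `NoTarget` errors. A rough sketch of that decision under hypothetical helper names (`resolve_default_target`, `DefaultTarget`); only the `PushError` variants, `Url`, and `RepoName` come from the surrounding code, everything else is illustrative:

// Hypothetical summary of the behavior exercised by the surrounding tests;
// not the planner's actual implementation.
enum DefaultTarget {
    PushAliasUrl(Url),
    SingleRepo(RepoName),
}

fn resolve_default_target(
    push_aliases: &[Url],
    known_repos: &[RepoName],
) -> Result<DefaultTarget, PushError> {
    if let Some(url) = push_aliases.first() {
        // A presaved push alias takes precedence over any configured repository.
        return Ok(DefaultTarget::PushAliasUrl(url.clone()));
    }
    match known_repos {
        // Exactly one repository: the target is derived from it plus the local dataset name.
        [repo] => Ok(DefaultTarget::SingleRepo(repo.clone())),
        // No repositories configured at all.
        [] => Err(PushError::NoTarget),
        // Several repositories: the planner refuses to guess.
        _ => Err(PushError::AmbiguousTarget),
    }
}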
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_push_remote_no_target_no_alias_multiple_repos_exist() { + let harness = PushTestHarness::new(TenancyConfig::SingleTenant, true); + + let extra_remote_tmp_dir = tempfile::tempdir().unwrap(); + let extra_remote_repo_url = Url::from_directory_path(extra_remote_tmp_dir.path()).unwrap(); + let extra_remote_repo_name = RepoName::new_unchecked("extra-remote"); + harness + .remote_repo_registry + .add_repository(&extra_remote_repo_name, extra_remote_repo_url) + .unwrap(); + + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let foo = harness.create_root_dataset(&alias_foo).await; + + let (items, errors) = harness + .push_request_planner + .collect_plan(&[foo.dataset_handle.clone()], None) + .await; + assert!(items.is_empty()); + + assert_eq!(1, errors.len()); + assert_matches!( + errors.first().unwrap(), + PushResponse { + local_handle: Some(a_local_handle), + target: None, + result: Err(PushError::AmbiguousTarget), + } if *a_local_handle == foo.dataset_handle + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_push_remote_no_target_no_alias_no_repositories() { + let harness = PushTestHarness::new(TenancyConfig::SingleTenant, false); + + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let foo = harness.create_root_dataset(&alias_foo).await; + + let (items, errors) = harness + .push_request_planner + .collect_plan(&[foo.dataset_handle.clone()], None) + .await; + assert!(items.is_empty()); + + assert_eq!(1, errors.len()); + assert_matches!( + errors.first().unwrap(), + PushResponse { + local_handle: Some(a_local_handle), + target: None, + result: Err(PushError::NoTarget), + } if *a_local_handle == foo.dataset_handle + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[oop::extend(BaseRepoHarness, base_repo_harness)] +struct PushTestHarness { + base_repo_harness: BaseRepoHarness, + push_request_planner: Arc, + remote_aliases_registry: Arc, + remote_repo_registry: Arc, + maybe_remote_repo_data: Option, +} + +struct RemoteRepoData { + remote_repo_name: RepoName, + remote_repo_url: Url, + _remote_tmp_dir: TempDir, +} + +impl PushTestHarness { + fn new(tenancy_config: TenancyConfig, create_remote_repo: bool) -> Self { + let base_repo_harness = BaseRepoHarness::new(tenancy_config); + + let repos_dir = base_repo_harness.temp_dir_path().join("repos"); + std::fs::create_dir(&repos_dir).unwrap(); + + let catalog = dill::CatalogBuilder::new_chained(base_repo_harness.catalog()) + .add_value(RemoteRepositoryRegistryImpl::create(repos_dir).unwrap()) + .bind::() + .add::() + .add::() + .add::() + .add::() + .build(); + + let maybe_remote_repo_data = if create_remote_repo { + let remote_tmp_dir = tempfile::tempdir().unwrap(); + let remote_repo_url = Url::from_directory_path(remote_tmp_dir.path()).unwrap(); + + let remote_repo_name = RepoName::new_unchecked("remote"); + let remote_repo_registry = catalog.get_one::().unwrap(); + remote_repo_registry + .add_repository(&remote_repo_name, remote_repo_url.clone()) + .unwrap(); + + Some(RemoteRepoData { + remote_repo_name, + remote_repo_url, + _remote_tmp_dir: remote_tmp_dir, + }) + } else { + None + }; + + Self { + base_repo_harness, + 
push_request_planner: catalog.get_one().unwrap(), + remote_aliases_registry: catalog.get_one().unwrap(), + remote_repo_registry: catalog.get_one().unwrap(), + maybe_remote_repo_data, + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/test_query_service_impl.rs b/src/infra/core/tests/tests/test_query_service_impl.rs index 45accd7b5c..9402c8ea1f 100644 --- a/src/infra/core/tests/tests/test_query_service_impl.rs +++ b/src/infra/core/tests/tests/test_query_service_impl.rs @@ -125,13 +125,11 @@ fn create_catalog_with_local_workspace( dill::CatalogBuilder::new() .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .add::() .add::() @@ -152,13 +150,11 @@ async fn create_catalog_with_s3_workspace( dill::CatalogBuilder::new() .add::() - .add_builder( - DatasetRepositoryS3::builder() - .with_s3_context(s3_context.clone()) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryS3::builder().with_s3_context(s3_context.clone())) .bind::() .bind::() + .add::() .add::() .add::() .add::() @@ -255,50 +251,53 @@ async fn test_dataset_arrow_schema(catalog: &Catalog, tempdir: &TempDir) { ); } -fn prepare_test_catalog() -> (TempDir, Catalog) { +fn prepare_schema_test_catalog() -> (TempDir, Catalog) { + let mut authorizer = MockDatasetActionAuthorizer::new().expect_check_read_a_dataset(1, true); + authorizer + .expect_filter_datasets_allowing() + .returning(|_, _| Ok(vec![])); + let tempdir = tempfile::tempdir().unwrap(); - let catalog = create_catalog_with_local_workspace( - tempdir.path(), - MockDatasetActionAuthorizer::new().expect_check_read_a_dataset(1, true), - ); + let catalog = create_catalog_with_local_workspace(tempdir.path(), authorizer); (tempdir, catalog) } -async fn prepare_test_s3_catalog() -> (LocalS3Server, Catalog) { +async fn prepare_schema_test_s3_catalog() -> (LocalS3Server, Catalog) { + let mut authorizer = MockDatasetActionAuthorizer::new().expect_check_read_a_dataset(1, true); + authorizer + .expect_filter_datasets_allowing() + .returning(|_, _| Ok(vec![])); + let s3 = LocalS3Server::new().await; - let catalog = create_catalog_with_s3_workspace( - &s3, - MockDatasetActionAuthorizer::new().expect_check_read_a_dataset(1, true), - ) - .await; + let catalog = create_catalog_with_s3_workspace(&s3, authorizer).await; (s3, catalog) } #[test_group::group(engine, datafusion)] #[test_log::test(tokio::test)] async fn test_dataset_parquet_schema_local_fs() { - let (tempdir, catalog) = prepare_test_catalog(); + let (tempdir, catalog) = prepare_schema_test_catalog(); test_dataset_parquet_schema(&catalog, &tempdir).await; } #[test_group::group(engine, datafusion)] #[test_log::test(tokio::test)] async fn test_dataset_arrow_schema_local_fs() { - let (tempdir, catalog) = prepare_test_catalog(); + let (tempdir, catalog) = prepare_schema_test_catalog(); test_dataset_arrow_schema(&catalog, &tempdir).await; } #[test_group::group(containerized, engine, datafusion)] #[test_log::test(tokio::test)] async fn test_dataset_parquet_schema_s3() { - let (s3, catalog) = prepare_test_s3_catalog().await; + let (s3, catalog) = prepare_schema_test_s3_catalog().await; test_dataset_parquet_schema(&catalog, &s3.tmp_dir).await; } 
#[test_group::group(containerized, engine, datafusion)] #[test_log::test(tokio::test)] async fn test_dataset_arrow_schema_s3() { - let (s3, catalog) = prepare_test_s3_catalog().await; + let (s3, catalog) = prepare_schema_test_s3_catalog().await; test_dataset_arrow_schema(&catalog, &s3.tmp_dir).await; } @@ -562,11 +561,16 @@ async fn test_dataset_sql_unauthorized_s3() { #[test_group::group(engine, datafusion)] #[test_log::test(tokio::test)] async fn test_sql_statement_not_found() { + let mut mock_authorizer = MockDatasetActionAuthorizer::new(); + mock_authorizer + .expect_check_action_allowed() + .returning(|_, _| Ok(())); + mock_authorizer + .expect_filter_datasets_allowing() + .returning(|_, _| Ok(vec![])); + let tempdir = tempfile::tempdir().unwrap(); - let catalog = create_catalog_with_local_workspace( - tempdir.path(), - MockDatasetActionAuthorizer::allowing(), - ); + let catalog = create_catalog_with_local_workspace(tempdir.path(), mock_authorizer); let _ = create_test_dataset(&catalog, tempdir.path()).await; diff --git a/src/infra/core/tests/tests/test_reset_service_impl.rs b/src/infra/core/tests/tests/test_reset_service_impl.rs index a54b0157b4..6dadd66fb3 100644 --- a/src/infra/core/tests/tests/test_reset_service_impl.rs +++ b/src/infra/core/tests/tests/test_reset_service_impl.rs @@ -10,14 +10,12 @@ use std::assert_matches::assert_matches; use std::sync::Arc; -use dill::Component; use kamu::domain::*; use kamu::testing::*; use kamu::*; -use kamu_accounts::CurrentAccountSubject; use opendatafabric::*; -use tempfile::TempDir; -use time_source::SystemTimeSourceDefault; + +use crate::BaseRepoHarness; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -30,7 +28,6 @@ async fn test_reset_dataset_with_2revisions_drop_last() { assert_eq!(test_case.hash_polling_source_block, current_head); let result = harness - .reset_svc .reset_dataset( &test_case.dataset_handle, Some(&test_case.hash_seed_block), @@ -57,7 +54,6 @@ async fn test_reset_dataset_with_2revisions_without_changes() { assert_eq!(test_case.hash_polling_source_block, current_head); let result = harness - .reset_svc .reset_dataset( &test_case.dataset_handle, Some(&test_case.hash_polling_source_block), @@ -84,7 +80,6 @@ async fn test_reset_dataset_to_non_existing_block_fails() { Multihash::from_multibase("zW1a3CNT52HXiJNniLkWMeev3CPRy9QiNRMWGyTrVNg4hY8").unwrap(); let result = harness - .reset_svc .reset_dataset( &test_case.dataset_handle, Some(&a_hash_not_present_in_chain), @@ -100,7 +95,6 @@ async fn test_reset_dataset_with_wrong_head() { let test_case = harness.a_chain_with_2_blocks().await; let result = harness - .reset_svc .reset_dataset( &test_case.dataset_handle, Some(&test_case.hash_seed_block), @@ -119,7 +113,6 @@ async fn test_reset_dataset_with_default_seed_block() { assert_eq!(test_case.hash_polling_source_block, current_head); let result = harness - .reset_svc .reset_dataset( &test_case.dataset_handle, None, @@ -159,42 +152,24 @@ impl ChainWith2BlocksTestCase { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[oop::extend(BaseRepoHarness, base_repo_harness)] struct ResetTestHarness { - _temp_dir: TempDir, - dataset_repo: Arc, - dataset_repo_writer: Arc, + base_repo_harness: BaseRepoHarness, reset_svc: Arc, } impl ResetTestHarness { fn new() -> Self { - let tempdir = tempfile::tempdir().unwrap(); - let datasets_dir = tempdir.path().join("datasets"); - 
std::fs::create_dir(&datasets_dir).unwrap(); - - let catalog = dill::CatalogBuilder::new() - .add::() - .add_value(CurrentAccountSubject::new_test()) - .add_value(MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1, true)) - .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) - .bind::() - .bind::() + let base_repo_harness = BaseRepoHarness::new(TenancyConfig::SingleTenant); + + let catalog = dill::CatalogBuilder::new_chained(base_repo_harness.catalog()) .add::() .build(); - let dataset_repo = catalog.get_one::().unwrap(); - let dataset_repo_writer = catalog.get_one::().unwrap(); let reset_svc = catalog.get_one::().unwrap(); Self { - _temp_dir: tempdir, - dataset_repo, - dataset_repo_writer, + base_repo_harness, reset_svc, } } @@ -210,7 +185,7 @@ impl ResetTestHarness { .build_typed(); let create_result = self - .dataset_repo_writer + .dataset_repo_writer() .create_dataset(&DatasetAlias::new(None, dataset_name.clone()), seed_block) .await .unwrap(); @@ -230,9 +205,21 @@ impl ResetTestHarness { ChainWith2BlocksTestCase::new(dataset_handle, hash_seed_block, hash_polling_source_block) } + async fn reset_dataset( + &self, + dataset_handle: &DatasetHandle, + block_hash: Option<&Multihash>, + old_head_maybe: Option<&Multihash>, + ) -> Result { + let resolved_dataset = self.resolve_dataset(dataset_handle); + self.reset_svc + .reset_dataset(resolved_dataset, block_hash, old_head_maybe) + .await + } + async fn get_dataset_head(&self, dataset_handle: &DatasetHandle) -> Multihash { - let dataset = self.resolve_dataset(dataset_handle); - dataset + let resolved_dataset = self.resolve_dataset(dataset_handle); + resolved_dataset .as_metadata_chain() .resolve_ref(&BlockRef::Head) .await @@ -240,15 +227,16 @@ impl ResetTestHarness { } async fn get_dataset_summary(&self, dataset_handle: &DatasetHandle) -> DatasetSummary { - let dataset = self.resolve_dataset(dataset_handle); - dataset + let resolved_dataset = self.resolve_dataset(dataset_handle); + resolved_dataset .get_summary(GetSummaryOpts::default()) .await .unwrap() } - fn resolve_dataset(&self, dataset_handle: &DatasetHandle) -> Arc { - self.dataset_repo.get_dataset_by_handle(dataset_handle) + fn resolve_dataset(&self, dataset_handle: &DatasetHandle) -> ResolvedDataset { + self.dataset_registry() + .get_dataset_by_handle(dataset_handle) } } diff --git a/src/infra/core/tests/tests/test_search_service_impl.rs b/src/infra/core/tests/tests/test_search_service_impl.rs index 274c826c9c..5b6008ceed 100644 --- a/src/infra/core/tests/tests/test_search_service_impl.rs +++ b/src/infra/core/tests/tests/test_search_service_impl.rs @@ -12,6 +12,7 @@ use std::path::Path; use dill::Component; use kamu::domain::*; use kamu::testing::*; +use kamu::utils::simple_transfer_protocol::SimpleTransferProtocol; use kamu::*; use kamu_accounts::CurrentAccountSubject; use messaging_outbox::DummyOutboxImpl; @@ -32,13 +33,11 @@ async fn do_test_search(tmp_workspace_dir: &Path, repo_url: Url) { let catalog = dill::CatalogBuilder::new() .add::() .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() .bind::() + .add::() .add::() .add_value(RemoteRepositoryRegistryImpl::create(tmp_workspace_dir.join("repos")).unwrap()) .bind::() @@ -47,7 +46,9 @@ async fn 
do_test_search(tmp_workspace_dir: &Path, repo_url: Url) { .add::() .add::() .add::() + .add::() .add::() + .add::() .add::() .add::() .add::() @@ -56,6 +57,7 @@ async fn do_test_search(tmp_workspace_dir: &Path, repo_url: Url) { let remote_repo_reg = catalog.get_one::().unwrap(); let dataset_repo_writer = catalog.get_one::().unwrap(); let sync_svc = catalog.get_one::().unwrap(); + let sync_request_builder = catalog.get_one::().unwrap(); let search_svc = catalog.get_one::().unwrap(); // Add repository @@ -77,8 +79,14 @@ async fn do_test_search(tmp_workspace_dir: &Path, repo_url: Url) { sync_svc .sync( - &dataset_local_alias.as_any_ref(), - &dataset_remote_alias.as_any_ref(), + sync_request_builder + .build_sync_request( + dataset_local_alias.as_any_ref(), + dataset_remote_alias.as_any_ref(), + true, + ) + .await + .unwrap(), SyncOptions::default(), None, ) diff --git a/src/infra/core/tests/tests/test_setup.rs b/src/infra/core/tests/tests/test_setup.rs index 1432650f66..b2acd21ccf 100644 --- a/src/infra/core/tests/tests/test_setup.rs +++ b/src/infra/core/tests/tests/test_setup.rs @@ -26,7 +26,6 @@ async fn test_setup_pull_images() { .ensure_image(docker_images::FLINK, None) .await .unwrap();*/ - // Disabled for disk space issue // See: https://github.com/kamu-data/kamu-cli/issues/599 // container_runtime diff --git a/src/infra/core/tests/tests/test_sync_service_impl.rs b/src/infra/core/tests/tests/test_sync_service_impl.rs index d6724da6c7..1d29d1c6f2 100644 --- a/src/infra/core/tests/tests/test_sync_service_impl.rs +++ b/src/infra/core/tests/tests/test_sync_service_impl.rs @@ -15,6 +15,7 @@ use dill::Component; use kamu::domain::*; use kamu::testing::*; use kamu::utils::ipfs_wrapper::IpfsClient; +use kamu::utils::simple_transfer_protocol::SimpleTransferProtocol; use kamu::*; use kamu_accounts::CurrentAccountSubject; use messaging_outbox::DummyOutboxImpl; @@ -49,39 +50,12 @@ async fn assert_in_sync( //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct AuthorizationExpectations { - pub reads: usize, - pub writes: usize, -} - -impl Default for AuthorizationExpectations { - fn default() -> Self { - Self { - reads: 8, - writes: 1, - } - } -} - -fn construct_authorizer( - authorization_expectations: &AuthorizationExpectations, - alias: &DatasetAlias, -) -> impl auth::DatasetActionAuthorizer { - MockDatasetActionAuthorizer::new() - .expect_check_read_dataset(alias, authorization_expectations.reads, true) - .expect_check_write_dataset(alias, authorization_expectations.writes, true) -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - async fn do_test_sync( tmp_workspace_dir_foo: &Path, tmp_workspace_dir_bar: &Path, push_ref: &DatasetRefRemote, pull_ref: &DatasetRefRemote, ipfs: Option<(IpfsGateway, IpfsClient)>, - auth_expectations_foo: AuthorizationExpectations, - auth_expectations_bar: AuthorizationExpectations, ) { // Tests sync between "foo" -> remote -> "bar" let dataset_alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); @@ -90,9 +64,6 @@ async fn do_test_sync( let (ipfs_gateway, ipfs_client) = ipfs.unwrap_or_default(); - let dataset_authorizer_foo = construct_authorizer(&auth_expectations_foo, &dataset_alias_foo); - let dataset_authorizer_bar = construct_authorizer(&auth_expectations_bar, &dataset_alias_bar); - let datasets_dir_foo = tmp_workspace_dir_foo.join("datasets"); let datasets_dir_bar = 
tmp_workspace_dir_bar.join("datasets"); std::fs::create_dir(&datasets_dir_foo).unwrap(); @@ -103,21 +74,19 @@ async fn do_test_sync( .add_value(ipfs_gateway.clone()) .add_value(ipfs_client.clone()) .add_value(CurrentAccountSubject::new_test()) - .add_value(dataset_authorizer_foo) - .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir_foo) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir_foo)) .bind::() .bind::() + .add::() .add_value(RemoteReposDir::new(tmp_workspace_dir_foo.join("repos"))) .add::() .add::() .add::() .add::() + .add::() .add::() + .add::() .add::() .add::() .build(); @@ -127,51 +96,44 @@ async fn do_test_sync( .add_value(ipfs_gateway.clone()) .add_value(ipfs_client.clone()) .add_value(CurrentAccountSubject::new_test()) - .add_value(dataset_authorizer_bar) - .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir_bar) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir_bar)) .bind::() .bind::() + .add::() .add_value(RemoteReposDir::new(tmp_workspace_dir_bar.join("repos"))) .add::() .add::() .add::() .add::() + .add::() .add::() + .add::() .add::() .add::() .build(); let sync_svc_foo = catalog_foo.get_one::().unwrap(); - let sync_svc_bar = catalog_bar.get_one::().unwrap(); + let sync_request_builder_foo = catalog_foo.get_one::().unwrap(); let dataset_repo_foo = catalog_foo.get_one::().unwrap(); + let dataset_registry_foo = catalog_foo.get_one::().unwrap(); + + let sync_svc_bar = catalog_bar.get_one::().unwrap(); + let sync_request_builder_bar = catalog_bar.get_one::().unwrap(); let dataset_repo_bar = catalog_bar.get_one::().unwrap(); + let dataset_registry_bar = catalog_bar.get_one::().unwrap(); // Dataset does not exist locally / remotely assert_matches!( - sync_svc_foo - .sync( - &dataset_alias_foo.as_any_ref(), - &push_ref.as_any_ref(), - SyncOptions::default(), - None, - ) + sync_request_builder_foo + .build_sync_request(dataset_alias_foo.as_any_ref(), push_ref.as_any_ref(), true) .await, Err(SyncError::DatasetNotFound(e)) if e.dataset_ref == dataset_alias_foo.as_any_ref() ); assert_matches!( - sync_svc_bar - .sync( - &pull_ref.as_any_ref(), - &dataset_alias_bar.as_any_ref(), - SyncOptions::default(), - None, - ) + sync_request_builder_bar + .build_sync_request(pull_ref.as_any_ref(), dataset_alias_bar.as_any_ref(), true) .await, Err(SyncError::DatasetNotFound(e)) if e.dataset_ref == pull_ref.as_any_ref() ); @@ -192,33 +154,52 @@ async fn do_test_sync( // Initial sync /////////////////////////////////////////////////////////// assert_matches!( - sync_svc_foo.sync( - &dataset_alias_foo.as_any_ref(), - &push_ref.as_any_ref(), - SyncOptions { create_if_not_exists: false, ..Default::default() }, - None - ).await, + sync_request_builder_foo + .build_sync_request(dataset_alias_foo.as_any_ref(), push_ref.as_any_ref(), false) + .await, Err(SyncError::DatasetNotFound(e)) if e.dataset_ref == push_ref.as_any_ref() ); + let sync_result = sync_svc_foo + .sync( + sync_request_builder_foo + .build_sync_request(dataset_alias_foo.as_any_ref(), push_ref.as_any_ref(), true) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .unwrap(); assert_matches!( - sync_svc_foo.sync(&dataset_alias_foo.as_any_ref(), &push_ref.as_any_ref(), SyncOptions::default(), None).await, - Ok(SyncResult::Updated { + sync_result, + 
SyncResult::Updated { old_head: None, new_head, num_blocks: 2, .. - }) if new_head == b1 + } if new_head == b1 ); + let sync_result = sync_svc_bar + .sync( + sync_request_builder_bar + .build_sync_request(pull_ref.as_any_ref(), dataset_alias_bar.as_any_ref(), true) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .unwrap(); assert_matches!( - sync_svc_bar.sync(&pull_ref.as_any_ref(), &dataset_alias_bar.as_any_ref(), SyncOptions::default(), None).await, - Ok(SyncResult::Updated { + sync_result, + SyncResult::Updated { old_head: None, new_head, num_blocks: 2, .. - }) if new_head == b1 + } if new_head == b1 ); assert_in_sync( @@ -231,45 +212,80 @@ async fn do_test_sync( // Subsequent sync //////////////////////////////////////////////////////// let _b2 = DatasetTestHelper::append_random_data( - dataset_repo_foo.as_ref(), + dataset_registry_foo.as_ref(), &dataset_alias_foo, FILE_DATA_ARRAY_SIZE, ) .await; let b3 = DatasetTestHelper::append_random_data( - dataset_repo_foo.as_ref(), + dataset_registry_foo.as_ref(), &dataset_alias_foo, FILE_DATA_ARRAY_SIZE, ) .await; + let sync_err = sync_svc_foo + .sync( + sync_request_builder_foo + .build_sync_request(pull_ref.as_any_ref(), dataset_alias_foo.as_any_ref(), false) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .err() + .unwrap(); assert_matches!( - sync_svc_foo.sync(&pull_ref.as_any_ref(), &dataset_alias_foo.as_any_ref(), SyncOptions::default(), None).await, - Err(SyncError::DestinationAhead(DestinationAheadError { + sync_err, + SyncError::DestinationAhead(DestinationAheadError { src_head, - dst_head, dst_ahead_size: 2 })) + dst_head, dst_ahead_size: 2 } + ) if src_head == b1 && dst_head == b3 ); + let sync_result = sync_svc_foo + .sync( + sync_request_builder_foo + .build_sync_request(dataset_alias_foo.as_any_ref(), push_ref.as_any_ref(), false) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .unwrap(); assert_matches!( - sync_svc_foo.sync(&dataset_alias_foo.as_any_ref(), &push_ref.as_any_ref(), SyncOptions::default(), None).await, - Ok(SyncResult::Updated { + sync_result, + SyncResult::Updated { old_head, new_head, num_blocks: 2, .. - }) if old_head.as_ref() == Some(&b1) && new_head == b3 + } if old_head.as_ref() == Some(&b1) && new_head == b3 ); + let sync_result = sync_svc_bar + .sync( + sync_request_builder_bar + .build_sync_request(pull_ref.as_any_ref(), dataset_alias_bar.as_any_ref(), false) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .unwrap(); assert_matches!( - sync_svc_bar.sync(&pull_ref.as_any_ref(), &dataset_alias_bar.as_any_ref(), SyncOptions::default(), None).await, - Ok(SyncResult::Updated { + sync_result, + SyncResult::Updated { old_head, new_head, num_blocks: 2, .. 
- }) if old_head.as_ref() == Some(&b1) && new_head == b3 + } if old_head.as_ref() == Some(&b1) && new_head == b3 ); assert_in_sync( @@ -281,29 +297,31 @@ async fn do_test_sync( .await; // Up to date ///////////////////////////////////////////////////////////// - assert_matches!( - sync_svc_foo - .sync( - &dataset_alias_foo.as_any_ref(), - &push_ref.as_any_ref(), - SyncOptions::default(), - None - ) - .await, - Ok(SyncResult::UpToDate) - ); - - assert_matches!( - sync_svc_bar - .sync( - &pull_ref.as_any_ref(), - &dataset_alias_bar.as_any_ref(), - SyncOptions::default(), - None - ) - .await, - Ok(SyncResult::UpToDate) - ); + let sync_result = sync_svc_foo + .sync( + sync_request_builder_foo + .build_sync_request(dataset_alias_foo.as_any_ref(), push_ref.as_any_ref(), false) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .unwrap(); + assert_matches!(sync_result, SyncResult::UpToDate,); + + let sync_result = sync_svc_bar + .sync( + sync_request_builder_bar + .build_sync_request(pull_ref.as_any_ref(), dataset_alias_bar.as_any_ref(), false) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .unwrap(); + assert_matches!(sync_result, SyncResult::UpToDate); assert_in_sync( &dataset_repo_foo, @@ -317,132 +335,198 @@ async fn do_test_sync( // Push a new block into dataset_bar (which we were pulling into before) let exta_head = DatasetTestHelper::append_random_data( - dataset_repo_bar.as_ref(), + dataset_registry_bar.as_ref(), &dataset_alias_bar, FILE_DATA_ARRAY_SIZE, ) .await; + let sync_result = sync_svc_bar + .sync( + sync_request_builder_bar + .build_sync_request(dataset_alias_bar.as_any_ref(), push_ref.as_any_ref(), false) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .unwrap(); assert_matches!( - sync_svc_bar.sync(&dataset_alias_bar.as_any_ref(), &push_ref.as_any_ref(), SyncOptions::default(), None).await, - Ok(SyncResult::Updated { + sync_result, + SyncResult::Updated { old_head, new_head, num_blocks: 1, .. - }) if old_head == Some(b3.clone()) && new_head == exta_head + } if old_head == Some(b3.clone()) && new_head == exta_head ); // Try push from dataset_foo + let sync_err = sync_svc_foo + .sync( + sync_request_builder_foo + .build_sync_request(dataset_alias_foo.as_any_ref(), push_ref.as_any_ref(), false) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .err() + .unwrap(); assert_matches!( - sync_svc_foo.sync(&dataset_alias_foo.as_any_ref(), &push_ref.as_any_ref(), SyncOptions::default(), None).await, - Err(SyncError::DestinationAhead(DestinationAheadError { + sync_err, + SyncError::DestinationAhead(DestinationAheadError { src_head, dst_head, dst_ahead_size: 1 - })) if src_head == b3 && dst_head == exta_head + }) if src_head == b3 && dst_head == exta_head ); // Try push from dataset_1 with --force: it should abandon the diverged_head // block + let sync_result = sync_svc_foo + .sync( + sync_request_builder_foo + .build_sync_request(dataset_alias_foo.as_any_ref(), push_ref.as_any_ref(), false) + .await + .unwrap(), + SyncOptions { + force: true, + ..SyncOptions::default() + }, + None, + ) + .await + .unwrap(); assert_matches!( - sync_svc_foo - .sync( - &dataset_alias_foo.as_any_ref(), - &push_ref.as_any_ref(), - SyncOptions { - force: true, - ..SyncOptions::default() - }, - None - ) - .await, - Ok(SyncResult::Updated { + sync_result, + SyncResult::Updated { old_head, new_head, num_blocks: 4, // full resynchronization: seed, b1, b2, b3 .. 
- }) if old_head == Some(exta_head.clone()) && new_head == b3 + } if old_head == Some(exta_head.clone()) && new_head == b3 ); // Try pulling dataset_bar: should fail, destination is ahead + let sync_err = sync_svc_bar + .sync( + sync_request_builder_bar + .build_sync_request(pull_ref.as_any_ref(), dataset_alias_bar.as_any_ref(), false) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .err() + .unwrap(); assert_matches!( - sync_svc_bar.sync(&pull_ref.as_any_ref(), &dataset_alias_bar.as_any_ref(), SyncOptions::default(), None).await, - Err(SyncError::DestinationAhead(DestinationAheadError { + sync_err, + SyncError::DestinationAhead(DestinationAheadError { src_head, dst_head, dst_ahead_size: 1 - })) if src_head == b3 && dst_head == exta_head + }) if src_head == b3 && dst_head == exta_head ); // Try pulling dataset_bar with --force: should abandon diverged_head + let sync_result = sync_svc_bar + .sync( + sync_request_builder_bar + .build_sync_request(pull_ref.as_any_ref(), dataset_alias_bar.as_any_ref(), false) + .await + .unwrap(), + SyncOptions { + force: true, + ..SyncOptions::default() + }, + None, + ) + .await + .unwrap(); assert_matches!( - sync_svc_bar - .sync( - &pull_ref.as_any_ref(), - &dataset_alias_bar.as_any_ref(), - SyncOptions { - force: true, - .. SyncOptions::default() - }, - None - ) - .await, - Ok(SyncResult::Updated { + sync_result, + SyncResult::Updated { old_head, new_head, num_blocks: 4, // full resynchronization: seed, b1, b2, b3 .. - }) if old_head == Some(exta_head.clone()) && new_head == b3 + } if old_head == Some(exta_head.clone()) && new_head == b3 ); // Datasets complex divergence ////////////////////////////////////////////// let _b4 = DatasetTestHelper::append_random_data( - dataset_repo_foo.as_ref(), + dataset_registry_foo.as_ref(), &dataset_alias_foo, FILE_DATA_ARRAY_SIZE, ) .await; let b5 = DatasetTestHelper::append_random_data( - dataset_repo_foo.as_ref(), + dataset_registry_foo.as_ref(), &dataset_alias_foo, FILE_DATA_ARRAY_SIZE, ) .await; let b4_alt = DatasetTestHelper::append_random_data( - dataset_repo_bar.as_ref(), + dataset_registry_bar.as_ref(), &dataset_alias_bar, FILE_DATA_ARRAY_SIZE, ) .await; + let sync_result = sync_svc_foo + .sync( + sync_request_builder_foo + .build_sync_request(dataset_alias_foo.as_any_ref(), push_ref.as_any_ref(), false) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .unwrap(); assert_matches!( - sync_svc_foo.sync(&dataset_alias_foo.as_any_ref(), &push_ref.as_any_ref(), - SyncOptions::default(), None).await, Ok(SyncResult::Updated { + sync_result, + SyncResult::Updated { old_head, new_head, num_blocks: 2, .. 
- }) if old_head.as_ref() == Some(&b3) && new_head == b5 + } if old_head.as_ref() == Some(&b3) && new_head == b5 ); + let sync_err = sync_svc_bar + .sync( + sync_request_builder_bar + .build_sync_request(dataset_alias_bar.as_any_ref(), push_ref.as_any_ref(), false) + .await + .unwrap(), + SyncOptions::default(), + None, + ) + .await + .err() + .unwrap(); assert_matches!( - sync_svc_bar.sync(&dataset_alias_bar.as_any_ref(), &push_ref.as_any_ref(), SyncOptions::default(), None).await, - Err(SyncError::DatasetsDiverged(DatasetsDivergedError { + sync_err, + SyncError::DatasetsDiverged(DatasetsDivergedError { src_head, dst_head, detail: Some(DatasetsDivergedErrorDetail { uncommon_blocks_in_src, uncommon_blocks_in_dst }) - })) + }) if src_head == b4_alt && dst_head == b5 && uncommon_blocks_in_src == 1 && uncommon_blocks_in_dst == 2 ); // Datasets corrupted transfer flow ///////////////////////////////////////// if is_ipfs { let _b6 = DatasetTestHelper::append_random_data( - dataset_repo_foo.as_ref(), + dataset_registry_foo.as_ref(), &dataset_alias_foo, FILE_DATA_ARRAY_SIZE, ) @@ -456,26 +540,35 @@ async fn do_test_sync( for _i in 0..15 { DatasetTestHelper::append_random_data( - dataset_repo_foo.as_ref(), + dataset_registry_foo.as_ref(), &dataset_alias_foo, FILE_DATA_ARRAY_SIZE, ) .await; } - assert_matches!( - sync_svc_foo + let sync_err = sync_svc_foo .sync( - &dataset_alias_foo.as_any_ref(), - &push_ref.as_any_ref(), + sync_request_builder_foo + .build_sync_request( + dataset_alias_foo.as_any_ref(), + push_ref.as_any_ref(), + false, + ) + .await + .unwrap(), SyncOptions::default(), None, ) - .await, - Err(SyncError::Corrupted(CorruptedSourceError { + .await + .err() + .unwrap(); + assert_matches!( + sync_err, + SyncError::Corrupted(CorruptedSourceError { message, .. 
- })) if message == *"Source checkpoint file is missing" + }) if message == *"Source checkpoint file is missing" ); } } @@ -495,11 +588,6 @@ async fn test_sync_to_from_local_fs() { &DatasetRefRemote::from(&repo_url), &DatasetRefRemote::from(&repo_url), None, - AuthorizationExpectations::default(), - AuthorizationExpectations { - reads: 2, - writes: 4, - }, ) .await; } @@ -519,11 +607,6 @@ async fn test_sync_to_from_s3() { &DatasetRefRemote::from(&s3.url), &DatasetRefRemote::from(&s3.url), None, - AuthorizationExpectations::default(), - AuthorizationExpectations { - reads: 2, - writes: 4, - }, ) .await; } @@ -549,11 +632,6 @@ async fn test_sync_from_http() { &DatasetRefRemote::from(push_repo_url), &DatasetRefRemote::from(pull_repo_url), None, - AuthorizationExpectations::default(), - AuthorizationExpectations { - reads: 2, - writes: 4, - }, ) .await; } @@ -583,14 +661,6 @@ async fn test_sync_to_from_ipfs() { }, ipfs_client, )), - AuthorizationExpectations { - reads: 7, - ..Default::default() - }, - AuthorizationExpectations { - reads: 2, - writes: 4, - }, ) .await; } diff --git a/src/infra/core/tests/tests/test_transform_service_impl.rs b/src/infra/core/tests/tests/test_transform_service_impl.rs index 8265cbb1ac..7180333adb 100644 --- a/src/infra/core/tests/tests/test_transform_service_impl.rs +++ b/src/infra/core/tests/tests/test_transform_service_impl.rs @@ -29,19 +29,17 @@ use crate::mock_engine_provisioner; struct TransformTestHarness { _tempdir: TempDir, - dataset_repo: Arc, + dataset_registry: Arc, dataset_repo_writer: Arc, - transform_service: Arc, + transform_request_planner: Arc, + transform_elab_svc: Arc, + transform_exec_svc: Arc, compaction_service: Arc, push_ingest_svc: Arc, } impl TransformTestHarness { - pub fn new_custom< - TAuthorizer: auth::DatasetActionAuthorizer + 'static, - TEngineProvisioner: EngineProvisioner + 'static, - >( - dataset_action_authorizer: TAuthorizer, + pub fn new_custom( engine_provisioner: TEngineProvisioner, ) -> Self { let tempdir = tempfile::tempdir().unwrap(); @@ -53,15 +51,11 @@ impl TransformTestHarness { let catalog = dill::CatalogBuilder::new() .add_value(RunInfoDir::new(run_info_dir)) .add_value(CurrentAccountSubject::new_test()) - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) + .add_value(TenancyConfig::SingleTenant) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) .bind::() + .add::() .bind::() - .add_value(dataset_action_authorizer) - .bind::() .add::() .add::() .add::() @@ -71,25 +65,26 @@ impl TransformTestHarness { .bind::() .add_value(engine_provisioner) .bind::() - .add::() + .add::() + .add::() + .add::() .add::() .build(); Self { _tempdir: tempdir, - dataset_repo: catalog.get_one().unwrap(), + dataset_registry: catalog.get_one().unwrap(), dataset_repo_writer: catalog.get_one().unwrap(), - transform_service: catalog.get_one().unwrap(), compaction_service: catalog.get_one().unwrap(), push_ingest_svc: catalog.get_one().unwrap(), + transform_request_planner: catalog.get_one().unwrap(), + transform_elab_svc: catalog.get_one().unwrap(), + transform_exec_svc: catalog.get_one().unwrap(), } } pub fn new() -> Self { - Self::new_custom( - auth::AlwaysHappyDatasetActionAuthorizer::new(), - EngineProvisionerNull, - ) + Self::new_custom(EngineProvisionerNull) } pub async fn new_root(&self, name: &str) -> DatasetHandle { @@ -112,7 +107,7 @@ impl TransformTestHarness { &self, name: &str, inputs: &[DatasetAlias], - ) -> (DatasetHandle, SetTransform) { + ) -> 
(CreateDatasetResult, SetTransform) { let transform = MetadataFactory::set_transform() .inputs_from_refs(inputs) .build(); @@ -129,7 +124,7 @@ impl TransformTestHarness { .await .unwrap() .create_dataset_result; - (create_result.dataset_handle, transform) + (create_result, transform) } pub async fn append_block( @@ -137,12 +132,13 @@ impl TransformTestHarness { dataset_ref: impl Into, block: MetadataBlock, ) -> Multihash { - let ds = self - .dataset_repo - .find_dataset_by_ref(&dataset_ref.into()) + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(&dataset_ref.into()) .await .unwrap(); - ds.as_metadata_chain() + resolved_dataset + .as_metadata_chain() .append(block, AppendOpts::default()) .await .unwrap() @@ -154,12 +150,12 @@ impl TransformTestHarness { alias: &DatasetAlias, records: u64, ) -> (Multihash, MetadataBlockTyped) { - let ds = self - .dataset_repo - .find_dataset_by_ref(&alias.as_local_ref()) + let resolved_dataset = self + .dataset_registry + .get_dataset_by_ref(&alias.as_local_ref()) .await .unwrap(); - let chain = ds.as_metadata_chain(); + let chain = resolved_dataset.as_metadata_chain(); let offset = chain .iter_blocks() .filter_map_ok(|(_, b)| b.event.into_variant::()) @@ -188,12 +184,12 @@ impl TransformTestHarness { (block_hash, block.into_typed::().unwrap()) } - async fn ingest_data(&self, data_str: String, dataset_ref: &DatasetRef) { + async fn ingest_data(&self, data_str: String, dataset_created: &CreateDatasetResult) { let data = std::io::Cursor::new(data_str); self.push_ingest_svc .ingest_from_file_stream( - dataset_ref, + ResolvedDataset::from(dataset_created), None, Box::new(data), PushIngestOpts::default(), @@ -202,6 +198,46 @@ impl TransformTestHarness { .await .unwrap(); } + + async fn elaborate_transform( + &self, + deriv_dataset: &CreateDatasetResult, + options: TransformOptions, + ) -> Result { + let target = ResolvedDataset::from(deriv_dataset); + self.transform_elab_svc + .elaborate_transform( + target.clone(), + self.transform_request_planner + .build_transform_preliminary_plan(target) + .await + .unwrap(), + options, + None, + ) + .await + } + + async fn transform( + &self, + deriv_dataset: &CreateDatasetResult, + options: TransformOptions, + ) -> Result { + let target = ResolvedDataset::from(deriv_dataset); + let elaboration = self + .elaborate_transform(deriv_dataset, options) + .await + .map_err(TransformError::Elaborate)?; + match elaboration { + TransformElaboration::UpToDate => Ok(TransformResult::UpToDate), + TransformElaboration::Elaborated(plan) => self + .transform_exec_svc + .execute_transform(target, plan, None) + .await + .1 + .map_err(TransformError::Execute), + } + } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -214,21 +250,22 @@ async fn test_get_next_operation() { let (bar, bar_source) = harness.new_deriv("bar", &[foo.alias.clone()]).await; // No data - no work - assert_eq!( - harness - .transform_service - .get_next_operation(&bar, Utc::now()) - .await - .unwrap(), - None - ); + let elaboration = harness + .elaborate_transform(&bar, TransformOptions::default()) + .await + .unwrap(); + assert_matches!(elaboration, TransformElaboration::UpToDate); let (foo_head, foo_block) = harness.append_data_block(&foo.alias, 10).await; let foo_slice = foo_block.event.new_data.as_ref().unwrap(); + let elaboration = harness + .elaborate_transform(&bar, TransformOptions::default()) + .await + .unwrap(); assert!(matches!( - 
harness.transform_service.get_next_operation(&bar, Utc::now()).await.unwrap(), - Some(TransformRequestExt{ transform, inputs, .. }) + elaboration, + TransformElaboration::Elaborated(TransformPlan { request: TransformRequestExt{ transform, inputs, .. }, datasets_map: _ } ) if transform == bar_source.transform && inputs == vec![TransformRequestInputExt { dataset_handle: foo.clone(), @@ -250,63 +287,6 @@ async fn test_get_next_operation() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[test_log::test(tokio::test)] -async fn test_transform_enforces_authorization() { - let mock_dataset_action_authorizer = MockDatasetActionAuthorizer::new() - .expect_check_read_dataset( - &DatasetAlias::new(None, DatasetName::new_unchecked("foo")), - 1, - true, - ) - .expect_check_write_dataset( - &DatasetAlias::new(None, DatasetName::new_unchecked("bar")), - 1, - true, - ); - - let harness = TransformTestHarness::new_custom( - mock_dataset_action_authorizer, - mock_engine_provisioner::MockEngineProvisioner::new().stub_provision_engine(), - ); - - let foo = harness.new_root("foo").await; - let (_, _) = harness.append_data_block(&foo.alias, 10).await; - - let (bar, _) = harness.new_deriv("bar", &[foo.alias.clone()]).await; - - let transform_result = harness - .transform_service - .transform(&bar.as_local_ref(), TransformOptions::default(), None) - .await; - - assert_matches!(transform_result, Ok(_)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#[test_log::test(tokio::test)] -async fn test_transform_unauthorized() { - let harness = TransformTestHarness::new_custom( - MockDatasetActionAuthorizer::denying(), - EngineProvisionerNull, - ); - - let foo = harness.new_root("foo").await; - let (bar, _) = harness.new_deriv("bar", &[foo.alias.clone()]).await; - - let transform_result = harness - .transform_service - .transform(&bar.as_local_ref(), TransformOptions::default(), None) - .await; - - assert_matches!( - transform_result, - Err(TransformError::Access(AccessError::Forbidden(_))) - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - #[test_log::test(tokio::test)] async fn test_get_verification_plan_one_to_one() { let harness = TransformTestHarness::new(); @@ -390,7 +370,7 @@ async fn test_get_verification_plan_one_to_one() { .unwrap() .new_head; - let deriv_hdl = deriv_create_result.dataset_handle; + let deriv_hdl = &deriv_create_result.dataset_handle; let deriv_initial_sequence_number = 2; // T1: Root data added @@ -429,15 +409,21 @@ async fn test_get_verification_plan_one_to_one() { // T2: Transform [SEED; T1] let t2 = Utc.with_ymd_and_hms(2020, 1, 2, 12, 0, 0).unwrap(); - let deriv_req_t2 = harness - .transform_service - .get_next_operation(&deriv_hdl, t2) + let deriv_req_t2 = match harness + .elaborate_transform(&deriv_create_result, TransformOptions::default()) .await .unwrap() - .unwrap(); + { + TransformElaboration::Elaborated(plan) => TransformRequestExt { + system_time: t2, + ..plan.request + }, + TransformElaboration::UpToDate => panic!("Unexpected transform elab status"), + }; + let deriv_head_t2 = harness .append_block( - &deriv_hdl, + deriv_hdl, MetadataFactory::metadata_block(ExecuteTransform { query_inputs: vec![ExecuteTransformInput { dataset_id: root_hdl.id.clone(), @@ -501,15 +487,20 @@ async fn test_get_verification_plan_one_to_one() { // T4: Transform (T1; 
T3] let t4 = Utc.with_ymd_and_hms(2020, 1, 4, 12, 0, 0).unwrap(); - let deriv_req_t4 = harness - .transform_service - .get_next_operation(&deriv_hdl, t4) + let deriv_req_t4 = match harness + .elaborate_transform(&deriv_create_result, TransformOptions::default()) .await .unwrap() - .unwrap(); + { + TransformElaboration::Elaborated(plan) => TransformRequestExt { + system_time: t4, + ..plan.request + }, + TransformElaboration::UpToDate => panic!("Unexpected transform elab status"), + }; let deriv_head_t4 = harness .append_block( - &deriv_hdl, + deriv_hdl, MetadataFactory::metadata_block(ExecuteTransform { query_inputs: vec![ExecuteTransformInput { dataset_id: root_hdl.id.clone(), @@ -559,15 +550,20 @@ async fn test_get_verification_plan_one_to_one() { // T6: Transform (T3; T5] let t6 = Utc.with_ymd_and_hms(2020, 1, 6, 12, 0, 0).unwrap(); - let deriv_req_t6 = harness - .transform_service - .get_next_operation(&deriv_hdl, t6) + let deriv_req_t6 = match harness + .elaborate_transform(&deriv_create_result, TransformOptions::default()) .await .unwrap() - .unwrap(); + { + TransformElaboration::Elaborated(plan) => TransformRequestExt { + system_time: t6, + ..plan.request + }, + TransformElaboration::UpToDate => panic!("Unexpected transform elab status"), + }; let deriv_head_t6 = harness .append_block( - &deriv_hdl, + deriv_hdl, MetadataFactory::metadata_block(ExecuteTransform { query_inputs: vec![ExecuteTransformInput { dataset_id: root_hdl.id.clone(), @@ -596,44 +592,47 @@ async fn test_get_verification_plan_one_to_one() { ) .await; - let plan = harness - .transform_service - .get_verification_plan(&deriv_hdl, (None, None)) + let operation: VerifyTransformOperation = harness + .transform_request_planner + .build_transform_verification_plan( + ResolvedDataset::from(&deriv_create_result), + (None, None), + ) .await .unwrap(); - let deriv_ds = harness.dataset_repo.get_dataset_by_handle(&deriv_hdl); - let deriv_chain = deriv_ds.as_metadata_chain(); + let deriv_chain = deriv_create_result.dataset.as_metadata_chain(); - assert_eq!(plan.len(), 3); + assert_eq!(operation.steps.len(), 3); - assert_eq!(plan[0].expected_hash, deriv_head_t2); + assert_eq!(operation.steps[0].expected_hash, deriv_head_t2); assert_eq!( - plan[0].expected_block, + operation.steps[0].expected_block, deriv_chain.get_block(&deriv_head_t2).await.unwrap() ); - assert_eq!(plan[1].expected_hash, deriv_head_t4); + assert_eq!(operation.steps[1].expected_hash, deriv_head_t4); assert_eq!( - plan[1].expected_block, + operation.steps[1].expected_block, deriv_chain.get_block(&deriv_head_t4).await.unwrap() ); - assert_eq!(plan[2].expected_hash, deriv_head_t6); + assert_eq!(operation.steps[2].expected_hash, deriv_head_t6); assert_eq!( - plan[2].expected_block, + operation.steps[2].expected_block, deriv_chain.get_block(&deriv_head_t6).await.unwrap() ); - assert_requests_equivalent(&plan[0].request, deriv_req_t2); - assert_requests_equivalent(&plan[1].request, deriv_req_t4); - assert_requests_equivalent(&plan[2].request, deriv_req_t6); + assert_requests_equivalent(&operation.steps[0].request, deriv_req_t2); + assert_requests_equivalent(&operation.steps[1].request, deriv_req_t4); + assert_requests_equivalent(&operation.steps[2].request, deriv_req_t6); } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + #[test_log::test(tokio::test)] async fn test_transform_with_compaction_retry() { let harness = TransformTestHarness::new_custom( - 
auth::AlwaysHappyDatasetActionAuthorizer::new(), mock_engine_provisioner::MockEngineProvisioner::new().always_provision_engine(), ); let root_alias = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); @@ -680,10 +679,7 @@ async fn test_transform_with_compaction_retry() { " ); harness - .ingest_data( - data_str.to_string(), - &foo_created_result.dataset_handle.as_local_ref(), - ) + .ingest_data(data_str.to_string(), &foo_created_result) .await; let data_str = indoc!( " @@ -694,56 +690,43 @@ async fn test_transform_with_compaction_retry() { " ); harness - .ingest_data( - data_str.to_string(), - &foo_created_result.dataset_handle.as_local_ref(), - ) + .ingest_data(data_str.to_string(), &foo_created_result) .await; let (bar, _) = harness .new_deriv("bar", &[foo_created_result.dataset_handle.alias.clone()]) .await; - let transform_result = harness - .transform_service - .transform(&bar.as_local_ref(), TransformOptions::default(), None) - .await; - + let transform_result = harness.transform(&bar, TransformOptions::default()).await; assert_matches!(transform_result, Ok(TransformResult::Updated { .. })); + let foo_dataset = harness + .dataset_registry + .get_dataset_by_handle(&foo_created_result.dataset_handle); + harness .compaction_service - .compact_dataset( - &foo_created_result.dataset_handle, - CompactionOptions::default(), - None, - ) + .compact_dataset(foo_dataset, CompactionOptions::default(), None) .await .unwrap(); - let transform_result = harness - .transform_service - .transform(&bar.as_local_ref(), TransformOptions::default(), None) - .await; + let transform_result = harness.transform(&bar, TransformOptions::default()).await; assert_matches!( transform_result, - Err(TransformError::InvalidInputInterval( - InvalidInputIntervalError { .. } + Err(TransformError::Elaborate( + TransformElaborateError::InvalidInputInterval(InvalidInputIntervalError { .. }) )) ); let transform_result = harness - .transform_service .transform( - &bar.as_local_ref(), + &bar, TransformOptions { reset_derivatives_on_diverged_input: true, }, - None, ) .await; - assert_matches!(transform_result, Ok(TransformResult::Updated { .. })); } diff --git a/src/infra/core/tests/tests/test_verification_service_impl.rs b/src/infra/core/tests/tests/test_verification_service_impl.rs index 025bec3c37..89d79a5cfd 100644 --- a/src/infra/core/tests/tests/test_verification_service_impl.rs +++ b/src/infra/core/tests/tests/test_verification_service_impl.rs @@ -8,91 +8,59 @@ // by the Apache License, Version 2.0. 
use std::assert_matches::assert_matches; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use datafusion::arrow::array::{Array, Int32Array, StringArray}; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::arrow::record_batch::RecordBatch; -use dill::Component; use kamu::domain::*; -use kamu::testing::{MetadataFactory, MockDatasetActionAuthorizer, ParquetWriterHelper}; +use kamu::testing::{MetadataFactory, ParquetWriterHelper}; use kamu::*; -use kamu_accounts::CurrentAccountSubject; use opendatafabric::*; -use time_source::SystemTimeSourceDefault; -use super::test_pull_service_impl::TestTransformService; +use crate::BaseRepoHarness; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[tokio::test] async fn test_verify_data_consistency() { - let tempdir = tempfile::tempdir().unwrap(); - let datasets_dir = tempdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); + let harness = VerifyHarness::new(); - let dataset_alias = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + let foo_alias = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let bar_alias = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); - let catalog = dill::CatalogBuilder::new() - .add::() - .add_value(CurrentAccountSubject::new_test()) - .add_value( - MockDatasetActionAuthorizer::new().expect_check_read_dataset(&dataset_alias, 3, true), - ) - .bind::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) - .bind::() - .bind::() - .add_value(TestTransformService::new(Arc::new(Mutex::new(Vec::new())))) - .bind::() - .add::() - .build(); - - let verification_svc = catalog.get_one::().unwrap(); - let dataset_repo = catalog.get_one::().unwrap(); - let dataset_repo_writer = catalog.get_one::().unwrap(); - - dataset_repo_writer - .create_dataset_from_snapshot( - MetadataFactory::dataset_snapshot() - .name("foo") - .kind(DatasetKind::Root) - .push_event(MetadataFactory::set_polling_source().build()) - .push_event(MetadataFactory::set_data_schema().build()) - .build(), + let foo = harness.create_root_dataset(&foo_alias).await; + foo.dataset + .commit_event( + MetadataEvent::SetDataSchema(MetadataFactory::set_data_schema().build()), + CommitOpts::default(), ) .await .unwrap(); - dataset_repo_writer - .create_dataset_from_snapshot( - MetadataFactory::dataset_snapshot() - .name(dataset_alias.clone()) - .kind(DatasetKind::Derivative) - .push_event( - MetadataFactory::set_transform() - .inputs_from_refs(["foo"]) - .build(), - ) - .push_event(MetadataFactory::set_data_schema().build()) - .build(), + let bar = harness + .create_derived_dataset(&bar_alias, vec![foo_alias.as_local_ref()]) + .await; + bar.dataset + .commit_event( + MetadataEvent::SetDataSchema(MetadataFactory::set_data_schema().build()), + CommitOpts::default(), ) .await .unwrap(); assert_matches!( - verification_svc + harness + .verification_svc .verify( - &dataset_alias.as_local_ref(), - (None, None), - VerificationOptions { - check_integrity: true, - check_logical_hashes: true, - replay_transformations: false + VerificationRequest { + target: ResolvedDataset::from(&bar), + block_range: (None, None), + options: VerificationOptions { + check_integrity: true, + check_logical_hashes: true, + replay_transformations: false + }, }, None, ) @@ -112,7 +80,7 @@ async fn test_verify_data_consistency() { let b: Arc = Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])); 
let record_batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::clone(&a), Arc::clone(&b)]).unwrap(); - let data_path = tempdir.path().join("data"); + let data_path = harness.temp_dir_path().join("data"); ParquetWriterHelper::from_record_batch(&data_path, &record_batch).unwrap(); let data_logical_hash = @@ -121,12 +89,8 @@ async fn test_verify_data_consistency() { kamu_data_utils::data::hash::get_file_physical_hash(&data_path).unwrap(); // Commit data - let dataset = dataset_repo - .find_dataset_by_ref(&dataset_alias.as_local_ref()) - .await - .unwrap(); - - let head = dataset + let head = bar + .dataset .commit_add_data( AddDataParams { prev_checkpoint: None, @@ -144,7 +108,7 @@ async fn test_verify_data_consistency() { .new_head; assert_matches!( - dataset.as_metadata_chain().get_block(&head).await.unwrap(), + bar.dataset.as_metadata_chain().get_block(&head).await.unwrap(), MetadataBlock { event: MetadataEvent::AddData(AddData { new_data: Some(DataSlice { @@ -159,14 +123,17 @@ async fn test_verify_data_consistency() { // Check verification succeeds assert_matches!( - verification_svc + harness + .verification_svc .verify( - &dataset_alias.as_local_ref(), - (None, None), - VerificationOptions { - check_integrity: true, - check_logical_hashes: true, - replay_transformations: false + VerificationRequest { + target: ResolvedDataset::from(&bar), + block_range: (None, None), + options: VerificationOptions { + check_integrity: true, + check_logical_hashes: true, + replay_transformations: false + }, }, None, ) @@ -183,7 +150,7 @@ async fn test_verify_data_consistency() { RecordBatch::try_new(Arc::clone(&schema), vec![Arc::clone(&a), Arc::clone(&b)]).unwrap(); let local_data_path = kamu_data_utils::data::local_url::into_local_path( - dataset + bar.dataset .as_data_repo() .get_internal_url(&data_physical_hash) .await, @@ -194,10 +161,16 @@ async fn test_verify_data_consistency() { // Check verification fails assert_matches!( - verification_svc.verify( - &dataset_alias.as_local_ref(), - (None, None), - VerificationOptions {check_integrity: true, check_logical_hashes: true, replay_transformations: false}, + harness.verification_svc.verify( + VerificationRequest { + target: ResolvedDataset::from(&bar), + block_range: (None, None), + options: VerificationOptions { + check_integrity: true, + check_logical_hashes: true, + replay_transformations: false + }, + }, None, ).await, VerificationResult { @@ -213,3 +186,29 @@ async fn test_verify_data_consistency() { } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[oop::extend(BaseRepoHarness, base_repo_harness)] +struct VerifyHarness { + base_repo_harness: BaseRepoHarness, + verification_svc: Arc, +} + +impl VerifyHarness { + fn new() -> Self { + let base_repo_harness = BaseRepoHarness::new(TenancyConfig::SingleTenant); + + let catalog = dill::CatalogBuilder::new_chained(base_repo_harness.catalog()) + .add::() + .add::() + .add::() + .add::() + .build(); + + Self { + base_repo_harness, + verification_svc: catalog.get_one().unwrap(), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/test_watermark_service_impl.rs b/src/infra/core/tests/tests/test_watermark_service_impl.rs new file mode 100644 index 0000000000..59ebe448ae --- /dev/null +++ b/src/infra/core/tests/tests/test_watermark_service_impl.rs @@ -0,0 +1,192 @@ +// Copyright Kamu Data, Inc. 
and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::assert_matches::assert_matches; +use std::sync::Arc; + +use chrono::{DateTime, TimeZone, Utc}; +use kamu::{RemoteAliasesRegistryImpl, WatermarkServiceImpl}; +use kamu_core::{ + ResolvedDataset, + SetWatermarkError, + SetWatermarkResult, + TenancyConfig, + WatermarkService, +}; +use opendatafabric::{DatasetAlias, DatasetName}; + +use crate::BaseRepoHarness; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_no_watermark_initially() { + let harness = WatermarkTestHarness::new(TenancyConfig::SingleTenant); + + let foo = harness + .create_root_dataset(&DatasetAlias::new( + None, + DatasetName::try_from("foo").unwrap(), + )) + .await; + + assert_eq!( + harness.current_watermark(ResolvedDataset::from(&foo)).await, + None, + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_set_watermark() { + let harness = WatermarkTestHarness::new(TenancyConfig::SingleTenant); + + let foo = harness + .create_root_dataset(&DatasetAlias::new( + None, + DatasetName::try_from("foo").unwrap(), + )) + .await; + + assert_eq!(harness.num_blocks(ResolvedDataset::from(&foo)).await, 2); + + let watermark_1 = Utc.with_ymd_and_hms(2000, 1, 2, 0, 0, 0).unwrap(); + assert_matches!( + harness + .set_watermark(ResolvedDataset::from(&foo), watermark_1) + .await, + Ok(SetWatermarkResult::Updated { .. }) + ); + assert_eq!(harness.num_blocks(ResolvedDataset::from(&foo)).await, 3); + assert_eq!( + harness.current_watermark(ResolvedDataset::from(&foo)).await, + Some(watermark_1), + ); + + let watermark_2 = Utc.with_ymd_and_hms(2000, 1, 3, 0, 0, 0).unwrap(); + assert_matches!( + harness + .set_watermark(ResolvedDataset::from(&foo), watermark_2) + .await, + Ok(SetWatermarkResult::Updated { .. 
})
+    );
+    assert_eq!(harness.num_blocks(ResolvedDataset::from(&foo)).await, 4);
+    assert_eq!(
+        harness.current_watermark(ResolvedDataset::from(&foo)).await,
+        Some(watermark_2),
+    );
+
+    assert_matches!(
+        harness
+            .set_watermark(ResolvedDataset::from(&foo), watermark_2)
+            .await,
+        Ok(SetWatermarkResult::UpToDate)
+    );
+    assert_eq!(harness.num_blocks(ResolvedDataset::from(&foo)).await, 4);
+    assert_eq!(
+        harness.current_watermark(ResolvedDataset::from(&foo)).await,
+        Some(watermark_2),
+    );
+
+    let watermark_3 = Utc.with_ymd_and_hms(2000, 1, 2, 0, 0, 0).unwrap();
+    assert_matches!(
+        harness
+            .set_watermark(ResolvedDataset::from(&foo), watermark_3)
+            .await,
+        Ok(SetWatermarkResult::UpToDate)
+    );
+    assert_eq!(harness.num_blocks(ResolvedDataset::from(&foo)).await, 4);
+    assert_eq!(
+        harness.current_watermark(ResolvedDataset::from(&foo)).await,
+        Some(watermark_2),
+    );
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#[tokio::test]
+async fn test_set_watermark_rejects_on_derivative() {
+    let harness = WatermarkTestHarness::new(TenancyConfig::MultiTenant);
+
+    let root = harness
+        .create_root_dataset(&DatasetAlias::new(
+            None,
+            DatasetName::try_from("foo").unwrap(),
+        ))
+        .await;
+
+    let derived = harness
+        .create_derived_dataset(
+            &DatasetAlias::new(None, DatasetName::try_from("bar").unwrap()),
+            vec![root.dataset_handle.as_local_ref()],
+        )
+        .await;
+
+    assert_matches!(
+        harness
+            .set_watermark(
+                ResolvedDataset::from(&derived),
+                Utc.with_ymd_and_hms(2000, 1, 2, 0, 0, 0).unwrap()
+            )
+            .await,
+        Err(SetWatermarkError::IsDerivative)
+    );
+
+    assert_eq!(harness.num_blocks(ResolvedDataset::from(&derived)).await, 2);
+    assert_eq!(
+        harness
+            .current_watermark(ResolvedDataset::from(&derived))
+            .await,
+        None,
+    );
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#[oop::extend(BaseRepoHarness, base_repo_harness)]
+struct WatermarkTestHarness {
+    base_repo_harness: BaseRepoHarness,
+    watermark_svc: Arc<dyn WatermarkService>,
+}
+
+impl WatermarkTestHarness {
+    fn new(tenancy_config: TenancyConfig) -> Self {
+        let base_repo_harness = BaseRepoHarness::new(tenancy_config);
+
+        let catalog = dill::CatalogBuilder::new_chained(base_repo_harness.catalog())
+            .add::<RemoteAliasesRegistryImpl>()
+            .add::<WatermarkServiceImpl>()
+            .build();
+
+        Self {
+            base_repo_harness,
+            watermark_svc: catalog.get_one().unwrap(),
+        }
+    }
+
+    async fn set_watermark(
+        &self,
+        target: ResolvedDataset,
+        new_watermark: DateTime<Utc>,
+    ) -> Result<SetWatermarkResult, SetWatermarkError> {
+        self.watermark_svc
+            .set_watermark(target, new_watermark)
+            .await
+    }
+
+    async fn current_watermark(&self, target: ResolvedDataset) -> Option<DateTime<Utc>> {
+        self.watermark_svc
+            .try_get_current_watermark(target)
+            .await
+            .unwrap()
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/infra/core/tests/tests/use_cases/base_use_case_harness.rs b/src/infra/core/tests/tests/use_cases/base_use_case_harness.rs
new file mode 100644
index 0000000000..1573c09bbf
--- /dev/null
+++ b/src/infra/core/tests/tests/use_cases/base_use_case_harness.rs
@@ -0,0 +1,85 @@
+// Copyright Kamu Data, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+use dill::Catalog;
+use kamu::testing::MockDatasetActionAuthorizer;
+use kamu_core::auth::DatasetActionAuthorizer;
+use kamu_core::TenancyConfig;
+use messaging_outbox::{MockOutbox, Outbox};
+
+use crate::BaseRepoHarness;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+pub(crate) struct BaseUseCaseHarnessOptions {
+    tenancy_config: TenancyConfig,
+    mock_dataset_action_authorizer: MockDatasetActionAuthorizer,
+    mock_outbox: MockOutbox,
+}
+
+impl BaseUseCaseHarnessOptions {
+    pub(crate) fn new() -> Self {
+        Self::default()
+    }
+
+    pub(crate) fn with_authorizer(
+        mut self,
+        mock_dataset_action_authorizer: MockDatasetActionAuthorizer,
+    ) -> Self {
+        self.mock_dataset_action_authorizer = mock_dataset_action_authorizer;
+        self
+    }
+
+    pub(crate) fn with_outbox(mut self, mock_outbox: MockOutbox) -> Self {
+        self.mock_outbox = mock_outbox;
+        self
+    }
+}
+
+impl Default for BaseUseCaseHarnessOptions {
+    fn default() -> Self {
+        Self {
+            tenancy_config: TenancyConfig::SingleTenant,
+            mock_dataset_action_authorizer: MockDatasetActionAuthorizer::new(),
+            mock_outbox: MockOutbox::new(),
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#[oop::extend(BaseRepoHarness, base_repo_harness)]
+pub(crate) struct BaseUseCaseHarness {
+    base_repo_harness: BaseRepoHarness,
+    catalog: Catalog,
+}
+
+impl BaseUseCaseHarness {
+    pub(crate) fn new(options: BaseUseCaseHarnessOptions) -> Self {
+        let base_repo_harness = BaseRepoHarness::new(options.tenancy_config);
+
+        let catalog = dill::CatalogBuilder::new_chained(base_repo_harness.catalog())
+            .add_value(options.mock_dataset_action_authorizer)
+            .bind::<dyn DatasetActionAuthorizer, MockDatasetActionAuthorizer>()
+            .add_value(options.mock_outbox)
+            .bind::<dyn Outbox, MockOutbox>()
+            .build();
+
+        Self {
+            base_repo_harness,
+            catalog,
+        }
+    }
+
+    pub fn catalog(&self) -> &Catalog {
+        &self.catalog
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/infra/core/tests/tests/use_cases/mod.rs b/src/infra/core/tests/tests/use_cases/mod.rs
index 9a2a97d454..e95da20c60 100644
--- a/src/infra/core/tests/tests/use_cases/mod.rs
+++ b/src/infra/core/tests/tests/use_cases/mod.rs
@@ -9,7 +9,18 @@
 mod test_append_dataset_metadata_batch_use_case;
 mod test_commit_dataset_event_use_case;
+mod test_compact_dataset_use_case;
 mod test_create_dataset_from_snapshot_use_case;
 mod test_create_dataset_use_case;
 mod test_delete_dataset_use_case;
+mod test_pull_dataset_use_case;
+mod test_push_dataset_use_case;
 mod test_rename_dataset_use_case;
+mod test_reset_dataset_use_case;
+mod test_set_watermark_use_case;
+mod test_verify_dataset_use_case;
+
+mod base_use_case_harness;
+mod outbox_expectation_helpers;
+pub(crate) use base_use_case_harness::*;
+pub(crate) use outbox_expectation_helpers::*;
diff --git a/src/infra/core/tests/tests/use_cases/outbox_expectation_helpers.rs b/src/infra/core/tests/tests/use_cases/outbox_expectation_helpers.rs
new file mode 100644
index 0000000000..d319941541
--- /dev/null
+++ b/src/infra/core/tests/tests/use_cases/outbox_expectation_helpers.rs
@@ -0,0 +1,93 @@
+// Copyright Kamu Data, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+use kamu_core::{DatasetLifecycleMessage, MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE};
+use messaging_outbox::MockOutbox;
+use mockall::predicate::{always, eq, function};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+pub(crate) fn expect_outbox_dataset_created(mock_outbox: &mut MockOutbox, times: usize) {
+    mock_outbox
+        .expect_post_message_as_json()
+        .with(
+            eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE),
+            function(|message_as_json: &serde_json::Value| {
+                matches!(
+                    serde_json::from_value::<DatasetLifecycleMessage>(message_as_json.clone()),
+                    Ok(DatasetLifecycleMessage::Created(_))
+                )
+            }),
+            always(),
+        )
+        .times(times)
+        .returning(|_, _, _| Ok(()));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+pub(crate) fn expect_outbox_dataset_dependencies_updated(
+    mock_outbox: &mut MockOutbox,
+    times: usize,
+) {
+    mock_outbox
+        .expect_post_message_as_json()
+        .with(
+            eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE),
+            function(|message_as_json: &serde_json::Value| {
+                matches!(
+                    serde_json::from_value::<DatasetLifecycleMessage>(message_as_json.clone()),
+                    Ok(DatasetLifecycleMessage::DependenciesUpdated(_))
+                )
+            }),
+            always(),
+        )
+        .times(times)
+        .returning(|_, _, _| Ok(()));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+pub(crate) fn expect_outbox_dataset_renamed(mock_outbox: &mut MockOutbox, times: usize) {
+    mock_outbox
+        .expect_post_message_as_json()
+        .with(
+            eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE),
+            function(|message_as_json: &serde_json::Value| {
+                matches!(
+                    serde_json::from_value::<DatasetLifecycleMessage>(message_as_json.clone()),
+                    Ok(DatasetLifecycleMessage::Renamed(_))
+                )
+            }),
+            always(),
+        )
+        .times(times)
+        .returning(|_, _, _| Ok(()));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+pub(crate) fn expect_outbox_dataset_deleted(mock_outbox: &mut MockOutbox, times: usize) {
+    mock_outbox
+        .expect_post_message_as_json()
+        .with(
+            eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE),
+            function(|message_as_json: &serde_json::Value| {
+                matches!(
+                    serde_json::from_value::<DatasetLifecycleMessage>(message_as_json.clone()),
+                    Ok(DatasetLifecycleMessage::Deleted(_))
+                )
+            }),
+            always(),
+        )
+        .times(times)
+        .returning(|_, _, _| Ok(()));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/infra/core/tests/tests/use_cases/test_append_dataset_metadata_batch_use_case.rs b/src/infra/core/tests/tests/use_cases/test_append_dataset_metadata_batch_use_case.rs
index ffef5ff651..3a6b29a164 100644
--- a/src/infra/core/tests/tests/use_cases/test_append_dataset_metadata_batch_use_case.rs
+++ b/src/infra/core/tests/tests/use_cases/test_append_dataset_metadata_batch_use_case.rs
@@ -12,35 +12,14 @@ use std::collections::VecDeque;
 use std::sync::Arc;
 use chrono::Utc;
-use dill::{Catalog, Component};
 use kamu::testing::MetadataFactory;
-use kamu::{
-    AppendDatasetMetadataBatchUseCaseImpl,
-    DatasetRepositoryLocalFs,
-    DatasetRepositoryWriter,
-};
-use 
kamu_accounts::CurrentAccountSubject; -use kamu_core::{ - AppendDatasetMetadataBatchUseCase, - CreateDatasetResult, - DatasetLifecycleMessage, - DatasetRepository, - MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, -}; -use messaging_outbox::{MockOutbox, Outbox}; -use mockall::predicate::{eq, function}; -use opendatafabric::serde::flatbuffers::FlatbuffersMetadataBlockSerializer; -use opendatafabric::serde::MetadataBlockSerializer; -use opendatafabric::{ - DatasetAlias, - DatasetKind, - DatasetName, - MetadataBlock, - MetadataEvent, - Multicodec, - Multihash, -}; -use time_source::SystemTimeSourceDefault; +use kamu::AppendDatasetMetadataBatchUseCaseImpl; +use kamu_core::AppendDatasetMetadataBatchUseCase; +use messaging_outbox::MockOutbox; +use opendatafabric::*; + +use crate::tests::use_cases::*; +use crate::BaseRepoHarness; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -51,29 +30,23 @@ async fn test_append_dataset_metadata_batch() { let mock_outbox = MockOutbox::new(); let harness = AppendDatasetMetadataBatchUseCaseHarness::new(mock_outbox); - let create_result_foo = harness.create_dataset(&alias_foo, DatasetKind::Root).await; - - let foo_dataset = harness - .dataset_repo - .get_dataset_by_handle(&create_result_foo.dataset_handle); + let foo = harness.create_root_dataset(&alias_foo).await; let set_info_block = MetadataBlock { system_time: Utc::now(), - prev_block_hash: Some(create_result_foo.head.clone()), - sequence_number: 1, + prev_block_hash: Some(foo.head.clone()), + sequence_number: 2, event: MetadataEvent::SetInfo(MetadataFactory::set_info().description("test").build()), }; - let hash_set_info_block = - AppendDatasetMetadataBatchUseCaseHarness::hash_from_block(&set_info_block); + let hash_set_info_block = BaseRepoHarness::hash_from_block(&set_info_block); let set_license_block = MetadataBlock { system_time: Utc::now(), prev_block_hash: Some(hash_set_info_block.clone()), - sequence_number: 2, + sequence_number: 3, event: MetadataEvent::SetLicense(MetadataFactory::set_license().build()), }; - let hash_set_license_block = - AppendDatasetMetadataBatchUseCaseHarness::hash_from_block(&set_license_block); + let hash_set_license_block = BaseRepoHarness::hash_from_block(&set_license_block); let new_blocks = VecDeque::from([ (hash_set_info_block, set_info_block), @@ -82,7 +55,7 @@ async fn test_append_dataset_metadata_batch() { let res = harness .use_case - .execute(foo_dataset.as_ref(), new_blocks, false) + .execute(foo.dataset.as_ref(), new_blocks, false) .await; assert_matches!(res, Ok(_)); } @@ -95,137 +68,61 @@ async fn test_append_dataset_metadata_batch_with_new_dependencies() { let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); let mut mock_outbox = MockOutbox::new(); - AppendDatasetMetadataBatchUseCaseHarness::add_outbox_dataset_dependencies_updated_expectation( - &mut mock_outbox, - 1, - ); + expect_outbox_dataset_dependencies_updated(&mut mock_outbox, 1); let harness = AppendDatasetMetadataBatchUseCaseHarness::new(mock_outbox); - let create_result_foo = harness.create_dataset(&alias_foo, DatasetKind::Root).await; - let create_result_bar = harness - .create_dataset(&alias_bar, DatasetKind::Derivative) + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness + .create_derived_dataset(&alias_bar, vec![foo.dataset_handle.as_local_ref()]) .await; - let bar_dataset = harness - .dataset_repo - .get_dataset_by_handle(&create_result_bar.dataset_handle); - let 
set_transform_block = MetadataBlock { system_time: Utc::now(), - prev_block_hash: Some(create_result_bar.head.clone()), - sequence_number: 1, + prev_block_hash: Some(bar.head.clone()), + sequence_number: 2, event: MetadataEvent::SetTransform( MetadataFactory::set_transform() - .inputs_from_refs_and_aliases(vec![( - create_result_foo.dataset_handle.id, - alias_foo.to_string(), - )]) + .inputs_from_refs_and_aliases(vec![(foo.dataset_handle.id, alias_foo.to_string())]) .build(), ), }; - let hash_set_transform_block = - AppendDatasetMetadataBatchUseCaseHarness::hash_from_block(&set_transform_block); + let hash_set_transform_block = BaseRepoHarness::hash_from_block(&set_transform_block); let new_blocks = VecDeque::from([(hash_set_transform_block, set_transform_block)]); let res = harness .use_case - .execute(bar_dataset.as_ref(), new_blocks, false) + .execute(bar.dataset.as_ref(), new_blocks, false) .await; assert_matches!(res, Ok(_)); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[oop::extend(BaseUseCaseHarness, base_harness)] struct AppendDatasetMetadataBatchUseCaseHarness { - _temp_dir: tempfile::TempDir, - catalog: Catalog, - dataset_repo: Arc, + base_harness: BaseUseCaseHarness, use_case: Arc, } impl AppendDatasetMetadataBatchUseCaseHarness { fn new(mock_outbox: MockOutbox) -> Self { - let tempdir = tempfile::tempdir().unwrap(); + let base_harness = + BaseUseCaseHarness::new(BaseUseCaseHarnessOptions::new().with_outbox(mock_outbox)); - let datasets_dir = tempdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); - - let catalog = dill::CatalogBuilder::new() + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) - .bind::() - .bind::() - .add_value(CurrentAccountSubject::new_test()) - .add::() - .add_value(mock_outbox) - .bind::() .build(); let use_case = catalog .get_one::() .unwrap(); - let dataset_repo = catalog.get_one::().unwrap(); - Self { - _temp_dir: tempdir, - catalog, + base_harness, use_case, - dataset_repo, } } - - async fn create_dataset(&self, alias: &DatasetAlias, kind: DatasetKind) -> CreateDatasetResult { - let snapshot = MetadataFactory::dataset_snapshot() - .name(alias.clone()) - .kind(kind) - .build(); - - let dataset_repo_writer = self - .catalog - .get_one::() - .unwrap(); - - let result = dataset_repo_writer - .create_dataset_from_snapshot(snapshot) - .await - .unwrap(); - - result.create_dataset_result - } - - fn hash_from_block(block: &MetadataBlock) -> Multihash { - let block_data = FlatbuffersMetadataBlockSerializer - .write_manifest(block) - .unwrap(); - - Multihash::from_digest::(Multicodec::Sha3_256, &block_data) - } - - fn add_outbox_dataset_dependencies_updated_expectation( - mock_outbox: &mut MockOutbox, - times: usize, - ) { - mock_outbox - .expect_post_message_as_json() - .with( - eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE), - function(|message_as_json: &serde_json::Value| { - matches!( - serde_json::from_value::(message_as_json.clone()), - Ok(DatasetLifecycleMessage::DependenciesUpdated(_)) - ) - }), - eq(1), - ) - .times(times) - .returning(|_, _, _| Ok(())); - } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_commit_dataset_event_use_case.rs 
b/src/infra/core/tests/tests/use_cases/test_commit_dataset_event_use_case.rs index 98e9e7c390..d68d41c077 100644 --- a/src/infra/core/tests/tests/use_cases/test_commit_dataset_event_use_case.rs +++ b/src/infra/core/tests/tests/use_cases/test_commit_dataset_event_use_case.rs @@ -10,24 +10,13 @@ use std::assert_matches::assert_matches; use std::sync::Arc; -use dill::{Catalog, Component}; use kamu::testing::{MetadataFactory, MockDatasetActionAuthorizer}; -use kamu::{CommitDatasetEventUseCaseImpl, DatasetRepositoryLocalFs, DatasetRepositoryWriter}; -use kamu_accounts::CurrentAccountSubject; -use kamu_core::auth::DatasetActionAuthorizer; -use kamu_core::{ - CommitDatasetEventUseCase, - CommitError, - CommitOpts, - CreateDatasetResult, - DatasetLifecycleMessage, - DatasetRepository, - MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, -}; -use messaging_outbox::{MockOutbox, Outbox}; -use mockall::predicate::{eq, function}; -use opendatafabric::{DatasetAlias, DatasetKind, DatasetName, MetadataEvent}; -use time_source::SystemTimeSourceDefault; +use kamu::CommitDatasetEventUseCaseImpl; +use kamu_core::{CommitDatasetEventUseCase, CommitError, CommitOpts}; +use messaging_outbox::MockOutbox; +use opendatafabric::{DatasetAlias, DatasetName, MetadataEvent}; + +use crate::tests::use_cases::*; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -41,12 +30,12 @@ async fn test_commit_dataset_event() { let mock_outbox = MockOutbox::new(); let harness = CommitDatasetEventUseCaseHarness::new(mock_authorizer, mock_outbox); - let create_result_foo = harness.create_dataset(&alias_foo, DatasetKind::Root).await; + let foo = harness.create_root_dataset(&alias_foo).await; let res = harness .use_case .execute( - &create_result_foo.dataset_handle, + &foo.dataset_handle, MetadataEvent::SetInfo(MetadataFactory::set_info().description("test").build()), CommitOpts::default(), ) @@ -66,12 +55,12 @@ async fn test_commit_event_unauthorized() { let mock_outbox = MockOutbox::new(); let harness = CommitDatasetEventUseCaseHarness::new(mock_authorizer, mock_outbox); - let create_result_foo = harness.create_dataset(&alias_foo, DatasetKind::Root).await; + let foo = harness.create_root_dataset(&alias_foo).await; let res = harness .use_case .execute( - &create_result_foo.dataset_handle, + &foo.dataset_handle, MetadataEvent::SetInfo(MetadataFactory::set_info().description("test").build()), CommitOpts::default(), ) @@ -90,25 +79,22 @@ async fn test_commit_event_with_new_dependencies() { MockDatasetActionAuthorizer::new().expect_check_write_dataset(&alias_bar, 1, true); let mut mock_outbox = MockOutbox::new(); - CommitDatasetEventUseCaseHarness::add_outbox_dataset_dependencies_updated_expectation( - &mut mock_outbox, - 1, - ); + expect_outbox_dataset_dependencies_updated(&mut mock_outbox, 1); let harness = CommitDatasetEventUseCaseHarness::new(mock_authorizer, mock_outbox); - let create_result_foo = harness.create_dataset(&alias_foo, DatasetKind::Root).await; - let create_result_bar = harness - .create_dataset(&alias_bar, DatasetKind::Derivative) + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness + .create_derived_dataset(&alias_bar, vec![foo.dataset_handle.as_local_ref()]) .await; let res = harness .use_case .execute( - &create_result_bar.dataset_handle, + &bar.dataset_handle, MetadataEvent::SetTransform( MetadataFactory::set_transform() .inputs_from_refs_and_aliases(vec![( - create_result_foo.dataset_handle.id, + foo.dataset_handle.id, 
alias_foo.to_string(), )]) .build(), @@ -121,9 +107,9 @@ async fn test_commit_event_with_new_dependencies() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[oop::extend(BaseUseCaseHarness, base_harness)] struct CommitDatasetEventUseCaseHarness { - _temp_dir: tempfile::TempDir, - catalog: Catalog, + base_harness: BaseUseCaseHarness, use_case: Arc, } @@ -132,75 +118,23 @@ impl CommitDatasetEventUseCaseHarness { mock_dataset_action_authorizer: MockDatasetActionAuthorizer, mock_outbox: MockOutbox, ) -> Self { - let tempdir = tempfile::tempdir().unwrap(); - - let datasets_dir = tempdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); + let base_harness = BaseUseCaseHarness::new( + BaseUseCaseHarnessOptions::new() + .with_authorizer(mock_dataset_action_authorizer) + .with_outbox(mock_outbox), + ); - let catalog = dill::CatalogBuilder::new() + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) - .bind::() - .bind::() - .add_value(CurrentAccountSubject::new_test()) - .add_value(mock_dataset_action_authorizer) - .bind::() - .add::() - .add_value(mock_outbox) - .bind::() .build(); let use_case = catalog.get_one::().unwrap(); Self { - _temp_dir: tempdir, - catalog, + base_harness, use_case, } } - - async fn create_dataset(&self, alias: &DatasetAlias, kind: DatasetKind) -> CreateDatasetResult { - let snapshot = MetadataFactory::dataset_snapshot() - .name(alias.clone()) - .kind(kind) - .build(); - - let dataset_repo_writer = self - .catalog - .get_one::() - .unwrap(); - - let result = dataset_repo_writer - .create_dataset_from_snapshot(snapshot) - .await - .unwrap(); - - result.create_dataset_result - } - - fn add_outbox_dataset_dependencies_updated_expectation( - mock_outbox: &mut MockOutbox, - times: usize, - ) { - mock_outbox - .expect_post_message_as_json() - .with( - eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE), - function(|message_as_json: &serde_json::Value| { - matches!( - serde_json::from_value::(message_as_json.clone()), - Ok(DatasetLifecycleMessage::DependenciesUpdated(_)) - ) - }), - eq(1), - ) - .times(times) - .returning(|_, _, _| Ok(())); - } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_compact_dataset_use_case.rs b/src/infra/core/tests/tests/use_cases/test_compact_dataset_use_case.rs new file mode 100644 index 0000000000..4a1ffba017 --- /dev/null +++ b/src/infra/core/tests/tests/use_cases/test_compact_dataset_use_case.rs @@ -0,0 +1,189 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::assert_matches::assert_matches; +use std::sync::Arc; + +use kamu::testing::MockDatasetActionAuthorizer; +use kamu::*; +use kamu_core::*; +use opendatafabric::{DatasetAlias, DatasetName}; + +use crate::tests::use_cases::*; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_compact_dataset_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let harness = CompactUseCaseHarness::new( + MockDatasetActionAuthorizer::new().expect_check_write_dataset(&alias_foo, 1, true), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + + assert_matches!( + harness.compact_dataset(ResolvedDataset::from(&foo)).await, + Ok(_) + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_compact_multiple_datasets_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + + let harness = CompactUseCaseHarness::new( + MockDatasetActionAuthorizer::new() + .expect_check_write_dataset(&alias_foo, 1, true) + .expect_check_write_dataset(&alias_bar, 1, true), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness.create_root_dataset(&alias_bar).await; + + let mut responses = harness + .compact_datasets(vec![ + ResolvedDataset::from(&foo), + ResolvedDataset::from(&bar), + ]) + .await; + + assert_eq!(responses.len(), 2); + let response_bar = responses.remove(1); + let response_foo = responses.remove(0); + + assert_matches!( + response_foo, + CompactionResponse { + result: Ok(_), + dataset_ref + } if dataset_ref == foo.dataset_handle.as_local_ref()); + assert_matches!( + response_bar, + CompactionResponse { + result: Ok(_), + dataset_ref + } if dataset_ref == bar.dataset_handle.as_local_ref() + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_compact_dataset_unauthorized() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let harness = CompactUseCaseHarness::new( + MockDatasetActionAuthorizer::new().expect_check_write_dataset(&alias_foo, 1, false), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + + assert_matches!( + harness.compact_dataset(ResolvedDataset::from(&foo)).await, + Err(CompactionError::Access(_)) + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_compact_dataset_mixed_authorization_outcome() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + + let harness = CompactUseCaseHarness::new( + MockDatasetActionAuthorizer::new() + .expect_check_write_dataset(&alias_foo, 1, false) + .expect_check_write_dataset(&alias_bar, 1, true), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness.create_root_dataset(&alias_bar).await; + + let mut responses = harness + .compact_datasets(vec![ + ResolvedDataset::from(&foo), + ResolvedDataset::from(&bar), + ]) + .await; + + assert_eq!(responses.len(), 2); + let response_bar = responses.remove(1); + let response_foo = responses.remove(0); + + assert_matches!( 
+ response_foo, + CompactionResponse { + result: Err(CompactionError::Access(_)), + dataset_ref + } if dataset_ref == foo.dataset_handle.as_local_ref()); + assert_matches!( + response_bar, + CompactionResponse { + result: Ok(_), + dataset_ref + } if dataset_ref == bar.dataset_handle.as_local_ref() + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[oop::extend(BaseUseCaseHarness, base_harness)] +struct CompactUseCaseHarness { + base_harness: BaseUseCaseHarness, + use_case: Arc, +} + +impl CompactUseCaseHarness { + fn new(mock_dataset_action_authorizer: MockDatasetActionAuthorizer) -> Self { + let base_harness = BaseUseCaseHarness::new( + BaseUseCaseHarnessOptions::new().with_authorizer(mock_dataset_action_authorizer), + ); + + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) + .add::() + .add::() + .add::() + .build(); + + let use_case = catalog.get_one::().unwrap(); + + Self { + base_harness, + use_case, + } + } + + async fn compact_dataset( + &self, + target: ResolvedDataset, + ) -> Result { + self.use_case + .execute(target.get_handle(), CompactionOptions::default(), None) + .await + } + + async fn compact_datasets(&self, targets: Vec) -> Vec { + let handles: Vec<_> = targets + .into_iter() + .map(ResolvedDataset::take_handle) + .collect(); + + self.use_case + .execute_multi(handles, CompactionOptions::default(), None) + .await + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_create_dataset_from_snapshot_use_case.rs b/src/infra/core/tests/tests/use_cases/test_create_dataset_from_snapshot_use_case.rs index 0724b2a06d..d44b77a180 100644 --- a/src/infra/core/tests/tests/use_cases/test_create_dataset_from_snapshot_use_case.rs +++ b/src/infra/core/tests/tests/use_cases/test_create_dataset_from_snapshot_use_case.rs @@ -10,25 +10,13 @@ use std::assert_matches::assert_matches; use std::sync::Arc; -use dill::{Catalog, Component}; use kamu::testing::MetadataFactory; -use kamu::{ - CreateDatasetFromSnapshotUseCaseImpl, - DatasetRepositoryLocalFs, - DatasetRepositoryWriter, -}; -use kamu_accounts::CurrentAccountSubject; -use kamu_core::{ - CreateDatasetFromSnapshotUseCase, - DatasetLifecycleMessage, - DatasetRepository, - GetDatasetError, - MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, -}; -use messaging_outbox::{MockOutbox, Outbox}; -use mockall::predicate::{eq, function}; +use kamu::CreateDatasetFromSnapshotUseCaseImpl; +use kamu_core::CreateDatasetFromSnapshotUseCase; +use messaging_outbox::MockOutbox; use opendatafabric::{DatasetAlias, DatasetKind, DatasetName}; -use time_source::SystemTimeSourceDefault; + +use crate::tests::use_cases::*; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -38,7 +26,7 @@ async fn test_create_root_dataset_from_snapshot() { // Expect only DatasetCreated message for "foo" let mut mock_outbox = MockOutbox::new(); - CreateFromSnapshotUseCaseHarness::add_outbox_dataset_created_expectation(&mut mock_outbox, 1); + expect_outbox_dataset_created(&mut mock_outbox, 1); let harness = CreateFromSnapshotUseCaseHarness::new(mock_outbox); @@ -65,11 +53,8 @@ async fn test_create_derived_dataset_from_snapshot() { // Expect DatasetCreated messages for "foo" and "bar" // Expect DatasetDependenciesUpdated message for "bar" let mut mock_outbox = MockOutbox::new(); - 
CreateFromSnapshotUseCaseHarness::add_outbox_dataset_created_expectation(&mut mock_outbox, 2); - CreateFromSnapshotUseCaseHarness::add_outbox_dataset_dependencies_updated_expectation( - &mut mock_outbox, - 1, - ); + expect_outbox_dataset_created(&mut mock_outbox, 2); + expect_outbox_dataset_dependencies_updated(&mut mock_outbox, 1); let harness = CreateFromSnapshotUseCaseHarness::new(mock_outbox); @@ -108,87 +93,28 @@ async fn test_create_derived_dataset_from_snapshot() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[oop::extend(BaseUseCaseHarness, base_harness)] struct CreateFromSnapshotUseCaseHarness { - _temp_dir: tempfile::TempDir, - catalog: Catalog, + base_harness: BaseUseCaseHarness, use_case: Arc, } impl CreateFromSnapshotUseCaseHarness { fn new(mock_outbox: MockOutbox) -> Self { - let tempdir = tempfile::tempdir().unwrap(); - - let datasets_dir = tempdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); - - let mut b = dill::CatalogBuilder::new(); + let base_harness = + BaseUseCaseHarness::new(BaseUseCaseHarnessOptions::new().with_outbox(mock_outbox)); - b.add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) - .bind::() - .bind::() - .add_value(CurrentAccountSubject::new_test()) - .add_value(mock_outbox) - .bind::() - .add::(); + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) + .add::() + .build(); - let catalog = b.build(); + let use_case = catalog.get_one().unwrap(); Self { - _temp_dir: tempdir, - use_case: catalog.get_one().unwrap(), - catalog, + base_harness, + use_case, } } - - async fn check_dataset_exists(&self, alias: &DatasetAlias) -> Result<(), GetDatasetError> { - let dataset_repo = self.catalog.get_one::().unwrap(); - dataset_repo - .find_dataset_by_ref(&alias.as_local_ref()) - .await?; - Ok(()) - } - - fn add_outbox_dataset_created_expectation(mock_outbox: &mut MockOutbox, times: usize) { - mock_outbox - .expect_post_message_as_json() - .with( - eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE), - function(|message_as_json: &serde_json::Value| { - matches!( - serde_json::from_value::(message_as_json.clone()), - Ok(DatasetLifecycleMessage::Created(_)) - ) - }), - eq(1), - ) - .times(times) - .returning(|_, _, _| Ok(())); - } - - fn add_outbox_dataset_dependencies_updated_expectation( - mock_outbox: &mut MockOutbox, - times: usize, - ) { - mock_outbox - .expect_post_message_as_json() - .with( - eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE), - function(|message_as_json: &serde_json::Value| { - matches!( - serde_json::from_value::(message_as_json.clone()), - Ok(DatasetLifecycleMessage::DependenciesUpdated(_)) - ) - }), - eq(1), - ) - .times(times) - .returning(|_, _, _| Ok(())); - } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_create_dataset_use_case.rs b/src/infra/core/tests/tests/use_cases/test_create_dataset_use_case.rs index 04e2fd043c..1dd101e6be 100644 --- a/src/infra/core/tests/tests/use_cases/test_create_dataset_use_case.rs +++ b/src/infra/core/tests/tests/use_cases/test_create_dataset_use_case.rs @@ -10,21 +10,13 @@ use std::assert_matches::assert_matches; use std::sync::Arc; -use dill::{Catalog, Component}; use kamu::testing::MetadataFactory; -use kamu::{CreateDatasetUseCaseImpl, DatasetRepositoryLocalFs, DatasetRepositoryWriter}; -use 
kamu_accounts::CurrentAccountSubject; -use kamu_core::{ - CreateDatasetUseCase, - DatasetLifecycleMessage, - DatasetRepository, - GetDatasetError, - MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, -}; -use messaging_outbox::{MockOutbox, Outbox}; -use mockall::predicate::{eq, function}; +use kamu::CreateDatasetUseCaseImpl; +use kamu_core::CreateDatasetUseCase; +use messaging_outbox::MockOutbox; use opendatafabric::{DatasetAlias, DatasetKind, DatasetName}; -use time_source::SystemTimeSourceDefault; + +use crate::tests::use_cases::*; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -33,7 +25,7 @@ async fn test_create_root_dataset() { let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); let mut mock_outbox = MockOutbox::new(); - CreateUseCaseHarness::add_outbox_dataset_created_expectation(&mut mock_outbox, 1); + expect_outbox_dataset_created(&mut mock_outbox, 1); let harness = CreateUseCaseHarness::new(mock_outbox); @@ -53,67 +45,28 @@ async fn test_create_root_dataset() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[oop::extend(BaseUseCaseHarness, base_harness)] struct CreateUseCaseHarness { - _temp_dir: tempfile::TempDir, - catalog: Catalog, + base_harness: BaseUseCaseHarness, use_case: Arc, } impl CreateUseCaseHarness { fn new(mock_outbox: MockOutbox) -> Self { - let tempdir = tempfile::tempdir().unwrap(); - - let datasets_dir = tempdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); + let base_harness = + BaseUseCaseHarness::new(BaseUseCaseHarnessOptions::new().with_outbox(mock_outbox)); - let catalog = dill::CatalogBuilder::new() + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) - .bind::() - .bind::() - .add_value(CurrentAccountSubject::new_test()) - .add::() - .add_value(mock_outbox) - .bind::() .build(); let use_case = catalog.get_one::().unwrap(); Self { - _temp_dir: tempdir, - catalog, + base_harness, use_case, } } - - async fn check_dataset_exists(&self, alias: &DatasetAlias) -> Result<(), GetDatasetError> { - let dataset_repo = self.catalog.get_one::().unwrap(); - dataset_repo - .find_dataset_by_ref(&alias.as_local_ref()) - .await?; - Ok(()) - } - - fn add_outbox_dataset_created_expectation(mock_outbox: &mut MockOutbox, times: usize) { - mock_outbox - .expect_post_message_as_json() - .with( - eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE), - function(|message_as_json: &serde_json::Value| { - matches!( - serde_json::from_value::(message_as_json.clone()), - Ok(DatasetLifecycleMessage::Created(_)) - ) - }), - eq(1), - ) - .times(times) - .returning(|_, _, _| Ok(())); - } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_delete_dataset_use_case.rs b/src/infra/core/tests/tests/use_cases/test_delete_dataset_use_case.rs index b027f142b6..cdb5acd0c4 100644 --- a/src/infra/core/tests/tests/use_cases/test_delete_dataset_use_case.rs +++ b/src/infra/core/tests/tests/use_cases/test_delete_dataset_use_case.rs @@ -10,31 +10,25 @@ use std::assert_matches::assert_matches; use std::sync::Arc; -use dill::{Catalog, Component}; -use kamu::testing::{MetadataFactory, MockDatasetActionAuthorizer}; +use dill::Catalog; +use 
kamu::testing::MockDatasetActionAuthorizer; use kamu::{ - DatasetRepositoryLocalFs, - DatasetRepositoryWriter, DeleteDatasetUseCaseImpl, DependencyGraphRepositoryInMemory, DependencyGraphServiceInMemory, }; -use kamu_accounts::CurrentAccountSubject; -use kamu_core::auth::DatasetActionAuthorizer; use kamu_core::{ - CreateDatasetResult, DatasetLifecycleMessage, DatasetRepository, DeleteDatasetError, DeleteDatasetUseCase, DependencyGraphService, GetDatasetError, - MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, }; -use messaging_outbox::{consume_deserialized_message, ConsumerFilter, Message, MockOutbox, Outbox}; -use mockall::predicate::{eq, function}; -use opendatafabric::{DatasetAlias, DatasetKind, DatasetName, DatasetRef}; -use time_source::SystemTimeSourceDefault; +use messaging_outbox::{consume_deserialized_message, ConsumerFilter, Message, MockOutbox}; +use opendatafabric::{DatasetAlias, DatasetName}; + +use crate::tests::use_cases::*; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -43,7 +37,7 @@ async fn test_delete_dataset_success_via_ref() { let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); let mut mock_outbox = MockOutbox::new(); - DeleteUseCaseHarness::add_outbox_dataset_deleted_expectation(&mut mock_outbox, 1); + expect_outbox_dataset_deleted(&mut mock_outbox, 1); let mock_authorizer = MockDatasetActionAuthorizer::new().expect_check_write_dataset(&alias_foo, 1, true); @@ -72,19 +66,19 @@ async fn test_delete_dataset_success_via_handle() { let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); let mut mock_outbox = MockOutbox::new(); - DeleteUseCaseHarness::add_outbox_dataset_deleted_expectation(&mut mock_outbox, 1); + expect_outbox_dataset_deleted(&mut mock_outbox, 1); let mock_authorizer = MockDatasetActionAuthorizer::new().expect_check_write_dataset(&alias_foo, 1, true); let harness = DeleteUseCaseHarness::new(mock_authorizer, mock_outbox); - let create_result_foo = harness.create_root_dataset(&alias_foo).await; + let foo = harness.create_root_dataset(&alias_foo).await; harness.dependencies_eager_initialization().await; harness .use_case - .execute_via_handle(&create_result_foo.dataset_handle) + .execute_via_handle(&foo.dataset_handle) .await .unwrap(); @@ -121,13 +115,13 @@ async fn test_delete_unauthorized() { MockOutbox::new(), ); - let create_result_foo = harness.create_root_dataset(&alias_foo).await; + let foo = harness.create_root_dataset(&alias_foo).await; harness.dependencies_eager_initialization().await; assert_matches!( harness .use_case - .execute_via_handle(&create_result_foo.dataset_handle) + .execute_via_handle(&foo.dataset_handle) .await, Err(DeleteDatasetError::Access(_)) ); @@ -143,19 +137,19 @@ async fn test_delete_dataset_respects_dangling_refs() { let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); let mut mock_outbox = MockOutbox::new(); - DeleteUseCaseHarness::add_outbox_dataset_deleted_expectation(&mut mock_outbox, 2); + expect_outbox_dataset_deleted(&mut mock_outbox, 2); let harness = DeleteUseCaseHarness::new(MockDatasetActionAuthorizer::allowing(), mock_outbox); - let create_result_root = harness.create_root_dataset(&alias_foo).await; - let create_result_derived = harness + let root = harness.create_root_dataset(&alias_foo).await; + let derived = harness .create_derived_dataset(&alias_bar, vec![alias_foo.as_local_ref()]) .await; harness.dependencies_eager_initialization().await; assert_matches!( - 
harness.use_case.execute_via_handle(&create_result_root.dataset_handle).await, - Err(DeleteDatasetError::DanglingReference(e)) if e.children == vec![create_result_derived.dataset_handle.clone()] + harness.use_case.execute_via_handle(&root.dataset_handle).await, + Err(DeleteDatasetError::DanglingReference(e)) if e.children == vec![derived.dataset_handle.clone()] ); assert_matches!(harness.check_dataset_exists(&alias_foo).await, Ok(_)); @@ -163,14 +157,12 @@ async fn test_delete_dataset_respects_dangling_refs() { harness .use_case - .execute_via_handle(&create_result_derived.dataset_handle) + .execute_via_handle(&derived.dataset_handle) .await .unwrap(); harness - .consume_message(DatasetLifecycleMessage::deleted( - create_result_derived.dataset_handle.id, - )) + .consume_message(DatasetLifecycleMessage::deleted(derived.dataset_handle.id)) .await; assert_matches!(harness.check_dataset_exists(&alias_foo).await, Ok(_)); @@ -181,14 +173,12 @@ async fn test_delete_dataset_respects_dangling_refs() { harness .use_case - .execute_via_handle(&create_result_root.dataset_handle) + .execute_via_handle(&root.dataset_handle) .await .unwrap(); harness - .consume_message(DatasetLifecycleMessage::deleted( - create_result_root.dataset_handle.id, - )) + .consume_message(DatasetLifecycleMessage::deleted(root.dataset_handle.id)) .await; assert_matches!( @@ -203,9 +193,12 @@ async fn test_delete_dataset_respects_dangling_refs() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[oop::extend(BaseUseCaseHarness, base_harness)] struct DeleteUseCaseHarness { - _temp_dir: tempfile::TempDir, + base_harness: BaseUseCaseHarness, catalog: Catalog, + dependency_graph_service: Arc, + dataset_repo: Arc, use_case: Arc, } @@ -214,102 +207,35 @@ impl DeleteUseCaseHarness { mock_dataset_action_authorizer: MockDatasetActionAuthorizer, mock_outbox: MockOutbox, ) -> Self { - let tempdir = tempfile::tempdir().unwrap(); - - let datasets_dir = tempdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); + let base_harness = BaseUseCaseHarness::new( + BaseUseCaseHarnessOptions::new() + .with_authorizer(mock_dataset_action_authorizer) + .with_outbox(mock_outbox), + ); - let catalog = dill::CatalogBuilder::new() + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) - .bind::() - .bind::() - .add_value(CurrentAccountSubject::new_test()) .add::() - .add_value(mock_dataset_action_authorizer) - .bind::() - .add::() - .add_value(mock_outbox) - .bind::() .build(); - let use_case = catalog.get_one::().unwrap(); + let dependency_graph_service = catalog.get_one().unwrap(); + let use_case = catalog.get_one().unwrap(); + let dataset_repo = catalog.get_one().unwrap(); Self { - _temp_dir: tempdir, + base_harness, catalog, + dependency_graph_service, + dataset_repo, use_case, } } - async fn create_root_dataset(&self, alias: &DatasetAlias) -> CreateDatasetResult { - let snapshot = MetadataFactory::dataset_snapshot() - .name(alias.clone()) - .kind(DatasetKind::Root) - .push_event(MetadataFactory::set_polling_source().build()) - .build(); - - let dataset_repo_writer = self - .catalog - .get_one::() - .unwrap(); - - let result = dataset_repo_writer - .create_dataset_from_snapshot(snapshot) - .await - .unwrap(); - - result.create_dataset_result - } - - async fn create_derived_dataset( - &self, - alias: &DatasetAlias, - 
input_dataset_refs: Vec, - ) -> CreateDatasetResult { - let dataset_repo_writer = self - .catalog - .get_one::() - .unwrap(); - - dataset_repo_writer - .create_dataset_from_snapshot( - MetadataFactory::dataset_snapshot() - .name(alias.clone()) - .kind(DatasetKind::Derivative) - .push_event( - MetadataFactory::set_transform() - .inputs_from_refs(input_dataset_refs) - .build(), - ) - .build(), - ) - .await - .unwrap() - .create_dataset_result - } - - async fn check_dataset_exists(&self, alias: &DatasetAlias) -> Result<(), GetDatasetError> { - let dataset_repo = self.catalog.get_one::().unwrap(); - dataset_repo - .find_dataset_by_ref(&alias.as_local_ref()) - .await?; - Ok(()) - } - async fn dependencies_eager_initialization(&self) { - let dependency_graph_service = self - .catalog - .get_one::() - .unwrap(); - let dataset_repo = self.catalog.get_one::().unwrap(); - - dependency_graph_service - .eager_initialization(&DependencyGraphRepositoryInMemory::new(dataset_repo)) + self.dependency_graph_service + .eager_initialization(&DependencyGraphRepositoryInMemory::new( + self.dataset_repo.clone(), + )) .await .unwrap(); } @@ -325,23 +251,6 @@ impl DeleteUseCaseHarness { .await .unwrap(); } - - fn add_outbox_dataset_deleted_expectation(mock_outbox: &mut MockOutbox, times: usize) { - mock_outbox - .expect_post_message_as_json() - .with( - eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE), - function(|message_as_json: &serde_json::Value| { - matches!( - serde_json::from_value::(message_as_json.clone()), - Ok(DatasetLifecycleMessage::Deleted(_)) - ) - }), - eq(1), - ) - .times(times) - .returning(|_, _, _| Ok(())); - } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_pull_dataset_use_case.rs b/src/infra/core/tests/tests/use_cases/test_pull_dataset_use_case.rs new file mode 100644 index 0000000000..4ff453b7ec --- /dev/null +++ b/src/infra/core/tests/tests/use_cases/test_pull_dataset_use_case.rs @@ -0,0 +1,803 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::assert_matches::assert_matches; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use kamu::testing::*; +use kamu::*; +use kamu_core::auth::{DatasetAction, DummyOdfServerAccessTokenResolver}; +use kamu_core::*; +use opendatafabric::*; +use tempfile::TempDir; +use url::Url; + +use super::{BaseUseCaseHarness, BaseUseCaseHarnessOptions}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_pull_ingest_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let mocks = PullUseCaseHarnessMocks::default() + .with_authorizer_mock( + MockDatasetActionAuthorizer::new().make_expect_classify_datasets_by_allowance( + DatasetAction::Write, + 1, + HashSet::from_iter([alias_foo.clone()]), + ), + ) + .with_polling_ingest_mock( + MockPollingIngestService::new().make_expect_ingest(alias_foo.clone()), + ); + + let harness = PullUseCaseHarness::new(mocks); + let foo = harness.create_root_dataset(&alias_foo).await; + + let aliases = harness.get_remote_aliases(&foo).await; + assert!(aliases.is_empty(RemoteAliasKind::Pull)); + + let pull_request = PullRequest::Local(alias_foo.as_local_ref()); + + let pull_response = harness + .use_case + .execute(pull_request.clone(), PullOptions::default(), None) + .await + .unwrap(); + + assert_matches!( + pull_response, + PullResponse { + maybe_original_request: Some(a_pull_request), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Ok(PullResult::UpToDate(_)), + } if a_pull_request == pull_request && + a_local_ref == foo.dataset_handle.as_local_ref() + ); + + let aliases = harness.get_remote_aliases(&foo).await; + assert!(aliases.is_empty(RemoteAliasKind::Pull)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_pull_transform_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + + let mocks = PullUseCaseHarnessMocks::default() + .with_authorizer_mock( + MockDatasetActionAuthorizer::new() + .make_expect_classify_datasets_by_allowance( + DatasetAction::Write, + 1, + HashSet::from_iter([alias_bar.clone()]), + ) + .make_expect_classify_datasets_by_allowance( + DatasetAction::Read, + 1, + HashSet::from_iter([alias_foo.clone()]), + ), + ) + .with_transform_elaboration_mock( + MockTransformElaborationService::new() + .make_expect_elaborate_transform(alias_bar.clone()), + ) + .with_transform_execution_mock( + MockTransformExecutionService::new().make_expect_transform(alias_bar.clone()), + ); + + let harness = PullUseCaseHarness::new(mocks); + + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness + .create_derived_dataset(&alias_bar, vec![foo.dataset_handle.as_local_ref()]) + .await; + + let aliases = harness.get_remote_aliases(&bar).await; + assert!(aliases.is_empty(RemoteAliasKind::Pull)); + + let pull_request = PullRequest::Local(alias_bar.as_local_ref()); + + let pull_response = harness + .use_case + .execute(pull_request.clone(), PullOptions::default(), None) + .await + .unwrap(); + + assert_matches!( + pull_response, + PullResponse { + maybe_original_request: Some(a_pull_request), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Ok(PullResult::UpToDate(_)), + } if a_pull_request == pull_request && + a_local_ref == 
bar.dataset_handle.as_local_ref() + ); + + let aliases = harness.get_remote_aliases(&bar).await; + assert!(aliases.is_empty(RemoteAliasKind::Pull)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_pull_sync_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let remote_ref = DatasetRefRemote::Alias(DatasetAliasRemote { + repo_name: RepoName::new_unchecked(REMOTE_REPO_NAME_STR), + account_name: None, + dataset_name: alias_foo.dataset_name.clone(), + }); + + let mocks = PullUseCaseHarnessMocks::default() + .with_authorizer_mock( + MockDatasetActionAuthorizer::new().make_expect_classify_datasets_by_allowance( + DatasetAction::Write, + 1, + HashSet::from_iter([alias_foo.clone()]), + ), + ) + .with_sync_mock( + MockSyncService::new().make_expect_sync_pull_from_remote_to_existing_local( + alias_foo.clone(), + remote_ref.clone(), + SyncResult::Updated { + old_head: None, + new_head: Multihash::from_multibase( + "f16205603b882241c71351baf996d6dba7e3ddbd571457e93c1cd282bdc61f9fed5f2", + ) + .unwrap(), + num_blocks: 0, + }, + ), + ); + + let harness = PullUseCaseHarness::new(mocks); + + let foo = harness.create_root_dataset(&alias_foo).await; + + harness.copy_dataset_to_remote_repo(&alias_foo).await; + + let aliases = harness.get_remote_aliases(&foo).await; + assert!(aliases.is_empty(RemoteAliasKind::Pull)); + + let pull_request = PullRequest::Remote(PullRequestRemote { + remote_ref: remote_ref.clone(), + maybe_local_alias: Some(alias_foo.clone()), + }); + + let pull_response = harness + .use_case + .execute(pull_request.clone(), PullOptions::default(), None) + .await + .unwrap(); + + assert_matches!( + pull_response, + PullResponse { + maybe_original_request: Some(a_pull_request), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: Some(a_remote_ref), + result: Ok(PullResult::Updated { .. 
}), + } if a_pull_request == pull_request && + a_local_ref == foo.dataset_handle.as_local_ref() && + a_remote_ref == remote_ref, + ); + + let aliases = harness.get_remote_aliases(&foo).await; + let pull_aliases: Vec<_> = aliases.get_by_kind(RemoteAliasKind::Pull).collect(); + assert_eq!( + pull_aliases, + vec![&DatasetRefRemote::Alias(DatasetAliasRemote { + repo_name: harness.remote_repo_name, + account_name: None, + dataset_name: alias_foo.dataset_name.clone() + })] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_pull_sync_success_without_saving_alias() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let remote_ref = DatasetRefRemote::Alias(DatasetAliasRemote { + repo_name: RepoName::new_unchecked(REMOTE_REPO_NAME_STR), + account_name: None, + dataset_name: alias_foo.dataset_name.clone(), + }); + + let mocks = PullUseCaseHarnessMocks::default() + .with_authorizer_mock( + MockDatasetActionAuthorizer::new().make_expect_classify_datasets_by_allowance( + DatasetAction::Write, + 1, + HashSet::from_iter([alias_foo.clone()]), + ), + ) + .with_sync_mock( + MockSyncService::new().make_expect_sync_pull_from_remote_to_existing_local( + alias_foo.clone(), + remote_ref.clone(), + SyncResult::Updated { + old_head: None, + new_head: Multihash::from_multibase( + "f16205603b882241c71351baf996d6dba7e3ddbd571457e93c1cd282bdc61f9fed5f2", + ) + .unwrap(), + num_blocks: 0, + }, + ), + ); + + let harness = PullUseCaseHarness::new(mocks); + + let foo = harness.create_root_dataset(&alias_foo).await; + + harness.copy_dataset_to_remote_repo(&alias_foo).await; + + let aliases = harness.get_remote_aliases(&foo).await; + assert!(aliases.is_empty(RemoteAliasKind::Pull)); + + let pull_request = PullRequest::Remote(PullRequestRemote { + remote_ref: remote_ref.clone(), + maybe_local_alias: Some(alias_foo.clone()), + }); + + let pull_response = harness + .use_case + .execute( + pull_request.clone(), + PullOptions { + add_aliases: false, + ..PullOptions::default() + }, + None, + ) + .await + .unwrap(); + + assert_matches!( + pull_response, + PullResponse { + maybe_original_request: Some(a_pull_request), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: Some(a_remote_ref), + result: Ok(PullResult::Updated { .. 
}), + } if a_pull_request == pull_request && + a_local_ref == foo.dataset_handle.as_local_ref() && + a_remote_ref == remote_ref, + ); + + let aliases = harness.get_remote_aliases(&foo).await; + assert!(aliases.is_empty(RemoteAliasKind::Pull)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_pull_multi_recursive() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + let alias_baz = DatasetAlias::new(None, DatasetName::new_unchecked("baz")); + + let alias_foo_bar = DatasetAlias::new(None, DatasetName::new_unchecked("foo-bar")); + + let mocks = PullUseCaseHarnessMocks::default() + .with_authorizer_mock( + MockDatasetActionAuthorizer::new() + .make_expect_classify_datasets_by_allowance( + DatasetAction::Write, + 1, + HashSet::from_iter([alias_foo.clone(), alias_bar.clone(), alias_baz.clone()]), + ) + .make_expect_classify_datasets_by_allowance( + DatasetAction::Write, + 1, + HashSet::from_iter([alias_foo_bar.clone()]), + ) + .make_expect_classify_datasets_by_allowance( + DatasetAction::Read, + 1, + HashSet::from_iter([alias_foo.clone(), alias_bar.clone()]), + ), + ) + .with_polling_ingest_mock( + MockPollingIngestService::new() + .make_expect_ingest(alias_foo.clone()) + .make_expect_ingest(alias_bar.clone()) + .make_expect_ingest(alias_baz.clone()), + ) + .with_transform_elaboration_mock( + MockTransformElaborationService::new() + .make_expect_elaborate_transform(alias_foo_bar.clone()), + ) + .with_transform_execution_mock( + MockTransformExecutionService::new().make_expect_transform(alias_foo_bar.clone()), + ); + let harness = PullUseCaseHarness::new(mocks); + + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness.create_root_dataset(&alias_bar).await; + let baz = harness.create_root_dataset(&alias_baz).await; + + let foo_bar = harness + .create_derived_dataset( + &alias_foo_bar, + vec![ + foo.dataset_handle.as_local_ref(), + bar.dataset_handle.as_local_ref(), + ], + ) + .await; + + let pull_responses = harness + .use_case + .execute_multi( + vec![ + PullRequest::Local(baz.dataset_handle.as_local_ref()), + PullRequest::Local(foo_bar.dataset_handle.as_local_ref()), + ], + PullOptions { + recursive: true, + ..PullOptions::default() + }, + None, + ) + .await + .unwrap(); + assert_eq!(4, pull_responses.len()); + + let responses_by_name: HashMap<_, _> = pull_responses + .into_iter() + .map(|response| { + assert!(response.maybe_local_ref.is_some()); + ( + response + .maybe_local_ref + .as_ref() + .unwrap() + .dataset_name() + .unwrap() + .clone(), + response, + ) + }) + .collect(); + + assert_matches!( + responses_by_name.get(&alias_foo.dataset_name).unwrap(), + PullResponse { + maybe_original_request: None, // triggered via recursion + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Ok(PullResult::UpToDate(PullResultUpToDate::PollingIngest( + PollingIngestResultUpToDate { uncacheable: false } + ))) + } if *a_local_ref == foo.dataset_handle.as_local_ref() + ); + + assert_matches!( + responses_by_name.get(&alias_bar.dataset_name).unwrap(), + PullResponse { + maybe_original_request: None, // triggered via recursion + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Ok(PullResult::UpToDate(PullResultUpToDate::PollingIngest( + PollingIngestResultUpToDate { uncacheable: false } + ))) + } if *a_local_ref == 
bar.dataset_handle.as_local_ref() + ); + + assert_matches!( + responses_by_name.get(&alias_baz.dataset_name).unwrap(), + PullResponse { + maybe_original_request: Some(PullRequest::Local(_)), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Ok(PullResult::UpToDate(PullResultUpToDate::PollingIngest( + PollingIngestResultUpToDate { uncacheable: false } + ))) + } if *a_local_ref == baz.dataset_handle.as_local_ref() + ); + + assert_matches!( + responses_by_name.get(&alias_foo_bar.dataset_name).unwrap(), + PullResponse { + maybe_original_request: Some(PullRequest::Local(_)), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Ok(PullResult::UpToDate(PullResultUpToDate::Transform)) + } if *a_local_ref == foo_bar.dataset_handle.as_local_ref() + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_pull_all_owned() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + let alias_baz = DatasetAlias::new(None, DatasetName::new_unchecked("baz")); + + let alias_foo_bar = DatasetAlias::new(None, DatasetName::new_unchecked("foo-bar")); + let alias_foo_baz = DatasetAlias::new(None, DatasetName::new_unchecked("foo-baz")); + + let baz_ref_remote = DatasetRefRemote::Alias(DatasetAliasRemote { + repo_name: RepoName::new_unchecked(REMOTE_REPO_NAME_STR), + account_name: None, + dataset_name: alias_baz.dataset_name.clone(), + }); + + let mocks = PullUseCaseHarnessMocks::default() + .with_authorizer_mock( + MockDatasetActionAuthorizer::new() + .make_expect_classify_datasets_by_allowance( + DatasetAction::Write, + 1, + HashSet::from_iter([alias_foo.clone(), alias_bar.clone(), alias_baz.clone()]), + ) + .make_expect_classify_datasets_by_allowance( + DatasetAction::Write, + 1, + HashSet::from_iter([alias_foo_bar.clone(), alias_foo_baz.clone()]), + ) + .make_expect_classify_datasets_by_allowance( + DatasetAction::Read, + 1, + HashSet::from_iter([alias_foo.clone(), alias_bar.clone(), alias_baz.clone()]), + ), + ) + .with_polling_ingest_mock( + MockPollingIngestService::new() + .make_expect_ingest(alias_foo.clone()) + .make_expect_ingest(alias_bar.clone()), + ) + .with_sync_mock( + MockSyncService::new().make_expect_sync_pull_from_remote_to_existing_local( + alias_baz.clone(), + baz_ref_remote.clone(), + SyncResult::UpToDate, + ), + ) + .with_transform_elaboration_mock( + MockTransformElaborationService::new() + .make_expect_elaborate_transform(alias_foo_bar.clone()) + .make_expect_elaborate_transform(alias_foo_baz.clone()), + ) + .with_transform_execution_mock( + MockTransformExecutionService::new() + .make_expect_transform(alias_foo_bar.clone()) + .make_expect_transform(alias_foo_baz.clone()), + ); + let harness = PullUseCaseHarness::new(mocks); + + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness.create_root_dataset(&alias_bar).await; + let baz = harness.create_root_dataset(&alias_baz).await; + + harness.copy_dataset_to_remote_repo(&alias_baz).await; + harness + .get_remote_aliases(&baz) + .await + .add(&baz_ref_remote, RemoteAliasKind::Pull) + .await + .unwrap(); + + let foo_bar = harness + .create_derived_dataset( + &alias_foo_bar, + vec![ + foo.dataset_handle.as_local_ref(), + bar.dataset_handle.as_local_ref(), + ], + ) + .await; + let foo_baz = harness + .create_derived_dataset( + &alias_foo_baz, + vec![ + 
foo.dataset_handle.as_local_ref(), + baz.dataset_handle.as_local_ref(), + ], + ) + .await; + + let pull_responses = harness + .use_case + .execute_all_owned(PullOptions::default(), None) + .await + .unwrap(); + assert_eq!(5, pull_responses.len()); + + let responses_by_name: HashMap<_, _> = pull_responses + .into_iter() + .map(|response| { + assert!(response.maybe_local_ref.is_some()); + ( + response + .maybe_local_ref + .as_ref() + .unwrap() + .dataset_name() + .unwrap() + .clone(), + response, + ) + }) + .collect(); + + assert_matches!( + responses_by_name.get(&alias_foo.dataset_name).unwrap(), + PullResponse { + maybe_original_request: Some(PullRequest::Local(_)), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Ok(PullResult::UpToDate(PullResultUpToDate::PollingIngest( + PollingIngestResultUpToDate { uncacheable: false } + ))) + } if *a_local_ref == foo.dataset_handle.as_local_ref() + ); + + assert_matches!( + responses_by_name.get(&alias_bar.dataset_name).unwrap(), + PullResponse { + maybe_original_request: Some(PullRequest::Local(_)), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Ok(PullResult::UpToDate(PullResultUpToDate::PollingIngest( + PollingIngestResultUpToDate { uncacheable: false } + ))) + } if *a_local_ref == bar.dataset_handle.as_local_ref() + ); + + assert_matches!( + responses_by_name.get(&alias_baz.dataset_name).unwrap(), + PullResponse { + maybe_original_request: Some(PullRequest::Local(_)), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: Some(a_remote_ref), + result: Ok(PullResult::UpToDate(PullResultUpToDate::Sync)) + } if *a_local_ref == baz.dataset_handle.as_local_ref() && *a_remote_ref == baz_ref_remote + ); + + assert_matches!( + responses_by_name.get(&alias_foo_bar.dataset_name).unwrap(), + PullResponse { + maybe_original_request: Some(PullRequest::Local(_)), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Ok(PullResult::UpToDate(PullResultUpToDate::Transform)) + } if *a_local_ref == foo_bar.dataset_handle.as_local_ref() + ); + + assert_matches!( + responses_by_name.get(&alias_foo_baz.dataset_name).unwrap(), + PullResponse { + maybe_original_request: Some(PullRequest::Local(_)), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Ok(PullResult::UpToDate(PullResultUpToDate::Transform)) + } if *a_local_ref == foo_baz.dataset_handle.as_local_ref() + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_pull_authorization_issue() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + let alias_baz = DatasetAlias::new(None, DatasetName::new_unchecked("baz")); + + let mocks = PullUseCaseHarnessMocks::default().with_authorizer_mock( + MockDatasetActionAuthorizer::new().make_expect_classify_datasets_by_allowance( + DatasetAction::Write, + 1, + HashSet::from_iter([alias_foo.clone(), alias_baz.clone()]), + ), + ); + let harness = PullUseCaseHarness::new(mocks); + + let _foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness.create_root_dataset(&alias_bar).await; + let _baz = harness.create_root_dataset(&alias_baz).await; + + let mut pull_responses = harness + .use_case + .execute_multi( + vec![ + PullRequest::local(alias_foo.as_local_ref()), + PullRequest::local(alias_bar.as_local_ref()), + PullRequest::local(alias_baz.as_local_ref()), + 
], + PullOptions::default(), + None, + ) + .await + .unwrap(); + assert_eq!(1, pull_responses.len()); + + assert_matches!( + pull_responses.remove(0), + PullResponse { + maybe_original_request: Some(PullRequest::Local(_)), + maybe_local_ref: Some(a_local_ref), + maybe_remote_ref: None, + result: Err(PullError::Access(_)) + } if a_local_ref == bar.dataset_handle.as_local_ref() + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +static REMOTE_REPO_NAME_STR: &str = "remote"; + +#[oop::extend(BaseUseCaseHarness, base_harness)] +struct PullUseCaseHarness { + base_harness: BaseUseCaseHarness, + use_case: Arc, + remote_aliases_registry: Arc, + remote_repo_name: RepoName, + remote_tmp_dir: TempDir, +} + +impl PullUseCaseHarness { + fn new(mocks: PullUseCaseHarnessMocks) -> Self { + let base_harness = BaseUseCaseHarness::new( + BaseUseCaseHarnessOptions::new().with_authorizer(mocks.mock_dataset_action_authorizer), + ); + + let repos_dir = base_harness.temp_dir_path().join("repos"); + std::fs::create_dir(&repos_dir).unwrap(); + + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) + .add::() + .add::() + .add::() + .add_value(mocks.mock_polling_ingest_service) + .bind::() + .add_value(mocks.mock_transform_elaboration_service) + .bind::() + .add_value(mocks.mock_transform_execution_service) + .bind::() + .add_value(mocks.mock_sync_service) + .bind::() + .add::() + .add::() + .add::() + .add_value(RemoteRepositoryRegistryImpl::create(repos_dir).unwrap()) + .bind::() + .add::() + .add_value(IpfsGateway::default()) + .build(); + + let use_case = catalog.get_one().unwrap(); + let remote_aliases_registry = catalog.get_one().unwrap(); + + let remote_tmp_dir = tempfile::tempdir().unwrap(); + let remote_repo_url = Url::from_directory_path(remote_tmp_dir.path()).unwrap(); + + let remote_repo_name = RepoName::new_unchecked(REMOTE_REPO_NAME_STR); + let remote_repo_registry = catalog.get_one::().unwrap(); + remote_repo_registry + .add_repository(&remote_repo_name, remote_repo_url) + .unwrap(); + + Self { + base_harness, + use_case, + remote_aliases_registry, + remote_repo_name, + remote_tmp_dir, + } + } + + async fn get_remote_aliases(&self, created: &CreateDatasetResult) -> Box { + self.remote_aliases_registry + .get_remote_aliases(&created.dataset_handle) + .await + .unwrap() + } + + async fn copy_dataset_to_remote_repo(&self, dataset_alias: &DatasetAlias) { + let src_path = self + .base_harness + .temp_dir_path() + .join("datasets") + .join(&dataset_alias.dataset_name); + + let dst_path = self.remote_tmp_dir.path().join(&dataset_alias.dataset_name); + + tokio::fs::create_dir_all(&dst_path).await.unwrap(); + let copy_options = fs_extra::dir::CopyOptions::new().content_only(true); + fs_extra::dir::copy(src_path, dst_path, ©_options).unwrap(); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +struct PullUseCaseHarnessMocks { + mock_dataset_action_authorizer: MockDatasetActionAuthorizer, + mock_polling_ingest_service: MockPollingIngestService, + mock_transform_elaboration_service: MockTransformElaborationService, + mock_transform_execution_service: MockTransformExecutionService, + mock_sync_service: MockSyncService, +} + +impl PullUseCaseHarnessMocks { + fn with_authorizer_mock( + self, + mock_dataset_action_authorizer: MockDatasetActionAuthorizer, + ) -> Self { + Self { + mock_dataset_action_authorizer, + ..self + } + } + + fn 
with_polling_ingest_mock( + self, + mock_polling_ingest_service: MockPollingIngestService, + ) -> Self { + Self { + mock_polling_ingest_service, + ..self + } + } + + fn with_transform_elaboration_mock( + self, + mock_transform_elaboration_service: MockTransformElaborationService, + ) -> Self { + Self { + mock_transform_elaboration_service, + ..self + } + } + + fn with_transform_execution_mock( + self, + mock_transform_execution_service: MockTransformExecutionService, + ) -> Self { + Self { + mock_transform_execution_service, + ..self + } + } + + fn with_sync_mock(self, mock_sync_service: MockSyncService) -> Self { + Self { + mock_sync_service, + ..self + } + } +} + +impl Default for PullUseCaseHarnessMocks { + fn default() -> Self { + Self { + mock_dataset_action_authorizer: MockDatasetActionAuthorizer::new(), + mock_polling_ingest_service: MockPollingIngestService::new(), + mock_transform_elaboration_service: MockTransformElaborationService::new(), + mock_transform_execution_service: MockTransformExecutionService::new(), + mock_sync_service: MockSyncService::new(), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_push_dataset_use_case.rs b/src/infra/core/tests/tests/use_cases/test_push_dataset_use_case.rs new file mode 100644 index 0000000000..987fe37f73 --- /dev/null +++ b/src/infra/core/tests/tests/use_cases/test_push_dataset_use_case.rs @@ -0,0 +1,401 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::assert_matches::assert_matches; +use std::collections::HashSet; +use std::sync::Arc; + +use kamu::testing::{DummySmartTransferProtocolClient, MockDatasetActionAuthorizer}; +use kamu::utils::ipfs_wrapper::IpfsClient; +use kamu::utils::simple_transfer_protocol::SimpleTransferProtocol; +use kamu::*; +use kamu_core::auth::{DatasetAction, DummyOdfServerAccessTokenResolver}; +use kamu_core::*; +use opendatafabric::*; +use tempfile::TempDir; +use url::Url; + +use super::{BaseUseCaseHarness, BaseUseCaseHarnessOptions}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_push_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let mock_authorizer = MockDatasetActionAuthorizer::new() + .make_expect_classify_datasets_by_allowance( + DatasetAction::Read, + 2, + HashSet::from_iter([alias_foo.clone()]), + ); + + let harness = PushUseCaseHarness::new(mock_authorizer); + let foo = harness.create_root_dataset(&alias_foo).await; + + let aliases = harness.get_remote_aliases(&foo).await; + assert!(aliases.is_empty(RemoteAliasKind::Push)); + + let push_options = PushMultiOptions { + remote_target: Some(DatasetPushTarget::Repository( + harness.remote_repo_name.clone(), + )), + ..Default::default() + }; + + let mut responses = harness + .use_case + .execute_multi(vec![foo.dataset_handle.clone()], push_options.clone(), None) + .await + .unwrap(); + + assert_eq!(responses.len(), 1); + assert_matches!( + responses.remove(0), + PushResponse { + local_handle: Some(local_handle), + target: Some(DatasetPushTarget::Repository(repo_name)), + result: Ok(SyncResult::Updated { old_head, new_head: _, num_blocks }), + } if local_handle == foo.dataset_handle && + repo_name == harness.remote_repo_name && + old_head.is_none() && num_blocks == 2 + ); + + let aliases = harness.get_remote_aliases(&foo).await; + let push_aliases: Vec<_> = aliases.get_by_kind(RemoteAliasKind::Push).collect(); + assert_eq!( + push_aliases, + vec![&DatasetRefRemote::Url(Arc::new( + harness.remote_repo_url.join("foo").unwrap() + ))] + ); + + let mut responses = harness + .use_case + .execute_multi(vec![foo.dataset_handle.clone()], push_options, None) + .await + .unwrap(); + + assert_eq!(responses.len(), 1); + assert_matches!( + responses.remove(0), + PushResponse { + local_handle: Some(local_handle), + target: Some(DatasetPushTarget::Repository(repo_name)), + result: Ok(SyncResult::UpToDate), + } if local_handle == foo.dataset_handle && + repo_name == harness.remote_repo_name + ); + + let aliases = harness.get_remote_aliases(&foo).await; + let push_aliases: Vec<_> = aliases.get_by_kind(RemoteAliasKind::Push).collect(); + assert_eq!( + push_aliases, + vec![&DatasetRefRemote::Url(Arc::new( + harness.remote_repo_url.join("foo").unwrap() + ))] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_push_success_without_saving_alias() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let mock_authorizer = MockDatasetActionAuthorizer::new() + .make_expect_classify_datasets_by_allowance( + DatasetAction::Read, + 1, + HashSet::from_iter([alias_foo.clone()]), + ); + + let harness = PushUseCaseHarness::new(mock_authorizer); + let foo = harness.create_root_dataset(&alias_foo).await; + + let push_options = PushMultiOptions { + remote_target: 
Some(DatasetPushTarget::Repository( + harness.remote_repo_name.clone(), + )), + add_aliases: false, + ..Default::default() + }; + + let mut responses = harness + .use_case + .execute_multi(vec![foo.dataset_handle.clone()], push_options.clone(), None) + .await + .unwrap(); + + assert_eq!(responses.len(), 1); + assert_matches!( + responses.remove(0), + PushResponse { + local_handle: Some(local_handle), + target: Some(DatasetPushTarget::Repository(repo_name)), + result: Ok(SyncResult::Updated { old_head, new_head: _, num_blocks }), + } if local_handle == foo.dataset_handle && + repo_name == harness.remote_repo_name && + old_head.is_none() && num_blocks == 2 + ); + + let aliases = harness.get_remote_aliases(&foo).await; + assert!(aliases.is_empty(RemoteAliasKind::Push)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_push_unauthorized() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let mock_authorizer = MockDatasetActionAuthorizer::new() + .make_expect_classify_datasets_by_allowance( + DatasetAction::Read, + 1, + HashSet::new(), // not authorized + ); + + let harness = PushUseCaseHarness::new(mock_authorizer); + let foo = harness.create_root_dataset(&alias_foo).await; + + let push_options = PushMultiOptions { + remote_target: Some(DatasetPushTarget::Repository( + harness.remote_repo_name.clone(), + )), + ..Default::default() + }; + + let mut responses = harness + .use_case + .execute_multi(vec![foo.dataset_handle.clone()], push_options.clone(), None) + .await + .unwrap(); + + assert_eq!(responses.len(), 1); + assert_matches!( + responses.remove(0), + PushResponse { + local_handle: Some(local_handle), + target: Some(DatasetPushTarget::Repository(repo_name)), + result: Err(PushError::SyncError(SyncError::Access(_))), + } if local_handle == foo.dataset_handle && + repo_name == harness.remote_repo_name + ); + + // Aliases should not be touched in case of failure + let aliases = harness.get_remote_aliases(&foo).await; + assert!(aliases.is_empty(RemoteAliasKind::Push)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_push_multiple_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + + let mock_authorizer = MockDatasetActionAuthorizer::new() + .make_expect_classify_datasets_by_allowance( + DatasetAction::Read, + 1, + HashSet::from_iter([alias_foo.clone(), alias_bar.clone()]), + ); + + let harness = PushUseCaseHarness::new(mock_authorizer); + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness.create_root_dataset(&alias_bar).await; + + let push_options = PushMultiOptions { + remote_target: Some(DatasetPushTarget::Repository( + harness.remote_repo_name.clone(), + )), + ..Default::default() + }; + + let mut responses = harness + .use_case + .execute_multi( + vec![foo.dataset_handle.clone(), bar.dataset_handle.clone()], + push_options.clone(), + None, + ) + .await + .unwrap(); + + assert_eq!(responses.len(), 2); + assert_matches!( + responses.remove(1), + PushResponse { + local_handle: Some(local_handle), + target: Some(DatasetPushTarget::Repository(repo_name)), + result: Ok(SyncResult::Updated { .. 
}), + } if local_handle == bar.dataset_handle && + repo_name == harness.remote_repo_name + ); + assert_matches!( + responses.remove(0), + PushResponse { + local_handle: Some(local_handle), + target: Some(DatasetPushTarget::Repository(repo_name)), + result: Ok(SyncResult::Updated { .. }), + } if local_handle == foo.dataset_handle && + repo_name == harness.remote_repo_name + ); + + let foo_aliases = harness.get_remote_aliases(&foo).await; + let push_aliases: Vec<_> = foo_aliases.get_by_kind(RemoteAliasKind::Push).collect(); + assert_eq!( + push_aliases, + vec![&DatasetRefRemote::Url(Arc::new( + harness.remote_repo_url.join("foo").unwrap() + ))] + ); + + let bar_aliases = harness.get_remote_aliases(&bar).await; + let push_aliases: Vec<_> = bar_aliases.get_by_kind(RemoteAliasKind::Push).collect(); + assert_eq!( + push_aliases, + vec![&DatasetRefRemote::Url(Arc::new( + harness.remote_repo_url.join("bar").unwrap() + ))] + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_push_multiple_mixed_authorization_issues() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + let alias_baz = DatasetAlias::new(None, DatasetName::new_unchecked("baz")); + + let mock_authorizer = MockDatasetActionAuthorizer::new() + .make_expect_classify_datasets_by_allowance( + DatasetAction::Read, + 1, + HashSet::from_iter([alias_bar.clone()]), // 1 of 3 is authorized + ); + + let harness = PushUseCaseHarness::new(mock_authorizer); + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness.create_root_dataset(&alias_bar).await; + let baz = harness.create_root_dataset(&alias_baz).await; + + let push_options = PushMultiOptions { + remote_target: Some(DatasetPushTarget::Repository( + harness.remote_repo_name.clone(), + )), + ..Default::default() + }; + + let mut responses = harness + .use_case + .execute_multi( + vec![ + foo.dataset_handle.clone(), + bar.dataset_handle.clone(), + baz.dataset_handle.clone(), + ], + push_options.clone(), + None, + ) + .await + .unwrap(); + + assert_eq!(responses.len(), 2); + assert_matches!( + responses.remove(1), + PushResponse { + local_handle: Some(local_handle), + target: Some(DatasetPushTarget::Repository(repo_name)), + result: Err(PushError::SyncError(SyncError::Access(_))), + } if local_handle == baz.dataset_handle && + repo_name == harness.remote_repo_name + ); + assert_matches!( + responses.remove(0), + PushResponse { + local_handle: Some(local_handle), + target: Some(DatasetPushTarget::Repository(repo_name)), + result: Err(PushError::SyncError(SyncError::Access(_))), + } if local_handle == foo.dataset_handle && + repo_name == harness.remote_repo_name + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[oop::extend(BaseUseCaseHarness, base_harness)] +struct PushUseCaseHarness { + base_harness: BaseUseCaseHarness, + use_case: Arc, + remote_aliases_registry: Arc, + remote_repo_name: RepoName, + remote_repo_url: Url, + _remote_tmp_dir: TempDir, +} + +impl PushUseCaseHarness { + fn new(mock_dataset_action_authorizer: MockDatasetActionAuthorizer) -> Self { + let base_harness = BaseUseCaseHarness::new( + BaseUseCaseHarnessOptions::new().with_authorizer(mock_dataset_action_authorizer), + ); + + let repos_dir = base_harness.temp_dir_path().join("repos"); + 
std::fs::create_dir(&repos_dir).unwrap(); + + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) + .add::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add::() + .add_value(RemoteRepositoryRegistryImpl::create(repos_dir).unwrap()) + .bind::() + .add::() + .add::() + .add::() + .add_value(IpfsClient::default()) + .add_value(IpfsGateway::default()) + .build(); + + let use_case = catalog.get_one().unwrap(); + let remote_aliases_registry = catalog.get_one().unwrap(); + + let remote_tmp_dir = tempfile::tempdir().unwrap(); + let remote_repo_url = Url::from_directory_path(remote_tmp_dir.path()).unwrap(); + + let remote_repo_name = RepoName::new_unchecked("remote"); + let remote_repo_registry = catalog.get_one::().unwrap(); + remote_repo_registry + .add_repository(&remote_repo_name, remote_repo_url.clone()) + .unwrap(); + + Self { + base_harness, + use_case, + remote_aliases_registry, + remote_repo_name, + remote_repo_url, + _remote_tmp_dir: remote_tmp_dir, + } + } + + async fn get_remote_aliases(&self, created: &CreateDatasetResult) -> Box { + self.remote_aliases_registry + .get_remote_aliases(&created.dataset_handle) + .await + .unwrap() + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_rename_dataset_use_case.rs b/src/infra/core/tests/tests/use_cases/test_rename_dataset_use_case.rs index 13c1a8f1e0..6d4275f095 100644 --- a/src/infra/core/tests/tests/use_cases/test_rename_dataset_use_case.rs +++ b/src/infra/core/tests/tests/use_cases/test_rename_dataset_use_case.rs @@ -10,24 +10,13 @@ use std::assert_matches::assert_matches; use std::sync::Arc; -use dill::{Catalog, Component}; -use kamu::testing::{MetadataFactory, MockDatasetActionAuthorizer}; -use kamu::{DatasetRepositoryLocalFs, DatasetRepositoryWriter, RenameDatasetUseCaseImpl}; -use kamu_accounts::CurrentAccountSubject; -use kamu_core::auth::DatasetActionAuthorizer; -use kamu_core::{ - CreateDatasetResult, - DatasetLifecycleMessage, - DatasetRepository, - GetDatasetError, - RenameDatasetError, - RenameDatasetUseCase, - MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE, -}; -use messaging_outbox::{MockOutbox, Outbox}; -use mockall::predicate::{eq, function}; -use opendatafabric::{DatasetAlias, DatasetKind, DatasetName}; -use time_source::SystemTimeSourceDefault; +use kamu::testing::MockDatasetActionAuthorizer; +use kamu::RenameDatasetUseCaseImpl; +use kamu_core::{GetDatasetError, RenameDatasetError, RenameDatasetUseCase}; +use messaging_outbox::MockOutbox; +use opendatafabric::{DatasetAlias, DatasetName}; + +use crate::tests::use_cases::*; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -39,7 +28,7 @@ async fn test_rename_dataset_success_via_ref() { let mock_authorizer = MockDatasetActionAuthorizer::new().expect_check_write_dataset(&alias_foo, 1, true); let mut mock_outbox = MockOutbox::new(); - RenameUseCaseHarness::add_outbox_dataset_renamed_expectation(&mut mock_outbox, 1); + expect_outbox_dataset_renamed(&mut mock_outbox, 1); let harness = RenameUseCaseHarness::new(mock_authorizer, mock_outbox); harness.create_root_dataset(&alias_foo).await; @@ -111,9 +100,9 @@ async fn test_rename_dataset_unauthorized() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#[oop::extend(BaseUseCaseHarness, base_harness)] struct RenameUseCaseHarness { 
- _temp_dir: tempfile::TempDir, - catalog: Catalog, + base_harness: BaseUseCaseHarness, use_case: Arc, } @@ -122,81 +111,23 @@ impl RenameUseCaseHarness { mock_dataset_action_authorizer: MockDatasetActionAuthorizer, mock_outbox: MockOutbox, ) -> Self { - let tempdir = tempfile::tempdir().unwrap(); - - let datasets_dir = tempdir.path().join("datasets"); - std::fs::create_dir(&datasets_dir).unwrap(); + let base_harness = BaseUseCaseHarness::new( + BaseUseCaseHarnessOptions::new() + .with_authorizer(mock_dataset_action_authorizer) + .with_outbox(mock_outbox), + ); - let catalog = dill::CatalogBuilder::new() + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) .add::() - .add_builder( - DatasetRepositoryLocalFs::builder() - .with_root(datasets_dir) - .with_multi_tenant(false), - ) - .bind::() - .bind::() - .add_value(CurrentAccountSubject::new_test()) - .add_value(mock_dataset_action_authorizer) - .bind::() - .add::() - .add_value(mock_outbox) - .bind::() .build(); let use_case = catalog.get_one::().unwrap(); Self { - _temp_dir: tempdir, - catalog, + base_harness, use_case, } } - - async fn create_root_dataset(&self, alias: &DatasetAlias) -> CreateDatasetResult { - let snapshot = MetadataFactory::dataset_snapshot() - .name(alias.clone()) - .kind(DatasetKind::Root) - .push_event(MetadataFactory::set_polling_source().build()) - .build(); - - let dataset_repo_writer = self - .catalog - .get_one::() - .unwrap(); - - let result = dataset_repo_writer - .create_dataset_from_snapshot(snapshot) - .await - .unwrap(); - - result.create_dataset_result - } - - async fn check_dataset_exists(&self, alias: &DatasetAlias) -> Result<(), GetDatasetError> { - let dataset_repo = self.catalog.get_one::().unwrap(); - dataset_repo - .find_dataset_by_ref(&alias.as_local_ref()) - .await?; - Ok(()) - } - - fn add_outbox_dataset_renamed_expectation(mock_outbox: &mut MockOutbox, times: usize) { - mock_outbox - .expect_post_message_as_json() - .with( - eq(MESSAGE_PRODUCER_KAMU_CORE_DATASET_SERVICE), - function(|message_as_json: &serde_json::Value| { - matches!( - serde_json::from_value::(message_as_json.clone()), - Ok(DatasetLifecycleMessage::Renamed(_)) - ) - }), - eq(1), - ) - .times(times) - .returning(|_, _, _| Ok(())); - } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_reset_dataset_use_case.rs b/src/infra/core/tests/tests/use_cases/test_reset_dataset_use_case.rs new file mode 100644 index 0000000000..ea89f41370 --- /dev/null +++ b/src/infra/core/tests/tests/use_cases/test_reset_dataset_use_case.rs @@ -0,0 +1,100 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::assert_matches::assert_matches; +use std::sync::Arc; + +use kamu::testing::{MetadataFactory, MockDatasetActionAuthorizer}; +use kamu::*; +use kamu_core::*; +use opendatafabric::*; + +use super::{BaseUseCaseHarness, BaseUseCaseHarnessOptions}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_reset_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let harness = ResetUseCaseHarness::new( + MockDatasetActionAuthorizer::new().expect_check_write_dataset(&alias_foo, 1, true), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + foo.dataset + .commit_event( + MetadataEvent::SetInfo(MetadataFactory::set_info().description("test").build()), + CommitOpts::default(), + ) + .await + .unwrap(); + + assert_eq!(harness.num_blocks(ResolvedDataset::from(&foo)).await, 3); + + let new_head = harness + .use_case + .execute(&foo.dataset_handle, Some(&foo.head), None) + .await + .unwrap(); + + assert_eq!(new_head, foo.head); + assert_eq!(harness.num_blocks(ResolvedDataset::from(&foo)).await, 2); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_reset_dataset_unauthorized() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let harness = ResetUseCaseHarness::new( + MockDatasetActionAuthorizer::new().expect_check_write_dataset(&alias_foo, 1, false), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + + assert_matches!( + harness + .use_case + .execute(&foo.dataset_handle, Some(&foo.head), None) + .await, + Err(ResetError::Access(_)) + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[oop::extend(BaseUseCaseHarness, base_harness)] +struct ResetUseCaseHarness { + base_harness: BaseUseCaseHarness, + use_case: Arc, +} + +impl ResetUseCaseHarness { + fn new(mock_dataset_action_authorizer: MockDatasetActionAuthorizer) -> Self { + let base_harness = BaseUseCaseHarness::new( + BaseUseCaseHarnessOptions::new().with_authorizer(mock_dataset_action_authorizer), + ); + + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) + .add::() + .add::() + .build(); + + let use_case = catalog.get_one::().unwrap(); + + Self { + base_harness, + use_case, + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_set_watermark_use_case.rs b/src/infra/core/tests/tests/use_cases/test_set_watermark_use_case.rs new file mode 100644 index 0000000000..d5debe946f --- /dev/null +++ b/src/infra/core/tests/tests/use_cases/test_set_watermark_use_case.rs @@ -0,0 +1,108 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::assert_matches::assert_matches; +use std::sync::Arc; + +use chrono::{DateTime, TimeDelta, Utc}; +use kamu::testing::MockDatasetActionAuthorizer; +use kamu::*; +use kamu_core::*; +use opendatafabric::*; + +use super::{BaseUseCaseHarness, BaseUseCaseHarnessOptions}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_set_watermark_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let harness = SetWatermarkUseCaseHarness::new( + MockDatasetActionAuthorizer::new().expect_check_write_dataset(&alias_foo, 1, true), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + + let watermark = Utc::now() - TimeDelta::minutes(5); + let result = harness + .use_case + .execute(&foo.dataset_handle, watermark) + .await + .unwrap(); + + assert_matches!(result, SetWatermarkResult::Updated { .. }); + assert_eq!(harness.current_watermark(&foo).await, Some(watermark)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_set_watermark_unauthorized() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let harness = SetWatermarkUseCaseHarness::new( + MockDatasetActionAuthorizer::new().expect_check_write_dataset(&alias_foo, 1, false), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + + assert_matches!( + harness + .use_case + .execute(&foo.dataset_handle, Utc::now()) + .await, + Err(SetWatermarkError::Access(_)) + ); + assert_eq!(harness.current_watermark(&foo).await, None,); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[oop::extend(BaseUseCaseHarness, base_harness)] +struct SetWatermarkUseCaseHarness { + base_harness: BaseUseCaseHarness, + use_case: Arc, + watermark_svc: Arc, +} + +impl SetWatermarkUseCaseHarness { + fn new(mock_dataset_action_authorizer: MockDatasetActionAuthorizer) -> Self { + let base_harness = BaseUseCaseHarness::new( + BaseUseCaseHarnessOptions::new().with_authorizer(mock_dataset_action_authorizer), + ); + + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) + .add::() + .add::() + .add::() + .build(); + + let use_case = catalog.get_one().unwrap(); + let watermark_svc = catalog.get_one().unwrap(); + + Self { + base_harness, + use_case, + watermark_svc, + } + } + + async fn current_watermark( + &self, + created_result: &CreateDatasetResult, + ) -> Option> { + self.watermark_svc + .try_get_current_watermark(ResolvedDataset::from(created_result)) + .await + .unwrap() + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/use_cases/test_verify_dataset_use_case.rs b/src/infra/core/tests/tests/use_cases/test_verify_dataset_use_case.rs new file mode 100644 index 0000000000..e2b9f4a076 --- /dev/null +++ b/src/infra/core/tests/tests/use_cases/test_verify_dataset_use_case.rs @@ -0,0 +1,220 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::assert_matches::assert_matches; +use std::sync::Arc; + +use kamu::testing::MockDatasetActionAuthorizer; +use kamu::*; +use kamu_core::*; +use opendatafabric::*; + +use super::{BaseUseCaseHarness, BaseUseCaseHarnessOptions}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_verify_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let harness = VerifyUseCaseHarness::new( + MockDatasetActionAuthorizer::new().expect_check_read_dataset(&alias_foo, 1, true), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + assert_matches!( + harness.verify_dataset(ResolvedDataset::from(&foo)).await, + VerificationResult { + dataset_handle: Some(dataset_handle), + outcome: Ok(()), + } if dataset_handle == foo.dataset_handle + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_verify_multiple_success() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + + let harness = VerifyUseCaseHarness::new( + MockDatasetActionAuthorizer::new() + .expect_check_read_dataset(&alias_foo, 1, true) + .expect_check_read_dataset(&alias_bar, 1, true), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = harness.create_root_dataset(&alias_bar).await; + + let mut responses = harness + .verify_datasets(vec![ + ResolvedDataset::from(&foo), + ResolvedDataset::from(&bar), + ]) + .await; + + assert_eq!(responses.len(), 2); + let response_bar = responses.remove(1); + let response_foo = responses.remove(0); + + assert_matches!( + response_foo, + VerificationResult { + dataset_handle, + outcome: Ok(_), + } + if dataset_handle == Some(foo.dataset_handle) + ); + assert_matches!( + response_bar, + VerificationResult { + dataset_handle, + outcome: Ok(_), + } + if dataset_handle == Some(bar.dataset_handle) + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_verify_unauthorized() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + + let harness = VerifyUseCaseHarness::new( + MockDatasetActionAuthorizer::new().expect_check_read_dataset(&alias_foo, 1, false), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + assert_matches!( + harness.verify_dataset(ResolvedDataset::from(&foo)).await, + VerificationResult { + dataset_handle: Some(dataset_handle), + outcome: Err(VerificationError::Access(_)), + } if dataset_handle == foo.dataset_handle + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[tokio::test] +async fn test_verify_mixed_authorization_outcome() { + let alias_foo = DatasetAlias::new(None, DatasetName::new_unchecked("foo")); + let alias_bar = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); + let alias_baz = DatasetAlias::new(None, DatasetName::new_unchecked("baz")); + + let harness = VerifyUseCaseHarness::new( + MockDatasetActionAuthorizer::new() + .expect_check_read_dataset(&alias_foo, 1, true) + .expect_check_read_dataset(&alias_bar, 1, false) + .expect_check_read_dataset(&alias_baz, 1, true), + ); + + let foo = harness.create_root_dataset(&alias_foo).await; + let bar = 
harness.create_root_dataset(&alias_bar).await; + let baz = harness.create_root_dataset(&alias_baz).await; + + let mut responses = harness + .verify_datasets(vec![ + ResolvedDataset::from(&foo), + ResolvedDataset::from(&bar), + ResolvedDataset::from(&baz), + ]) + .await; + + assert_eq!(responses.len(), 3); + let response_baz = responses.remove(2); + let response_foo = responses.remove(1); + let response_bar = responses.remove(0); + + assert_matches!( + response_foo, + VerificationResult { + dataset_handle, + outcome: Ok(_), + } + if dataset_handle == Some(foo.dataset_handle) + ); + assert_matches!( + response_bar, + VerificationResult { + dataset_handle, + outcome: Err(VerificationError::Access(_)), + } + if dataset_handle == Some(bar.dataset_handle) + ); + assert_matches!( + response_baz, + VerificationResult { + dataset_handle, + outcome: Ok(_), + } + if dataset_handle == Some(baz.dataset_handle) + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#[oop::extend(BaseUseCaseHarness, base_harness)] +struct VerifyUseCaseHarness { + base_harness: BaseUseCaseHarness, + use_case: Arc, +} + +impl VerifyUseCaseHarness { + fn new(mock_dataset_action_authorizer: MockDatasetActionAuthorizer) -> Self { + let base_harness = BaseUseCaseHarness::new( + BaseUseCaseHarnessOptions::new().with_authorizer(mock_dataset_action_authorizer), + ); + + let catalog = dill::CatalogBuilder::new_chained(base_harness.catalog()) + .add::() + .add::() + .add::() + .add::() + .add::() + .build(); + + let use_case = catalog.get_one().unwrap(); + + Self { + base_harness, + use_case, + } + } + + async fn verify_dataset(&self, target: ResolvedDataset) -> VerificationResult { + self.use_case + .execute( + VerificationRequest:: { + target: target.take_handle(), + block_range: (None, None), + options: VerificationOptions::default(), + }, + None, + ) + .await + } + + async fn verify_datasets(&self, targets: Vec) -> Vec { + let requests: Vec<_> = targets + .into_iter() + .map(|target| VerificationRequest:: { + target: target.take_handle(), + block_range: (None, None), + options: VerificationOptions::default(), + }) + .collect(); + + self.use_case.execute_multi(requests, None).await + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/utils/base_repo_harness.rs b/src/infra/core/tests/utils/base_repo_harness.rs new file mode 100644 index 0000000000..9ec640e53c --- /dev/null +++ b/src/infra/core/tests/utils/base_repo_harness.rs @@ -0,0 +1,149 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::path::Path; +use std::sync::Arc; + +use dill::{Catalog, Component}; +use kamu::testing::MetadataFactory; +use kamu::{DatasetRegistryRepoBridge, DatasetRepositoryLocalFs, DatasetRepositoryWriter}; +use kamu_accounts::CurrentAccountSubject; +use kamu_core::{ + CreateDatasetResult, + DatasetRegistry, + DatasetRegistryExt, + DatasetRepository, + GetDatasetError, + MetadataChainExt, + ResolvedDataset, + RunInfoDir, + TenancyConfig, +}; +use opendatafabric::serde::flatbuffers::FlatbuffersMetadataBlockSerializer; +use opendatafabric::serde::MetadataBlockSerializer; +use opendatafabric::{DatasetAlias, DatasetKind, DatasetRef, MetadataBlock, Multicodec, Multihash}; +use time_source::SystemTimeSourceDefault; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct BaseRepoHarness { + temp_dir: tempfile::TempDir, + catalog: Catalog, + dataset_registry: Arc, + dataset_repo_writer: Arc, +} + +impl BaseRepoHarness { + pub fn new(tenancy_config: TenancyConfig) -> Self { + let temp_dir = tempfile::tempdir().unwrap(); + + let datasets_dir = temp_dir.path().join("datasets"); + std::fs::create_dir(&datasets_dir).unwrap(); + + let run_info_dir = temp_dir.path().join("run"); + std::fs::create_dir(&run_info_dir).unwrap(); + + let catalog = dill::CatalogBuilder::new() + .add_value(RunInfoDir::new(run_info_dir)) + .add_value(tenancy_config) + .add_builder(DatasetRepositoryLocalFs::builder().with_root(datasets_dir)) + .bind::() + .bind::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add::() + .build(); + + let dataset_registry = catalog.get_one().unwrap(); + let dataset_repo_writer = catalog.get_one().unwrap(); + + Self { + temp_dir, + catalog, + dataset_registry, + dataset_repo_writer, + } + } + + pub fn catalog(&self) -> &Catalog { + &self.catalog + } + + pub fn temp_dir_path(&self) -> &Path { + self.temp_dir.path() + } + + pub fn dataset_registry(&self) -> &dyn DatasetRegistry { + self.dataset_registry.as_ref() + } + + pub fn dataset_repo_writer(&self) -> &dyn DatasetRepositoryWriter { + self.dataset_repo_writer.as_ref() + } + + pub async fn check_dataset_exists(&self, alias: &DatasetAlias) -> Result<(), GetDatasetError> { + self.dataset_registry + .get_dataset_by_ref(&alias.as_local_ref()) + .await?; + Ok(()) + } + + pub async fn create_root_dataset(&self, alias: &DatasetAlias) -> CreateDatasetResult { + let snapshot = MetadataFactory::dataset_snapshot() + .name(alias.clone()) + .kind(DatasetKind::Root) + .push_event(MetadataFactory::set_polling_source().build()) + .build(); + + let result = self + .dataset_repo_writer + .create_dataset_from_snapshot(snapshot) + .await + .unwrap(); + + result.create_dataset_result + } + + pub async fn create_derived_dataset( + &self, + alias: &DatasetAlias, + input_dataset_refs: Vec, + ) -> CreateDatasetResult { + self.dataset_repo_writer + .create_dataset_from_snapshot( + MetadataFactory::dataset_snapshot() + .name(alias.clone()) + .kind(DatasetKind::Derivative) + .push_event( + MetadataFactory::set_transform() + .inputs_from_refs(input_dataset_refs) + .build(), + ) + .build(), + ) + .await + .unwrap() + .create_dataset_result + } + + pub async fn num_blocks(&self, target: ResolvedDataset) -> usize { + use futures::StreamExt; + target.as_metadata_chain().iter_blocks().count().await + } + + pub fn hash_from_block(block: &MetadataBlock) -> Multihash { + let block_data = FlatbuffersMetadataBlockSerializer + .write_manifest(block) + .unwrap(); + + 
Multihash::from_digest::(Multicodec::Sha3_256, &block_data) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/utils/ftp_server.rs b/src/infra/core/tests/utils/ftp_server.rs index 34819eb603..0901b9b489 100644 --- a/src/infra/core/tests/utils/ftp_server.rs +++ b/src/infra/core/tests/utils/ftp_server.rs @@ -33,7 +33,7 @@ impl FtpServer { .unwrap(); if !server_dir.exists() { - std::fs::create_dir(&server_dir).unwrap(); + std::fs::create_dir(server_dir).unwrap(); } // TODO: this is likely very brittle because of all the port mapping diff --git a/src/infra/core/tests/utils/mock_engine_provisioner.rs b/src/infra/core/tests/utils/mock_engine_provisioner.rs index 06c1037b53..05eb33d906 100644 --- a/src/infra/core/tests/utils/mock_engine_provisioner.rs +++ b/src/infra/core/tests/utils/mock_engine_provisioner.rs @@ -57,6 +57,7 @@ impl Engine for EngineStub { async fn execute_transform( &self, _request: TransformRequestExt, + _datasets_map: &ResolvedDatasetsMap, ) -> Result { // Note: At least 1 output field must be present, watermark is easy to mimic Ok(TransformResponseExt { diff --git a/src/infra/core/tests/utils/mod.rs b/src/infra/core/tests/utils/mod.rs index ffc53a8611..eb28ace95a 100644 --- a/src/infra/core/tests/utils/mod.rs +++ b/src/infra/core/tests/utils/mod.rs @@ -22,3 +22,9 @@ pub use http_server::*; pub use ipfs_daemon::*; #[cfg(feature = "ingest-mqtt")] pub use mqtt_broker::*; + +mod transform_test_helper; +pub use transform_test_helper::*; + +mod base_repo_harness; +pub use base_repo_harness::*; diff --git a/src/infra/core/tests/utils/transform_test_helper.rs b/src/infra/core/tests/utils/transform_test_helper.rs new file mode 100644 index 0000000000..a36bd1a16b --- /dev/null +++ b/src/infra/core/tests/utils/transform_test_helper.rs @@ -0,0 +1,120 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::sync::Arc; + +use dill::Catalog; +use kamu::{ + TransformElaborationServiceImpl, + TransformExecutionServiceImpl, + TransformRequestPlannerImpl, +}; +use kamu_core::{ + CompactionService, + CreateDatasetResult, + DatasetRegistry, + EngineProvisioner, + ResolvedDataset, + TransformElaboration, + TransformElaborationService, + TransformExecutionService, + TransformOptions, + TransformRequestPlanner, + TransformResult, + VerifyTransformError, +}; +use time_source::SystemTimeSource; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub struct TransformTestHelper { + transform_request_planner: Arc, + transform_elab_svc: Arc, + transform_exec_svc: Arc, +} + +impl TransformTestHelper { + pub fn build( + dataset_registry: Arc, + system_time_source: Arc, + compaction_svc: Arc, + engine_provisioner: Arc, + ) -> Self { + Self { + transform_request_planner: Arc::new(TransformRequestPlannerImpl::new( + dataset_registry, + system_time_source.clone(), + )), + transform_elab_svc: Arc::new(TransformElaborationServiceImpl::new( + compaction_svc, + system_time_source, + )), + transform_exec_svc: Arc::new(TransformExecutionServiceImpl::new(engine_provisioner)), + } + } + + pub fn from_catalog(catalog: &Catalog) -> Self { + Self { + transform_request_planner: catalog.get_one().unwrap(), + transform_elab_svc: catalog.get_one().unwrap(), + transform_exec_svc: catalog.get_one().unwrap(), + } + } + + pub async fn transform_dataset(&self, derived: &CreateDatasetResult) -> TransformResult { + let deriv_target = ResolvedDataset::from(derived); + + let plan = self + .transform_request_planner + .build_transform_preliminary_plan(deriv_target.clone()) + .await + .unwrap(); + + let plan = match self + .transform_elab_svc + .elaborate_transform( + deriv_target.clone(), + plan, + TransformOptions::default(), + None, + ) + .await + .unwrap() + { + TransformElaboration::Elaborated(plan) => plan, + TransformElaboration::UpToDate => return TransformResult::UpToDate, + }; + + self.transform_exec_svc + .execute_transform(deriv_target, plan, None) + .await + .1 + .unwrap() + } + + pub async fn verify_transform( + &self, + derived: &CreateDatasetResult, + ) -> Result<(), VerifyTransformError> { + let deriv_target = ResolvedDataset::from(derived); + + let verify_plan = self + .transform_request_planner + .build_transform_verification_plan(deriv_target.clone(), (None, None)) + .await + .map_err(VerifyTransformError::Plan)?; + + self.transform_exec_svc + .execute_verify_transform(deriv_target, verify_plan, None) + .await + .map_err(VerifyTransformError::Execute) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/datasets/inmem/Cargo.toml b/src/infra/datasets/inmem/Cargo.toml index 0a6fc3e399..d5fa677bf5 100644 --- a/src/infra/datasets/inmem/Cargo.toml +++ b/src/infra/datasets/inmem/Cargo.toml @@ -27,9 +27,11 @@ kamu-datasets = { workspace = true } opendatafabric = { workspace = true } internal-error = { workspace = true } +async-stream = "0.3" async-trait = { version = "0.1", default-features = false } chrono = { version = "0.4", default-features = false } dill = "0.9" +futures = "0.3" secrecy = "0.10" thiserror = { version = "1", default-features = false } tokio = { version = "1", default-features = false } diff --git a/src/infra/datasets/inmem/src/repos/inmem_dateset_entry_repository.rs 
b/src/infra/datasets/inmem/src/repos/inmem_dateset_entry_repository.rs index 0d5eb70361..a0059d11f3 100644 --- a/src/infra/datasets/inmem/src/repos/inmem_dateset_entry_repository.rs +++ b/src/infra/datasets/inmem/src/repos/inmem_dateset_entry_repository.rs @@ -7,38 +7,30 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use std::collections::HashMap; -use std::sync::Arc; - -use dill::{component, interface, scope, Singleton}; -use kamu_datasets::{ - DatasetEntry, - DatasetEntryByNameNotFoundError, - DatasetEntryNameCollisionError, - DatasetEntryNotFoundError, - DatasetEntryRepository, - DeleteEntryDatasetError, - GetDatasetEntriesByOwnerIdError, - GetDatasetEntryByNameError, - GetDatasetEntryError, - SaveDatasetEntryError, - SaveDatasetEntryErrorDuplicate, - UpdateDatasetEntryNameError, -}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::sync::{Arc, Mutex}; + +use database_common::PaginationOpts; +use dill::*; +use internal_error::InternalError; +use kamu_datasets::*; use opendatafabric::{AccountID, DatasetID, DatasetName}; -use tokio::sync::RwLock; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[derive(Default)] struct State { rows: HashMap, + rows_by_name: BTreeMap, + rows_by_owner: HashMap>, } impl State { fn new() -> Self { Self { rows: HashMap::new(), + rows_by_name: BTreeMap::new(), + rows_by_owner: HashMap::new(), } } } @@ -46,7 +38,7 @@ impl State { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct InMemoryDatasetEntryRepository { - state: Arc>, + state: Arc>, } #[component(pub)] @@ -55,7 +47,7 @@ pub struct InMemoryDatasetEntryRepository { impl InMemoryDatasetEntryRepository { pub fn new() -> Self { Self { - state: Arc::new(RwLock::new(State::new())), + state: Arc::new(Mutex::new(State::new())), } } } @@ -64,19 +56,41 @@ impl InMemoryDatasetEntryRepository { #[async_trait::async_trait] impl DatasetEntryRepository for InMemoryDatasetEntryRepository { - async fn dataset_entries_count(&self) -> Result { - let readable_state = self.state.read().await; + async fn dataset_entries_count(&self) -> Result { + let readable_state = self.state.lock().unwrap(); + Ok(readable_state.rows.len()) + } - let dataset_entries_count = readable_state.rows.len(); + async fn dataset_entries_count_by_owner_id( + &self, + owner_id: &AccountID, + ) -> Result { + let readable_state = self.state.lock().unwrap(); + let owner_entires = readable_state.rows_by_owner.get(owner_id); + Ok(owner_entires.map_or(0, BTreeSet::len)) + } + + fn get_dataset_entries(&self, pagination: PaginationOpts) -> DatasetEntryStream { + let dataset_entries_page: Vec<_> = { + let readable_state = self.state.lock().unwrap(); + readable_state + .rows_by_name + .values() + .skip(pagination.offset) + .take(pagination.limit) + .cloned() + .map(Ok) + .collect() + }; - Ok(dataset_entries_count) + Box::pin(futures::stream::iter(dataset_entries_page)) } async fn get_dataset_entry( &self, dataset_id: &DatasetID, ) -> Result { - let readable_state = self.state.read().await; + let readable_state = self.state.lock().unwrap(); let maybe_dataset_entry = readable_state.rows.get(dataset_id); @@ -87,12 +101,32 @@ impl DatasetEntryRepository for InMemoryDatasetEntryRepository { Ok(dataset_entry.clone()) } - async fn get_dataset_entry_by_name( + async fn get_multiple_dataset_entries( + &self, + dataset_ids: &[DatasetID], + ) 
-> Result { + let readable_state = self.state.lock().unwrap(); + + let mut resolution = DatasetEntriesResolution::default(); + + for dataset_id in dataset_ids { + let maybe_dataset_entry = readable_state.rows.get(dataset_id); + if let Some(dataset_entry) = maybe_dataset_entry { + resolution.resolved_entries.push(dataset_entry.clone()); + } else { + resolution.unresolved_entries.push(dataset_id.clone()); + } + } + + Ok(resolution) + } + + async fn get_dataset_entry_by_owner_and_name( &self, owner_id: &AccountID, name: &DatasetName, ) -> Result { - let readable_state = self.state.read().await; + let readable_state = self.state.lock().unwrap(); let maybe_dataset_entry = readable_state .rows @@ -108,31 +142,35 @@ impl DatasetEntryRepository for InMemoryDatasetEntryRepository { Ok(dataset_entry.clone()) } - async fn get_dataset_entries_by_owner_id( + fn get_dataset_entries_by_owner_id( &self, owner_id: &AccountID, - ) -> Result, GetDatasetEntriesByOwnerIdError> { - let readable_state = self.state.read().await; - - let dataset_entries = readable_state - .rows - .values() - .fold(vec![], |mut acc, dataset| { - if dataset.owner_id == *owner_id { - acc.push(dataset.clone()); - } - - acc - }); + pagination: PaginationOpts, + ) -> DatasetEntryStream<'_> { + let dataset_entries_page: Vec<_> = { + let readable_state = self.state.lock().unwrap(); + if let Some(dataset_ids) = readable_state.rows_by_owner.get(owner_id) { + dataset_ids + .iter() + .skip(pagination.offset) + .take(pagination.limit) + .map(|dataset_id| readable_state.rows.get(dataset_id).unwrap()) + .cloned() + .map(Ok) + .collect() + } else { + vec![] + } + }; - Ok(dataset_entries) + Box::pin(futures::stream::iter(dataset_entries_page)) } async fn save_dataset_entry( &self, dataset_entry: &DatasetEntry, ) -> Result<(), SaveDatasetEntryError> { - let mut writable_state = self.state.write().await; + let mut writable_state = self.state.lock().unwrap(); for row in writable_state.rows.values() { if row.id == dataset_entry.id { @@ -148,6 +186,18 @@ impl DatasetEntryRepository for InMemoryDatasetEntryRepository { .rows .insert(dataset_entry.id.clone(), dataset_entry.clone()); + writable_state + .rows_by_name + .insert(dataset_entry.name.clone(), dataset_entry.clone()); + + writable_state + .rows_by_owner + .entry(dataset_entry.owner_id.clone()) + .and_modify(|owner_dataset_ids| { + owner_dataset_ids.insert(dataset_entry.id.clone()); + }) + .or_insert_with(|| BTreeSet::from_iter([dataset_entry.id.clone()])); + Ok(()) } @@ -156,7 +206,7 @@ impl DatasetEntryRepository for InMemoryDatasetEntryRepository { dataset_id: &DatasetID, new_name: &DatasetName, ) -> Result<(), UpdateDatasetEntryNameError> { - let mut writable_state = self.state.write().await; + let mut writable_state = self.state.lock().unwrap(); let maybe_dataset_entry = writable_state.rows.get(dataset_id); @@ -176,9 +226,20 @@ impl DatasetEntryRepository for InMemoryDatasetEntryRepository { // To avoid frustrating the borrow checker, we have to do a second look-up. // Safety: We're already guaranteed that the entry will be present. 
- let found_dataset_entry = writable_state.rows.get_mut(dataset_id).unwrap(); + let old_name = { + let found_dataset_entry = writable_state.rows.get_mut(dataset_id).unwrap(); + let old_name = found_dataset_entry.name.clone(); + found_dataset_entry.name = new_name.clone(); + old_name + }; - found_dataset_entry.name = new_name.clone(); + // Mirror the change in named collection + let mut entry = writable_state + .rows_by_name + .remove(&old_name) + .expect("named record must be present"); + entry.name = new_name.clone(); + writable_state.rows_by_name.insert(new_name.clone(), entry); Ok(()) } @@ -187,11 +248,17 @@ impl DatasetEntryRepository for InMemoryDatasetEntryRepository { &self, dataset_id: &DatasetID, ) -> Result<(), DeleteEntryDatasetError> { - let mut writable_state = self.state.write().await; - - let not_found = writable_state.rows.remove(dataset_id).is_none(); - - if not_found { + let mut writable_state = self.state.lock().unwrap(); + + let maybe_removed_entry = writable_state.rows.remove(dataset_id); + if let Some(removed_entry) = maybe_removed_entry { + writable_state.rows_by_name.remove(&removed_entry.name); + writable_state + .rows_by_owner + .get_mut(&removed_entry.owner_id) + .unwrap() + .remove(&removed_entry.id); + } else { return Err(DatasetEntryNotFoundError::new(dataset_id.clone()).into()); } diff --git a/src/infra/datasets/inmem/tests/repos/test_inmem_dataset_entry_repository.rs b/src/infra/datasets/inmem/tests/repos/test_inmem_dataset_entry_repository.rs index c183e5e353..d040e1eff2 100644 --- a/src/infra/datasets/inmem/tests/repos/test_inmem_dataset_entry_repository.rs +++ b/src/infra/datasets/inmem/tests/repos/test_inmem_dataset_entry_repository.rs @@ -23,6 +23,22 @@ database_transactional_test!( //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +database_transactional_test!( + storage = inmem, + fixture = dataset_entry_repo::test_stream_many_entries, + harness = InMemoryDatasetEntryRepositoryHarness +); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +database_transactional_test!( + storage = inmem, + fixture = dataset_entry_repo::test_get_multiple_entries, + harness = InMemoryDatasetEntryRepositoryHarness +); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + database_transactional_test!( storage = inmem, fixture = dataset_entry_repo::test_get_dataset_entry_by_name, diff --git a/src/infra/datasets/postgres/.sqlx/query-13fe35a7997b790566736b78e16c17cd7452d48887938a2a28cbd9a1408472e2.json b/src/infra/datasets/postgres/.sqlx/query-13fe35a7997b790566736b78e16c17cd7452d48887938a2a28cbd9a1408472e2.json new file mode 100644 index 0000000000..128220e4a5 --- /dev/null +++ b/src/infra/datasets/postgres/.sqlx/query-13fe35a7997b790566736b78e16c17cd7452d48887938a2a28cbd9a1408472e2.json @@ -0,0 +1,41 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT\n dataset_id as \"id: _\",\n owner_id as \"owner_id: _\",\n dataset_name as name,\n created_at as \"created_at: _\"\n FROM dataset_entries\n ORDER BY dataset_name ASC\n LIMIT $1 OFFSET $2\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id: _", + "type_info": "Varchar" + }, + { + "ordinal": 1, + "name": "owner_id: _", + "type_info": "Varchar" + }, + { + "ordinal": 2, + "name": "name", + "type_info": "Varchar" + }, + { + "ordinal": 3, + "name": "created_at: _", + "type_info": "Timestamptz" + 
} + ], + "parameters": { + "Left": [ + "Int8", + "Int8" + ] + }, + "nullable": [ + false, + false, + false, + false + ] + }, + "hash": "13fe35a7997b790566736b78e16c17cd7452d48887938a2a28cbd9a1408472e2" +} diff --git a/src/infra/datasets/postgres/.sqlx/query-a0155b21b942423c8590308e767ef41a2d958ad8296b835f7fb9d8b87ec5e4f9.json b/src/infra/datasets/postgres/.sqlx/query-2bcdb350c9c397529fafa84a0b575eca95214025291d1bd310c3900040a3c9c8.json similarity index 83% rename from src/infra/datasets/postgres/.sqlx/query-a0155b21b942423c8590308e767ef41a2d958ad8296b835f7fb9d8b87ec5e4f9.json rename to src/infra/datasets/postgres/.sqlx/query-2bcdb350c9c397529fafa84a0b575eca95214025291d1bd310c3900040a3c9c8.json index a47f416089..f9498617cf 100644 --- a/src/infra/datasets/postgres/.sqlx/query-a0155b21b942423c8590308e767ef41a2d958ad8296b835f7fb9d8b87ec5e4f9.json +++ b/src/infra/datasets/postgres/.sqlx/query-2bcdb350c9c397529fafa84a0b575eca95214025291d1bd310c3900040a3c9c8.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "\n SELECT dataset_id as \"id: _\",\n owner_id as \"owner_id: _\",\n dataset_name as name,\n created_at as \"created_at: _\"\n FROM dataset_entries\n WHERE owner_id = $1\n ", + "query": "\n SELECT dataset_id as \"id: _\",\n owner_id as \"owner_id: _\",\n dataset_name as name,\n created_at as \"created_at: _\"\n FROM dataset_entries\n WHERE dataset_id = ANY($1)\n ORDER BY dataset_id\n ", "describe": { "columns": [ { @@ -26,7 +26,7 @@ ], "parameters": { "Left": [ - "Text" + "TextArray" ] }, "nullable": [ @@ -36,5 +36,5 @@ false ] }, - "hash": "a0155b21b942423c8590308e767ef41a2d958ad8296b835f7fb9d8b87ec5e4f9" + "hash": "2bcdb350c9c397529fafa84a0b575eca95214025291d1bd310c3900040a3c9c8" } diff --git a/src/infra/datasets/postgres/.sqlx/query-a35cae0015f9dd08f3095ee317c568af9d34f9614bb06303f05fc42601a07523.json b/src/infra/datasets/postgres/.sqlx/query-a35cae0015f9dd08f3095ee317c568af9d34f9614bb06303f05fc42601a07523.json new file mode 100644 index 0000000000..44c9bb3e13 --- /dev/null +++ b/src/infra/datasets/postgres/.sqlx/query-a35cae0015f9dd08f3095ee317c568af9d34f9614bb06303f05fc42601a07523.json @@ -0,0 +1,22 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT COUNT(*)\n FROM dataset_entries\n WHERE owner_id = $1\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "count", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [ + "Text" + ] + }, + "nullable": [ + null + ] + }, + "hash": "a35cae0015f9dd08f3095ee317c568af9d34f9614bb06303f05fc42601a07523" +} diff --git a/src/infra/datasets/postgres/.sqlx/query-b7aa1b2d72f9ec6955f8370e0dc3113608f5ea1bdcf8319aa8e00a97e55269ed.json b/src/infra/datasets/postgres/.sqlx/query-b7aa1b2d72f9ec6955f8370e0dc3113608f5ea1bdcf8319aa8e00a97e55269ed.json new file mode 100644 index 0000000000..0cf71fabd0 --- /dev/null +++ b/src/infra/datasets/postgres/.sqlx/query-b7aa1b2d72f9ec6955f8370e0dc3113608f5ea1bdcf8319aa8e00a97e55269ed.json @@ -0,0 +1,42 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT dataset_id as \"id: _\",\n owner_id as \"owner_id: _\",\n dataset_name as name,\n created_at as \"created_at: _\"\n FROM dataset_entries\n WHERE owner_id = $1\n LIMIT $2 OFFSET $3\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id: _", + "type_info": "Varchar" + }, + { + "ordinal": 1, + "name": "owner_id: _", + "type_info": "Varchar" + }, + { + "ordinal": 2, + "name": "name", + "type_info": "Varchar" + }, + { + "ordinal": 3, + "name": "created_at: _", + "type_info": "Timestamptz" + } + ], + "parameters": { + "Left": [ + 
"Text", + "Int8", + "Int8" + ] + }, + "nullable": [ + false, + false, + false, + false + ] + }, + "hash": "b7aa1b2d72f9ec6955f8370e0dc3113608f5ea1bdcf8319aa8e00a97e55269ed" +} diff --git a/src/infra/datasets/postgres/.sqlx/query-7954a6acf1cdb627dfe2890b042679ef9e3886268865cce559cf2268c66ea800.json b/src/infra/datasets/postgres/.sqlx/query-fcb34f3fa8f59b1f8190694fc38dc66874757b9f56f23ed86f8494c6ed4b0b7a.json similarity index 85% rename from src/infra/datasets/postgres/.sqlx/query-7954a6acf1cdb627dfe2890b042679ef9e3886268865cce559cf2268c66ea800.json rename to src/infra/datasets/postgres/.sqlx/query-fcb34f3fa8f59b1f8190694fc38dc66874757b9f56f23ed86f8494c6ed4b0b7a.json index 66fcff1853..1d1911431c 100644 --- a/src/infra/datasets/postgres/.sqlx/query-7954a6acf1cdb627dfe2890b042679ef9e3886268865cce559cf2268c66ea800.json +++ b/src/infra/datasets/postgres/.sqlx/query-fcb34f3fa8f59b1f8190694fc38dc66874757b9f56f23ed86f8494c6ed4b0b7a.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "\n SELECT dataset_id as \"id: _\",\n owner_id as \"owner_id: _\",\n dataset_name as name,\n created_at as \"created_at: _\"\n FROM dataset_entries\n WHERE dataset_id = $1\n ", + "query": "\n SELECT dataset_id as \"id: _\",\n owner_id as \"owner_id: _\",\n dataset_name as name,\n created_at as \"created_at: _\"\n FROM dataset_entries\n WHERE dataset_id = $1\n ORDER BY created_at\n ", "describe": { "columns": [ { @@ -36,5 +36,5 @@ false ] }, - "hash": "7954a6acf1cdb627dfe2890b042679ef9e3886268865cce559cf2268c66ea800" + "hash": "fcb34f3fa8f59b1f8190694fc38dc66874757b9f56f23ed86f8494c6ed4b0b7a" } diff --git a/src/infra/datasets/postgres/Cargo.toml b/src/infra/datasets/postgres/Cargo.toml index d468bab9b3..6c2299ff8e 100644 --- a/src/infra/datasets/postgres/Cargo.toml +++ b/src/infra/datasets/postgres/Cargo.toml @@ -27,9 +27,11 @@ kamu-datasets = { workspace = true, features = ["sqlx"] } internal-error = { workspace = true } opendatafabric = { workspace = true, features = ["sqlx-postgres"] } +async-stream = "0.3" async-trait = { version = "0.1", default-features = false } chrono = { version = "0.4", default-features = false } dill = "0.9" +futures = "0.3" secrecy = "0.10" sqlx = { version = "0.8", default-features = false, features = [ "runtime-tokio-rustls", diff --git a/src/infra/datasets/postgres/src/repos/postgres_dataset_entry_repository.rs b/src/infra/datasets/postgres/src/repos/postgres_dataset_entry_repository.rs index fa9a76a6f5..e779999804 100644 --- a/src/infra/datasets/postgres/src/repos/postgres_dataset_entry_repository.rs +++ b/src/infra/datasets/postgres/src/repos/postgres_dataset_entry_repository.rs @@ -7,9 +7,11 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
-use database_common::{TransactionRef, TransactionRefT}; +use std::collections::HashSet; + +use database_common::{PaginationOpts, TransactionRef, TransactionRefT}; use dill::{component, interface}; -use internal_error::{ErrorIntoInternal, ResultIntoInternal}; +use internal_error::{ErrorIntoInternal, InternalError, ResultIntoInternal}; use kamu_datasets::*; use opendatafabric::{AccountID, DatasetID, DatasetName}; @@ -33,7 +35,7 @@ impl PostgresDatasetEntryRepository { #[async_trait::async_trait] impl DatasetEntryRepository for PostgresDatasetEntryRepository { - async fn dataset_entries_count(&self) -> Result { + async fn dataset_entries_count(&self) -> Result { let mut tr = self.transaction.lock().await; let connection_mut = tr.connection_mut().await?; @@ -51,6 +53,64 @@ impl DatasetEntryRepository for PostgresDatasetEntryRepository { Ok(usize::try_from(dataset_entries_count.unwrap_or(0)).unwrap()) } + async fn dataset_entries_count_by_owner_id( + &self, + owner_id: &AccountID, + ) -> Result { + let stack_owner_id = owner_id.as_did_str().to_stack_string(); + + let mut tr = self.transaction.lock().await; + + let connection_mut = tr.connection_mut().await?; + + let dataset_entries_count = sqlx::query_scalar!( + r#" + SELECT COUNT(*) + FROM dataset_entries + WHERE owner_id = $1 + "#, + stack_owner_id.as_str() + ) + .fetch_one(connection_mut) + .await + .int_err()?; + + Ok(usize::try_from(dataset_entries_count.unwrap_or(0)).unwrap()) + } + + fn get_dataset_entries(&self, pagination: PaginationOpts) -> DatasetEntryStream { + Box::pin(async_stream::stream! { + let mut tr = self.transaction.lock().await; + let connection_mut = tr.connection_mut().await?; + + let limit = i64::try_from(pagination.limit).int_err()?; + let offset = i64::try_from(pagination.offset).int_err()?; + + let mut query_stream = sqlx::query_as!( + DatasetEntryRowModel, + r#" + SELECT + dataset_id as "id: _", + owner_id as "owner_id: _", + dataset_name as name, + created_at as "created_at: _" + FROM dataset_entries + ORDER BY dataset_name ASC + LIMIT $1 OFFSET $2 + "#, + limit, + offset, + ) + .fetch(connection_mut) + .map_err(ErrorIntoInternal::int_err); + + use futures::TryStreamExt; + while let Some(entry) = query_stream.try_next().await? 
{ + yield Ok(entry.into()); + } + }) + } + async fn get_dataset_entry( &self, dataset_id: &DatasetID, @@ -70,6 +130,7 @@ impl DatasetEntryRepository for PostgresDatasetEntryRepository { created_at as "created_at: _" FROM dataset_entries WHERE dataset_id = $1 + ORDER BY created_at "#, stack_dataset_id.as_str(), ) @@ -84,18 +145,20 @@ impl DatasetEntryRepository for PostgresDatasetEntryRepository { } } - async fn get_dataset_entry_by_name( + async fn get_multiple_dataset_entries( &self, - owner_id: &AccountID, - name: &DatasetName, - ) -> Result { + dataset_ids: &[DatasetID], + ) -> Result { let mut tr = self.transaction.lock().await; let connection_mut = tr.connection_mut().await?; - let stack_owner_id = owner_id.as_did_str().to_stack_string(); + let dataset_ids_search: Vec<_> = dataset_ids + .iter() + .map(|dataset_id| dataset_id.as_did_str().to_string()) + .collect(); - let maybe_dataset_entry_row = sqlx::query_as!( + let resolved_entries = sqlx::query_as!( DatasetEntryRowModel, r#" SELECT dataset_id as "id: _", @@ -103,34 +166,46 @@ impl DatasetEntryRepository for PostgresDatasetEntryRepository { dataset_name as name, created_at as "created_at: _" FROM dataset_entries - WHERE owner_id = $1 - AND dataset_name = $2 + WHERE dataset_id = ANY($1) + ORDER BY dataset_id "#, - stack_owner_id.as_str(), - name.as_str() + &dataset_ids_search, ) - .fetch_optional(connection_mut) + .map(Into::into) + .fetch_all(connection_mut) .await .int_err()?; - if let Some(dataset_entry_row) = maybe_dataset_entry_row { - Ok(dataset_entry_row.into()) - } else { - Err(DatasetEntryByNameNotFoundError::new(owner_id.clone(), name.clone()).into()) - } + let resolved_dataset_ids: HashSet<_> = resolved_entries + .iter() + .map(|entry: &DatasetEntry| &entry.id) + .cloned() + .collect(); + + let unresolved_entries = dataset_ids + .iter() + .filter(|id| !resolved_dataset_ids.contains(id)) + .cloned() + .collect(); + + Ok(DatasetEntriesResolution { + resolved_entries, + unresolved_entries, + }) } - async fn get_dataset_entries_by_owner_id( + async fn get_dataset_entry_by_owner_and_name( &self, owner_id: &AccountID, - ) -> Result, GetDatasetEntriesByOwnerIdError> { + name: &DatasetName, + ) -> Result { let mut tr = self.transaction.lock().await; let connection_mut = tr.connection_mut().await?; let stack_owner_id = owner_id.as_did_str().to_stack_string(); - let dataset_entry_rows = sqlx::query_as!( + let maybe_dataset_entry_row = sqlx::query_as!( DatasetEntryRowModel, r#" SELECT dataset_id as "id: _", @@ -139,14 +214,57 @@ impl DatasetEntryRepository for PostgresDatasetEntryRepository { created_at as "created_at: _" FROM dataset_entries WHERE owner_id = $1 + AND dataset_name = $2 "#, stack_owner_id.as_str(), + name.as_str() ) - .fetch_all(connection_mut) + .fetch_optional(connection_mut) .await .int_err()?; - Ok(dataset_entry_rows.into_iter().map(Into::into).collect()) + if let Some(dataset_entry_row) = maybe_dataset_entry_row { + Ok(dataset_entry_row.into()) + } else { + Err(DatasetEntryByNameNotFoundError::new(owner_id.clone(), name.clone()).into()) + } + } + + fn get_dataset_entries_by_owner_id( + &self, + owner_id: &AccountID, + pagination: PaginationOpts, + ) -> DatasetEntryStream<'_> { + let stack_owner_id = owner_id.as_did_str().to_stack_string(); + + Box::pin(async_stream::stream! 
{ + let mut tr = self.transaction.lock().await; + + let connection_mut = tr.connection_mut().await?; + + let mut query_stream = sqlx::query_as!( + DatasetEntryRowModel, + r#" + SELECT dataset_id as "id: _", + owner_id as "owner_id: _", + dataset_name as name, + created_at as "created_at: _" + FROM dataset_entries + WHERE owner_id = $1 + LIMIT $2 OFFSET $3 + "#, + stack_owner_id.as_str(), + i64::try_from(pagination.limit).unwrap(), + i64::try_from(pagination.offset).unwrap(), + ) + .fetch(connection_mut); + + use futures::TryStreamExt; + while let Some(row) = query_stream.try_next().await.int_err()? { + yield Ok(row.into()); + } + + }) } async fn save_dataset_entry( diff --git a/src/infra/datasets/postgres/tests/repos/test_postgres_dataset_entry_repository.rs b/src/infra/datasets/postgres/tests/repos/test_postgres_dataset_entry_repository.rs index 4147088a17..7759310ab1 100644 --- a/src/infra/datasets/postgres/tests/repos/test_postgres_dataset_entry_repository.rs +++ b/src/infra/datasets/postgres/tests/repos/test_postgres_dataset_entry_repository.rs @@ -25,6 +25,22 @@ database_transactional_test!( //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +database_transactional_test!( + storage = postgres, + fixture = dataset_entry_repo::test_stream_many_entries, + harness = PostgresDatasetEntryRepositoryHarness +); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +database_transactional_test!( + storage = postgres, + fixture = dataset_entry_repo::test_get_multiple_entries, + harness = PostgresDatasetEntryRepositoryHarness +); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + database_transactional_test!( storage = postgres, fixture = dataset_entry_repo::test_get_dataset_entry_by_name, diff --git a/src/infra/datasets/repo-tests/Cargo.toml b/src/infra/datasets/repo-tests/Cargo.toml index 3379414592..99a46d9f9e 100644 --- a/src/infra/datasets/repo-tests/Cargo.toml +++ b/src/infra/datasets/repo-tests/Cargo.toml @@ -29,5 +29,6 @@ opendatafabric = { workspace = true } chrono = { version = "0.4", default-features = false } dill = "0.9" +futures = "0.3" secrecy = "0.10" uuid = "1" diff --git a/src/infra/datasets/repo-tests/src/dataset_entry_repository_test_suite.rs b/src/infra/datasets/repo-tests/src/dataset_entry_repository_test_suite.rs index 32d8849d51..bef7e14acc 100644 --- a/src/infra/datasets/repo-tests/src/dataset_entry_repository_test_suite.rs +++ b/src/infra/datasets/repo-tests/src/dataset_entry_repository_test_suite.rs @@ -11,9 +11,11 @@ use std::assert_matches::assert_matches; use std::sync::Arc; use chrono::{SubsecRound, Utc}; +use database_common::PaginationOpts; use dill::Catalog; use kamu_accounts::{Account, AccountRepository, AccountType}; use kamu_datasets::{ + DatasetEntriesResolution, DatasetEntry, DatasetEntryByNameNotFoundError, DatasetEntryNotFoundError, @@ -76,6 +78,190 @@ pub async fn test_get_dataset_entry(catalog: &Catalog) { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +pub async fn test_stream_many_entries(catalog: &Catalog) { + let account_repo = catalog.get_one::().unwrap(); + let dataset_entry_repo = catalog.get_one::().unwrap(); + + use futures::TryStreamExt; + + { + let get_res: Result, _> = dataset_entry_repo + .get_dataset_entries(PaginationOpts { + limit: 100, + offset: 0, + 
}) + .try_collect() + .await; + let expected_dataset_entries = vec![]; + + assert_matches!( + get_res, + Ok(actual_dataset_entries) + if actual_dataset_entries == expected_dataset_entries + ); + } + + let account_1 = new_account_with_name(&account_repo, "user1").await; + let account_2 = new_account_with_name(&account_repo, "user2").await; + + let dataset_entry_acc_1_1 = new_dataset_entry_with(&account_1, "dataset1"); + let dataset_entry_acc_1_2 = new_dataset_entry_with(&account_1, "dataset2"); + let dataset_entry_acc_2_3 = new_dataset_entry_with(&account_2, "dataset3"); + + { + let save_res = dataset_entry_repo + .save_dataset_entry(&dataset_entry_acc_1_1) + .await; + + assert_matches!(save_res, Ok(_)); + } + { + let save_res = dataset_entry_repo + .save_dataset_entry(&dataset_entry_acc_1_2) + .await; + + assert_matches!(save_res, Ok(_)); + } + { + let save_res = dataset_entry_repo + .save_dataset_entry(&dataset_entry_acc_2_3) + .await; + + assert_matches!(save_res, Ok(_)); + } + + { + let get_res: Result, _> = dataset_entry_repo + .get_dataset_entries(PaginationOpts { + limit: 100, + offset: 0, + }) + .try_collect() + .await; + let expected_dataset_entries = vec![ + dataset_entry_acc_1_1, + dataset_entry_acc_1_2, + dataset_entry_acc_2_3, + ]; + + assert_matches!( + get_res, + Ok(actual_dataset_entries) + if actual_dataset_entries == expected_dataset_entries + ); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pub async fn test_get_multiple_entries(catalog: &Catalog) { + let account_repo = catalog.get_one::().unwrap(); + let dataset_entry_repo = catalog.get_one::().unwrap(); + + { + let get_multiple_res = dataset_entry_repo + .get_multiple_dataset_entries(&[]) + .await + .unwrap(); + + assert_eq!( + get_multiple_res, + DatasetEntriesResolution { + resolved_entries: vec![], + unresolved_entries: vec![] + } + ); + } + + let account = new_account(&account_repo).await; + + let dataset_entry_acc_1 = new_dataset_entry_with(&account, "dataset1"); + let dataset_entry_acc_2 = new_dataset_entry_with(&account, "dataset2"); + let dataset_entry_acc_3 = new_dataset_entry_with(&account, "dataset3"); + + { + let save_res = dataset_entry_repo + .save_dataset_entry(&dataset_entry_acc_1) + .await; + + assert_matches!(save_res, Ok(_)); + } + { + let save_res = dataset_entry_repo + .save_dataset_entry(&dataset_entry_acc_2) + .await; + + assert_matches!(save_res, Ok(_)); + } + { + let save_res = dataset_entry_repo + .save_dataset_entry(&dataset_entry_acc_3) + .await; + + assert_matches!(save_res, Ok(_)); + } + + { + let mut get_multiple_res = dataset_entry_repo + .get_multiple_dataset_entries(&[ + dataset_entry_acc_1.id.clone(), + dataset_entry_acc_3.id.clone(), + ]) + .await + .unwrap(); + + get_multiple_res.resolved_entries.sort(); + + let mut expected_resolved_entries = + vec![dataset_entry_acc_1.clone(), dataset_entry_acc_3.clone()]; + expected_resolved_entries.sort(); + + assert_eq!( + get_multiple_res, + DatasetEntriesResolution { + resolved_entries: expected_resolved_entries, + unresolved_entries: vec![] + } + ); + } + + { + let wrong_id = DatasetID::new_seeded_ed25519(b"wrong_id"); + let get_multiple_res = dataset_entry_repo + .get_multiple_dataset_entries(&[dataset_entry_acc_2.id.clone(), wrong_id.clone()]) + .await + .unwrap(); + + assert_eq!( + get_multiple_res, + DatasetEntriesResolution { + resolved_entries: vec![dataset_entry_acc_2.clone()], + unresolved_entries: vec![wrong_id] + } + ); + } + + { + let wrong_id_1 
= DatasetID::new_seeded_ed25519(b"wrong_id_1"); + let wrong_id_2 = DatasetID::new_seeded_ed25519(b"wrong_id_2"); + + let get_multiple_res = dataset_entry_repo + .get_multiple_dataset_entries(&[wrong_id_1.clone(), wrong_id_2.clone()]) + .await + .unwrap(); + + assert_eq!( + get_multiple_res, + DatasetEntriesResolution { + resolved_entries: vec![], + unresolved_entries: vec![wrong_id_1, wrong_id_2] + } + ); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + pub async fn test_get_dataset_entry_by_name(catalog: &Catalog) { let account_repo = catalog.get_one::().unwrap(); let dataset_entry_repo = catalog.get_one::().unwrap(); @@ -85,7 +271,7 @@ pub async fn test_get_dataset_entry_by_name(catalog: &Catalog) { let dataset_entry = new_dataset_entry(&account); { let get_res = dataset_entry_repo - .get_dataset_entry_by_name(&dataset_entry.owner_id, &dataset_entry.name) + .get_dataset_entry_by_owner_and_name(&dataset_entry.owner_id, &dataset_entry.name) .await; assert_matches!( @@ -105,7 +291,7 @@ pub async fn test_get_dataset_entry_by_name(catalog: &Catalog) { } { let get_res = dataset_entry_repo - .get_dataset_entry_by_name(&dataset_entry.owner_id, &dataset_entry.name) + .get_dataset_entry_by_owner_and_name(&dataset_entry.owner_id, &dataset_entry.name) .await; assert_matches!( @@ -125,9 +311,33 @@ pub async fn test_get_dataset_entries_by_owner_id(catalog: &Catalog) { let account_1 = new_account_with_name(&account_repo, "user1").await; let account_2 = new_account_with_name(&account_repo, "user2").await; + use futures::TryStreamExt; + { - let get_res = dataset_entry_repo - .get_dataset_entries_by_owner_id(&account_1.id) + assert_eq!( + dataset_entry_repo + .dataset_entries_count_by_owner_id(&account_1.id) + .await + .unwrap(), + 0, + ); + assert_eq!( + dataset_entry_repo + .dataset_entries_count_by_owner_id(&account_2.id) + .await + .unwrap(), + 0, + ); + + let get_res: Result, _> = dataset_entry_repo + .get_dataset_entries_by_owner_id( + &account_1.id, + PaginationOpts { + limit: 100, + offset: 0, + }, + ) + .try_collect() .await; let expected_dataset_entries = vec![]; @@ -138,8 +348,15 @@ pub async fn test_get_dataset_entries_by_owner_id(catalog: &Catalog) { ); } { - let get_res = dataset_entry_repo - .get_dataset_entries_by_owner_id(&account_2.id) + let get_res: Result, _> = dataset_entry_repo + .get_dataset_entries_by_owner_id( + &account_2.id, + PaginationOpts { + limit: 100, + offset: 0, + }, + ) + .try_collect() .await; let expected_dataset_entries = vec![]; @@ -175,8 +392,15 @@ pub async fn test_get_dataset_entries_by_owner_id(catalog: &Catalog) { assert_matches!(save_res, Ok(_)); } { - let get_res = dataset_entry_repo - .get_dataset_entries_by_owner_id(&account_1.id) + let get_res: Result, _> = dataset_entry_repo + .get_dataset_entries_by_owner_id( + &account_1.id, + PaginationOpts { + limit: 100, + offset: 0, + }, + ) + .try_collect() .await; let mut expected_dataset_entries = vec![dataset_entry_acc_1_1, dataset_entry_acc_1_2]; @@ -192,10 +416,25 @@ pub async fn test_get_dataset_entries_by_owner_id(catalog: &Catalog) { panic!("A successful result was expected, but an error was received: {e}"); } } + + assert_eq!( + dataset_entry_repo + .dataset_entries_count_by_owner_id(&account_1.id) + .await + .unwrap(), + 2, + ); } { - let get_res = dataset_entry_repo - .get_dataset_entries_by_owner_id(&account_2.id) + let get_res: Result, _> = dataset_entry_repo + .get_dataset_entries_by_owner_id( + &account_2.id, + 
PaginationOpts { + limit: 100, + offset: 0, + }, + ) + .try_collect() .await; let expected_dataset_entries = vec![dataset_entry_acc_2_3]; @@ -204,6 +443,14 @@ pub async fn test_get_dataset_entries_by_owner_id(catalog: &Catalog) { Ok(actual_dataset_entries) if actual_dataset_entries == expected_dataset_entries ); + + assert_eq!( + dataset_entry_repo + .dataset_entries_count_by_owner_id(&account_2.id) + .await + .unwrap(), + 1, + ); } { let count_res = dataset_entry_repo.dataset_entries_count().await; diff --git a/src/infra/datasets/sqlite/.sqlx/query-13fe35a7997b790566736b78e16c17cd7452d48887938a2a28cbd9a1408472e2.json b/src/infra/datasets/sqlite/.sqlx/query-13fe35a7997b790566736b78e16c17cd7452d48887938a2a28cbd9a1408472e2.json new file mode 100644 index 0000000000..be47e2fc9b --- /dev/null +++ b/src/infra/datasets/sqlite/.sqlx/query-13fe35a7997b790566736b78e16c17cd7452d48887938a2a28cbd9a1408472e2.json @@ -0,0 +1,38 @@ +{ + "db_name": "SQLite", + "query": "\n SELECT\n dataset_id as \"id: _\",\n owner_id as \"owner_id: _\",\n dataset_name as name,\n created_at as \"created_at: _\"\n FROM dataset_entries\n ORDER BY dataset_name ASC\n LIMIT $1 OFFSET $2\n ", + "describe": { + "columns": [ + { + "name": "id: _", + "ordinal": 0, + "type_info": "Text" + }, + { + "name": "owner_id: _", + "ordinal": 1, + "type_info": "Text" + }, + { + "name": "name", + "ordinal": 2, + "type_info": "Text" + }, + { + "name": "created_at: _", + "ordinal": 3, + "type_info": "Null" + } + ], + "parameters": { + "Right": 2 + }, + "nullable": [ + false, + false, + false, + false + ] + }, + "hash": "13fe35a7997b790566736b78e16c17cd7452d48887938a2a28cbd9a1408472e2" +} diff --git a/src/infra/datasets/sqlite/.sqlx/query-a35cae0015f9dd08f3095ee317c568af9d34f9614bb06303f05fc42601a07523.json b/src/infra/datasets/sqlite/.sqlx/query-a35cae0015f9dd08f3095ee317c568af9d34f9614bb06303f05fc42601a07523.json new file mode 100644 index 0000000000..09c4f9ee3b --- /dev/null +++ b/src/infra/datasets/sqlite/.sqlx/query-a35cae0015f9dd08f3095ee317c568af9d34f9614bb06303f05fc42601a07523.json @@ -0,0 +1,20 @@ +{ + "db_name": "SQLite", + "query": "\n SELECT COUNT(*)\n FROM dataset_entries\n WHERE owner_id = $1\n ", + "describe": { + "columns": [ + { + "name": "COUNT(*)", + "ordinal": 0, + "type_info": "Integer" + } + ], + "parameters": { + "Right": 1 + }, + "nullable": [ + false + ] + }, + "hash": "a35cae0015f9dd08f3095ee317c568af9d34f9614bb06303f05fc42601a07523" +} diff --git a/src/infra/datasets/sqlite/.sqlx/query-a0155b21b942423c8590308e767ef41a2d958ad8296b835f7fb9d8b87ec5e4f9.json b/src/infra/datasets/sqlite/.sqlx/query-b7aa1b2d72f9ec6955f8370e0dc3113608f5ea1bdcf8319aa8e00a97e55269ed.json similarity index 55% rename from src/infra/datasets/sqlite/.sqlx/query-a0155b21b942423c8590308e767ef41a2d958ad8296b835f7fb9d8b87ec5e4f9.json rename to src/infra/datasets/sqlite/.sqlx/query-b7aa1b2d72f9ec6955f8370e0dc3113608f5ea1bdcf8319aa8e00a97e55269ed.json index 2e1a07cee5..3da596a8c9 100644 --- a/src/infra/datasets/sqlite/.sqlx/query-a0155b21b942423c8590308e767ef41a2d958ad8296b835f7fb9d8b87ec5e4f9.json +++ b/src/infra/datasets/sqlite/.sqlx/query-b7aa1b2d72f9ec6955f8370e0dc3113608f5ea1bdcf8319aa8e00a97e55269ed.json @@ -1,6 +1,6 @@ { "db_name": "SQLite", - "query": "\n SELECT dataset_id as \"id: _\",\n owner_id as \"owner_id: _\",\n dataset_name as name,\n created_at as \"created_at: _\"\n FROM dataset_entries\n WHERE owner_id = $1\n ", + "query": "\n SELECT dataset_id as \"id: _\",\n owner_id as \"owner_id: _\",\n dataset_name as name,\n created_at as 
\"created_at: _\"\n FROM dataset_entries\n WHERE owner_id = $1\n LIMIT $2 OFFSET $3\n ", "describe": { "columns": [ { @@ -25,7 +25,7 @@ } ], "parameters": { - "Right": 1 + "Right": 3 }, "nullable": [ false, @@ -34,5 +34,5 @@ false ] }, - "hash": "a0155b21b942423c8590308e767ef41a2d958ad8296b835f7fb9d8b87ec5e4f9" + "hash": "b7aa1b2d72f9ec6955f8370e0dc3113608f5ea1bdcf8319aa8e00a97e55269ed" } diff --git a/src/infra/datasets/sqlite/Cargo.toml b/src/infra/datasets/sqlite/Cargo.toml index 0b8f63a40d..feb3be99fc 100644 --- a/src/infra/datasets/sqlite/Cargo.toml +++ b/src/infra/datasets/sqlite/Cargo.toml @@ -27,9 +27,11 @@ kamu-datasets = { workspace = true, features = ["sqlx"] } internal-error = { workspace = true } opendatafabric = { workspace = true, features = ["sqlx-sqlite"] } +async-stream = "0.3" async-trait = { version = "0.1", default-features = false } chrono = { version = "0.4", default-features = false } dill = "0.9" +futures = "0.3" secrecy = "0.10" sqlx = { version = "0.8", default-features = false, features = [ "runtime-tokio-rustls", diff --git a/src/infra/datasets/sqlite/src/repos/sqlite_dateset_entry_repository.rs b/src/infra/datasets/sqlite/src/repos/sqlite_dateset_entry_repository.rs index f81f568908..ff0a02ffc3 100644 --- a/src/infra/datasets/sqlite/src/repos/sqlite_dateset_entry_repository.rs +++ b/src/infra/datasets/sqlite/src/repos/sqlite_dateset_entry_repository.rs @@ -7,11 +7,14 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use database_common::{TransactionRef, TransactionRefT}; +use std::collections::HashSet; + +use database_common::{PaginationOpts, TransactionRef, TransactionRefT}; use dill::{component, interface}; -use internal_error::{ErrorIntoInternal, ResultIntoInternal}; +use internal_error::{ErrorIntoInternal, InternalError, ResultIntoInternal}; use kamu_datasets::*; use opendatafabric::{AccountID, DatasetID, DatasetName}; +use sqlx::Row; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -33,7 +36,7 @@ impl SqliteDatasetEntryRepository { #[async_trait::async_trait] impl DatasetEntryRepository for SqliteDatasetEntryRepository { - async fn dataset_entries_count(&self) -> Result { + async fn dataset_entries_count(&self) -> Result { let mut tr = self.transaction.lock().await; let connection_mut = tr.connection_mut().await?; @@ -51,6 +54,65 @@ impl DatasetEntryRepository for SqliteDatasetEntryRepository { Ok(usize::try_from(dataset_entries_count).unwrap_or(0)) } + async fn dataset_entries_count_by_owner_id( + &self, + owner_id: &AccountID, + ) -> Result { + let stack_owner_id = owner_id.as_did_str().to_stack_string(); + let owner_id_as_str = stack_owner_id.as_str(); + + let mut tr = self.transaction.lock().await; + + let connection_mut = tr.connection_mut().await?; + + let dataset_entries_count = sqlx::query_scalar!( + r#" + SELECT COUNT(*) + FROM dataset_entries + WHERE owner_id = $1 + "#, + owner_id_as_str + ) + .fetch_one(connection_mut) + .await + .int_err()?; + + Ok(usize::try_from(dataset_entries_count).unwrap()) + } + + fn get_dataset_entries(&self, pagination: PaginationOpts) -> DatasetEntryStream { + Box::pin(async_stream::stream! 
{ + let mut tr = self.transaction.lock().await; + let connection_mut = tr.connection_mut().await?; + + let limit = i64::try_from(pagination.limit).int_err()?; + let offset = i64::try_from(pagination.offset).int_err()?; + + let mut query_stream = sqlx::query_as!( + DatasetEntryRowModel, + r#" + SELECT + dataset_id as "id: _", + owner_id as "owner_id: _", + dataset_name as name, + created_at as "created_at: _" + FROM dataset_entries + ORDER BY dataset_name ASC + LIMIT $1 OFFSET $2 + "#, + limit, + offset, + ) + .fetch(connection_mut) + .map_err(ErrorIntoInternal::int_err); + + use futures::TryStreamExt; + while let Some(entry) = query_stream.try_next().await? { + yield Ok(entry.into()); + } + }) + } + async fn get_dataset_entry( &self, dataset_id: &DatasetID, @@ -85,7 +147,76 @@ impl DatasetEntryRepository for SqliteDatasetEntryRepository { } } - async fn get_dataset_entry_by_name( + async fn get_multiple_dataset_entries( + &self, + dataset_ids: &[DatasetID], + ) -> Result { + let mut tr = self.transaction.lock().await; + + let connection_mut = tr.connection_mut().await?; + + let placeholders = dataset_ids + .iter() + .map(|_| "?") + .collect::>() + .join(", "); + + let query_str = format!( + r#" + SELECT dataset_id as id, + owner_id, + dataset_name as name, + created_at + FROM dataset_entries + WHERE dataset_id IN ({placeholders}) + ORDER BY created_at + "#, + ); + + // ToDo replace it by macro once sqlx will support it + // https://github.com/launchbadge/sqlx/blob/main/FAQ.md#how-can-i-do-a-select--where-foo-in--query + let mut query = sqlx::query(&query_str); + for dataset_id in dataset_ids { + query = query.bind(dataset_id.to_string()); + } + + let dataset_rows = query + .fetch_all(connection_mut) + .await + .int_err() + .map_err(GetMultipleDatasetEntriesError::Internal)?; + + let resolved_entries: Vec<_> = dataset_rows + .into_iter() + .map(|row| { + DatasetEntry::new( + row.get_unchecked("id"), + row.get_unchecked("owner_id"), + DatasetName::new_unchecked(&row.get::("name")), + row.get_unchecked("created_at"), + ) + }) + .collect(); + + let resolved_dataset_ids: HashSet<_> = resolved_entries + .iter() + .map(|entry: &DatasetEntry| &entry.id) + .cloned() + .collect(); + + let unresolved_entries = dataset_ids + .iter() + .filter(|id| !resolved_dataset_ids.contains(id)) + .cloned() + .collect(); + + Ok(DatasetEntriesResolution { + resolved_entries, + unresolved_entries, + }) + } + + async fn get_dataset_entry_by_owner_and_name( &self, owner_id: &AccountID, name: &DatasetName, @@ -123,34 +254,45 @@ impl DatasetEntryRepository for SqliteDatasetEntryRepository { } } - async fn get_dataset_entries_by_owner_id( + fn get_dataset_entries_by_owner_id( &self, owner_id: &AccountID, - ) -> Result, GetDatasetEntriesByOwnerIdError> { - let mut tr = self.transaction.lock().await; - - let connection_mut = tr.connection_mut().await?; - + pagination: PaginationOpts, + ) -> DatasetEntryStream<'_> { let stack_owner_id = owner_id.as_did_str().to_stack_string(); - let owner_id_as_str = stack_owner_id.as_str(); - let dataset_entry_rows = sqlx::query_as!( - DatasetEntryRowModel, - r#" - SELECT dataset_id as "id: _", - owner_id as "owner_id: _", - dataset_name as name, - created_at as "created_at: _" - FROM dataset_entries - WHERE owner_id = $1 - "#, - owner_id_as_str, - ) - .fetch_all(connection_mut) - .await - .int_err()?; - - Ok(dataset_entry_rows.into_iter().map(Into::into).collect()) + let limit = i64::try_from(pagination.limit).unwrap(); + let offset = i64::try_from(pagination.offset).unwrap(); + + 
Box::pin(async_stream::stream! { + let mut tr = self.transaction.lock().await; + + let connection_mut = tr.connection_mut().await?; + + let owner_id_as_str = stack_owner_id.as_str(); + + let mut query_stream = sqlx::query_as!( + DatasetEntryRowModel, + r#" + SELECT dataset_id as "id: _", + owner_id as "owner_id: _", + dataset_name as name, + created_at as "created_at: _" + FROM dataset_entries + WHERE owner_id = $1 + LIMIT $2 OFFSET $3 + "#, + owner_id_as_str, + limit, + offset + ) + .fetch(connection_mut); + + use futures::TryStreamExt; + while let Some(row) = query_stream.try_next().await.int_err()? { + yield Ok(row.into()); + } + }) } async fn save_dataset_entry( diff --git a/src/infra/datasets/sqlite/tests/repos/test_sqlite_dataset_entry_repository.rs b/src/infra/datasets/sqlite/tests/repos/test_sqlite_dataset_entry_repository.rs index f6681d6af2..d639e28658 100644 --- a/src/infra/datasets/sqlite/tests/repos/test_sqlite_dataset_entry_repository.rs +++ b/src/infra/datasets/sqlite/tests/repos/test_sqlite_dataset_entry_repository.rs @@ -25,6 +25,22 @@ database_transactional_test!( //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +database_transactional_test!( + storage = sqlite, + fixture = dataset_entry_repo::test_stream_many_entries, + harness = SqliteDatasetEntryRepositoryHarness +); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +database_transactional_test!( + storage = sqlite, + fixture = dataset_entry_repo::test_get_multiple_entries, + harness = SqliteDatasetEntryRepositoryHarness +); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + database_transactional_test!( storage = sqlite, fixture = dataset_entry_repo::test_get_dataset_entry_by_name, diff --git a/src/infra/ingest-datafusion/benches/cdc_project.rs b/src/infra/ingest-datafusion/benches/cdc_project.rs index 2af36d32a0..c373e5b4e6 100644 --- a/src/infra/ingest-datafusion/benches/cdc_project.rs +++ b/src/infra/ingest-datafusion/benches/cdc_project.rs @@ -23,7 +23,9 @@ async fn setup(tempdir: &Path, num_rows: usize) -> String { let ctx = SessionContext::new(); - let path = tempdir.join("data").to_str().unwrap().to_string(); + let data_path = tempdir.join("data"); + std::fs::create_dir(&data_path).unwrap(); + let path = data_path.to_str().unwrap().to_string(); let mut offset = array::PrimitiveBuilder::::with_capacity(num_rows); let mut op = array::PrimitiveBuilder::::new(); diff --git a/src/infra/ingest-datafusion/benches/ledger.rs b/src/infra/ingest-datafusion/benches/ledger.rs index 68dbae8ff1..02e0c4c8e4 100644 --- a/src/infra/ingest-datafusion/benches/ledger.rs +++ b/src/infra/ingest-datafusion/benches/ledger.rs @@ -30,8 +30,13 @@ async fn setup( let ctx = SessionContext::new(); - let prev = tempdir.join("prev").to_str().unwrap().to_string(); - let new = tempdir.join("new").to_str().unwrap().to_string(); + let prev_path = tempdir.join("prev"); + let new_path = tempdir.join("new"); + std::fs::create_dir(&prev_path).unwrap(); + std::fs::create_dir(&new_path).unwrap(); + + let prev = prev_path.to_str().unwrap().to_string(); + let new = new_path.to_str().unwrap().to_string(); let mut pk1 = array::PrimitiveBuilder::::with_capacity(orig_rows); let mut pk2 = array::PrimitiveBuilder::::with_capacity(orig_rows); diff --git a/src/infra/ingest-datafusion/benches/snapshot.rs 
b/src/infra/ingest-datafusion/benches/snapshot.rs index 48146c4835..0dc4b4aa69 100644 --- a/src/infra/ingest-datafusion/benches/snapshot.rs +++ b/src/infra/ingest-datafusion/benches/snapshot.rs @@ -31,8 +31,13 @@ async fn setup( let ctx = SessionContext::new(); - let prev = tempdir.join("prev").to_str().unwrap().to_string(); - let new = tempdir.join("new").to_str().unwrap().to_string(); + let prev_path = tempdir.join("prev"); + let new_path = tempdir.join("new"); + std::fs::create_dir(&prev_path).unwrap(); + std::fs::create_dir(&new_path).unwrap(); + + let prev = prev_path.to_str().unwrap().to_string(); + let new = new_path.to_str().unwrap().to_string(); let mut offset = array::PrimitiveBuilder::::with_capacity(orig_rows); let mut op = array::PrimitiveBuilder::::new(); diff --git a/src/infra/task-system/repo-tests/src/task_system_repository_test_suite.rs b/src/infra/task-system/repo-tests/src/task_system_repository_test_suite.rs index 3975910acf..b8edcd87d8 100644 --- a/src/infra/task-system/repo-tests/src/task_system_repository_test_suite.rs +++ b/src/infra/task-system/repo-tests/src/task_system_repository_test_suite.rs @@ -59,9 +59,9 @@ pub async fn test_event_store_get_streams(catalog: &Catalog) { let event_1 = TaskEventCreated { event_time: Utc::now(), task_id: task_id_1, - logical_plan: Probe { + logical_plan: LogicalPlanProbe { dataset_id: Some(dataset_id.clone()), - ..Probe::default() + ..LogicalPlanProbe::default() } .into(), metadata: None, @@ -70,9 +70,9 @@ pub async fn test_event_store_get_streams(catalog: &Catalog) { let event_2 = TaskEventCreated { event_time: Utc::now(), task_id: task_id_2, - logical_plan: Probe { + logical_plan: LogicalPlanProbe { dataset_id: Some(dataset_id.clone()), - ..Probe::default() + ..LogicalPlanProbe::default() } .into(), metadata: None, @@ -137,9 +137,9 @@ pub async fn test_event_store_get_events_with_windowing(catalog: &Catalog) { let event_1 = TaskEventCreated { event_time: Utc::now(), task_id, - logical_plan: Probe { + logical_plan: LogicalPlanProbe { dataset_id: Some(dataset_id.clone()), - ..Probe::default() + ..LogicalPlanProbe::default() } .into(), metadata: None, @@ -235,9 +235,9 @@ pub async fn test_event_store_get_events_by_tasks(catalog: &Catalog) { let event_1_1 = TaskEventCreated { event_time: Utc::now(), task_id: task_id_1, - logical_plan: Probe { + logical_plan: LogicalPlanProbe { dataset_id: Some(dataset_id.clone()), - ..Probe::default() + ..LogicalPlanProbe::default() } .into(), metadata: None, @@ -246,9 +246,9 @@ pub async fn test_event_store_get_events_by_tasks(catalog: &Catalog) { let event_2_1 = TaskEventCreated { event_time: Utc::now(), task_id: task_id_2, - logical_plan: Probe { + logical_plan: LogicalPlanProbe { dataset_id: Some(dataset_id.clone()), - ..Probe::default() + ..LogicalPlanProbe::default() } .into(), metadata: None, @@ -349,9 +349,9 @@ pub async fn test_event_store_get_dataset_tasks(catalog: &Catalog) { let event_1_1 = TaskEventCreated { event_time: Utc::now(), task_id: task_id_1_1, - logical_plan: Probe { + logical_plan: LogicalPlanProbe { dataset_id: Some(dataset_id_foo.clone()), - ..Probe::default() + ..LogicalPlanProbe::default() } .into(), metadata: None, @@ -360,9 +360,9 @@ pub async fn test_event_store_get_dataset_tasks(catalog: &Catalog) { let event_1_2 = TaskEventCreated { event_time: Utc::now(), task_id: task_id_1_2, - logical_plan: Probe { + logical_plan: LogicalPlanProbe { dataset_id: Some(dataset_id_foo.clone()), - ..Probe::default() + ..LogicalPlanProbe::default() } .into(), metadata: None, @@ -371,9 
+371,9 @@ pub async fn test_event_store_get_dataset_tasks(catalog: &Catalog) { let event_2_1 = TaskEventCreated { event_time: Utc::now(), task_id: task_id_2_1, - logical_plan: Probe { + logical_plan: LogicalPlanProbe { dataset_id: Some(dataset_id_bar.clone()), - ..Probe::default() + ..LogicalPlanProbe::default() } .into(), metadata: None, @@ -382,9 +382,9 @@ pub async fn test_event_store_get_dataset_tasks(catalog: &Catalog) { let event_2_2 = TaskEventCreated { event_time: Utc::now(), task_id: task_id_2_2, - logical_plan: Probe { + logical_plan: LogicalPlanProbe { dataset_id: Some(dataset_id_bar.clone()), - ..Probe::default() + ..LogicalPlanProbe::default() } .into(), metadata: None, @@ -523,7 +523,7 @@ pub async fn test_event_store_try_get_queued_single_task(catalog: &Catalog) { vec![TaskEventCreated { event_time: Utc::now(), task_id: task_id_1, - logical_plan: Probe::default().into(), + logical_plan: LogicalPlanProbe::default().into(), metadata: None, } .into()], @@ -615,7 +615,7 @@ pub async fn test_event_store_try_get_queued_multiple_tasks(catalog: &Catalog) { vec![TaskEventCreated { event_time: Utc::now(), task_id, - logical_plan: Probe::default().into(), + logical_plan: LogicalPlanProbe::default().into(), metadata: None, } .into()], @@ -770,7 +770,7 @@ pub async fn test_event_store_get_running_tasks(catalog: &Catalog) { vec![TaskEventCreated { event_time: Utc::now(), task_id, - logical_plan: Probe::default().into(), + logical_plan: LogicalPlanProbe::default().into(), metadata: None, } .into()], @@ -945,7 +945,7 @@ pub async fn test_event_store_concurrent_modification(catalog: &Catalog) { vec![TaskEventCreated { event_time: Utc::now(), task_id, - logical_plan: Probe::default().into(), + logical_plan: LogicalPlanProbe::default().into(), metadata: None, } .into()], @@ -961,7 +961,7 @@ pub async fn test_event_store_concurrent_modification(catalog: &Catalog) { vec![TaskEventCreated { event_time: Utc::now(), task_id, - logical_plan: Probe::default().into(), + logical_plan: LogicalPlanProbe::default().into(), metadata: None, } .into()], diff --git a/src/utils/database-common/src/entities.rs b/src/utils/database-common/src/entities.rs index 07d9a0ff4e..1c4d25570e 100644 --- a/src/utils/database-common/src/entities.rs +++ b/src/utils/database-common/src/entities.rs @@ -9,7 +9,7 @@ //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub struct PaginationOpts { pub limit: usize, pub offset: usize, diff --git a/src/utils/kamu-cli-puppet/src/kamu_cli_puppet_ext.rs b/src/utils/kamu-cli-puppet/src/kamu_cli_puppet_ext.rs index 6b3b618de6..2f5aa4c6ff 100644 --- a/src/utils/kamu-cli-puppet/src/kamu_cli_puppet_ext.rs +++ b/src/utils/kamu-cli-puppet/src/kamu_cli_puppet_ext.rs @@ -470,7 +470,7 @@ fn assert_execute_command_result<'a>( for expected_stderr_item in expected_stderr_items { assert!( stderr.contains(expected_stderr_item), - "Unexpected output:\n{stderr}", + "Expected output:\n{expected_stderr_item}\nUnexpected output:\n{stderr}", ); } } diff --git a/src/utils/messaging-outbox/src/consumers/message_consumer.rs b/src/utils/messaging-outbox/src/consumers/message_consumer.rs index 85a1c298d5..8b40969d52 100644 --- a/src/utils/messaging-outbox/src/consumers/message_consumer.rs +++ b/src/utils/messaging-outbox/src/consumers/message_consumer.rs @@ -34,15 +34,15 @@ pub trait MessageConsumerT: MessageConsumer { pub struct MessageConsumerMeta { pub consumer_name: &'static str, pub 
feeding_producers: &'static [&'static str], - pub durability: MessageConsumptionDurability, + pub delivery: MessageDeliveryMechanism, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[derive(Debug, Eq, PartialEq, Copy, Clone)] -pub enum MessageConsumptionDurability { - Durable, - BestEffort, +pub enum MessageDeliveryMechanism { + Transactional, + Immediate, } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/utils/messaging-outbox/src/consumers/message_consumers_utils.rs b/src/utils/messaging-outbox/src/consumers/message_consumers_utils.rs index 14fc335c3d..f025d40d67 100644 --- a/src/utils/messaging-outbox/src/consumers/message_consumers_utils.rs +++ b/src/utils/messaging-outbox/src/consumers/message_consumers_utils.rs @@ -18,7 +18,7 @@ use super::{ ConsumerFilter, MessageConsumer, MessageConsumerMeta, - MessageConsumptionDurability, + MessageDeliveryMechanism, MessageDispatcher, }; use crate::{Message, MessageConsumerT, MessageSubscription}; @@ -46,7 +46,7 @@ pub async fn consume_deserialized_message<'a, TMessage: Message + 'static>( let consumers = match consumer_filter { ConsumerFilter::AllConsumers => all_consumers_for::(catalog), - ConsumerFilter::BestEffortConsumers => best_effort_consumers_for::(catalog), + ConsumerFilter::ImmediateConsumers => immediate_consumers_for::(catalog), ConsumerFilter::SelectedConsumer(consumer_name) => { particular_consumers_for::(catalog, consumer_name) } @@ -77,15 +77,13 @@ fn all_consumers_for( //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -fn best_effort_consumers_for( +fn immediate_consumers_for( catalog: &Catalog, ) -> Vec>> { consumers_from_builders( catalog, catalog.builders_for_with_meta::, _>( - |meta: &MessageConsumerMeta| { - meta.durability == MessageConsumptionDurability::BestEffort - }, + |meta: &MessageConsumerMeta| meta.delivery == MessageDeliveryMechanism::Immediate, ), ) } @@ -141,7 +139,7 @@ pub(crate) fn group_message_dispatchers_by_producer( pub(crate) fn enumerate_messaging_routes( catalog: &Catalog, - durability: MessageConsumptionDurability, + durability: MessageDeliveryMechanism, ) -> Vec { let mut res = Vec::new(); @@ -154,7 +152,7 @@ pub(crate) fn enumerate_messaging_routes( consumer_builder.instance_type_name() ); for metadata in all_metadata { - if metadata.durability == durability { + if metadata.delivery == durability { for producer_name in metadata.feeding_producers { res.push(MessageSubscription::new( producer_name, diff --git a/src/utils/messaging-outbox/src/consumers/message_dispatcher.rs b/src/utils/messaging-outbox/src/consumers/message_dispatcher.rs index 8ffc1df3b8..d37c108d1a 100644 --- a/src/utils/messaging-outbox/src/consumers/message_dispatcher.rs +++ b/src/utils/messaging-outbox/src/consumers/message_dispatcher.rs @@ -20,7 +20,7 @@ use crate::Message; #[derive(Debug, Copy, Clone)] pub enum ConsumerFilter<'a> { AllConsumers, - BestEffortConsumers, + ImmediateConsumers, SelectedConsumer(&'a str), } diff --git a/src/utils/messaging-outbox/src/executors/outbox_executor.rs b/src/utils/messaging-outbox/src/executors/outbox_executor.rs index ff50aa95fe..be86f72552 100644 --- a/src/utils/messaging-outbox/src/executors/outbox_executor.rs +++ b/src/utils/messaging-outbox/src/executors/outbox_executor.rs @@ -39,6 +39,7 @@ pub struct OutboxExecutor { routes_static_info: Arc, 
producer_consumption_jobs: Vec, metrics: Arc, + run_lock: tokio::sync::Mutex<()>, } #[component(pub)] @@ -80,6 +81,7 @@ impl OutboxExecutor { routes_static_info, producer_consumption_jobs, metrics, + run_lock: tokio::sync::Mutex::new(()), } } @@ -89,7 +91,7 @@ impl OutboxExecutor { message_dispatchers: Vec>, ) -> OutboxRoutesStaticInfo { let all_durable_messaging_routes = - enumerate_messaging_routes(catalog, MessageConsumptionDurability::Durable); + enumerate_messaging_routes(catalog, MessageDeliveryMechanism::Transactional); let consumers_by_producers = group_consumers_by_producers(&all_durable_messaging_routes); let message_dispatchers_by_producers = group_message_dispatchers_by_producer(&message_dispatchers); @@ -193,6 +195,12 @@ impl OutboxExecutor { async fn run_consumption_iteration( &self, ) -> Result { + // We should not allow multiple concurrent entrances into consumption flow. + // I.e., there could be concurrently: + // - main scheduled loop iteration + // - flushed iteration (i.e. via e2e middleware) + let _guard = self.run_lock.lock().await; + // Read current state of producers and consumptions // Prepare consumption tasks for each progressed producer let mut consumption_tasks_by_producer = self.prepare_consumption_iteration().await?; diff --git a/src/utils/messaging-outbox/src/services/implementation/outbox_dispatching_impl.rs b/src/utils/messaging-outbox/src/services/implementation/outbox_dispatching_impl.rs index cc7481c194..c93bc8d28b 100644 --- a/src/utils/messaging-outbox/src/services/implementation/outbox_dispatching_impl.rs +++ b/src/utils/messaging-outbox/src/services/implementation/outbox_dispatching_impl.rs @@ -14,15 +14,15 @@ use dill::*; use internal_error::InternalError; use super::{OutboxImmediateImpl, OutboxTransactionalImpl}; -use crate::{MessageConsumer, MessageConsumerMeta, MessageConsumptionDurability, Outbox}; +use crate::{MessageConsumer, MessageConsumerMeta, MessageDeliveryMechanism, Outbox}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// pub struct OutboxDispatchingImpl { immediate_outbox: Arc, transactional_outbox: Arc, - durable_producers: HashSet, - best_effort_producers: HashSet, + transactional_producers: HashSet, + immediate_producers: HashSet, } #[component(pub)] @@ -33,19 +33,20 @@ impl OutboxDispatchingImpl { immediate_outbox: Arc, transactional_outbox: Arc, ) -> Self { - let (durable_producers, best_effort_producers) = Self::classify_message_routes(&catalog); + let (transactional_producers, immediate_producers) = + Self::classify_message_routes(&catalog); Self { immediate_outbox, transactional_outbox, - durable_producers, - best_effort_producers, + transactional_producers, + immediate_producers, } } fn classify_message_routes(catalog: &Catalog) -> (HashSet, HashSet) { - let mut durable_producers = HashSet::new(); - let mut best_effort_producers = HashSet::new(); + let mut transactional_producers = HashSet::new(); + let mut immediate_producers = HashSet::new(); let all_consumer_builders = catalog.builders_for::(); for consumer_builder in all_consumer_builders { @@ -57,19 +58,19 @@ impl OutboxDispatchingImpl { ); for metadata in all_metadata { for producer_name in metadata.feeding_producers { - match metadata.durability { - MessageConsumptionDurability::Durable => { - durable_producers.insert((*producer_name).to_string()); + match metadata.delivery { + MessageDeliveryMechanism::Transactional => { + transactional_producers.insert((*producer_name).to_string()); } - 
MessageConsumptionDurability::BestEffort => { - best_effort_producers.insert((*producer_name).to_string()); + MessageDeliveryMechanism::Immediate => { + immediate_producers.insert((*producer_name).to_string()); } } } } } - (durable_producers, best_effort_producers) + (transactional_producers, immediate_producers) } } @@ -86,13 +87,13 @@ impl Outbox for OutboxDispatchingImpl { ) -> Result<(), InternalError> { tracing::debug!(content_json = %content_json, "Dispatching outbox message"); - if self.durable_producers.contains(producer_name) { + if self.transactional_producers.contains(producer_name) { self.transactional_outbox .post_message_as_json(producer_name, content_json, version) .await?; } - if self.best_effort_producers.contains(producer_name) { + if self.immediate_producers.contains(producer_name) { self.immediate_outbox .post_message_as_json(producer_name, content_json, version) .await?; diff --git a/src/utils/messaging-outbox/tests/mod.rs b/src/utils/messaging-outbox/tests/mod.rs index eb58539eeb..85c0a4ae3e 100644 --- a/src/utils/messaging-outbox/tests/mod.rs +++ b/src/utils/messaging-outbox/tests/mod.rs @@ -31,7 +31,7 @@ macro_rules! test_message_type { } macro_rules! test_message_consumer { - ($message_type_suffix: ident, $message_consumer_suffix: ident, $producer_name: ident, $durability: ident) => { + ($message_type_suffix: ident, $message_consumer_suffix: ident, $producer_name: ident, $delivery: ident) => { paste::paste! { struct [<"TestMessageConsumer" $message_consumer_suffix>] { state: Arc]>>, @@ -56,7 +56,7 @@ macro_rules! test_message_consumer { #[meta(MessageConsumerMeta { consumer_name: concat!("TestMessageConsumer", stringify!($message_consumer_suffix)), feeding_producers: &[$producer_name], - durability: MessageConsumptionDurability::$durability, + delivery: MessageDeliveryMechanism::$delivery, })] impl [<"TestMessageConsumer" $message_consumer_suffix>] { fn new() -> Self { diff --git a/src/utils/messaging-outbox/tests/tests/test_dispatching_outbox_impl.rs b/src/utils/messaging-outbox/tests/tests/test_dispatching_outbox_impl.rs index b0c3f310d0..cbbb72d3a0 100644 --- a/src/utils/messaging-outbox/tests/tests/test_dispatching_outbox_impl.rs +++ b/src/utils/messaging-outbox/tests/tests/test_dispatching_outbox_impl.rs @@ -29,15 +29,15 @@ test_message_type!(A); test_message_type!(B); test_message_type!(C); -test_message_consumer!(A, A, TEST_PRODUCER_A, BestEffort); -test_message_consumer!(B, B, TEST_PRODUCER_B, Durable); -test_message_consumer!(C, CB, TEST_PRODUCER_C, BestEffort); -test_message_consumer!(C, CD, TEST_PRODUCER_C, Durable); +test_message_consumer!(A, A, TEST_PRODUCER_A, Immediate); +test_message_consumer!(B, B, TEST_PRODUCER_B, Transactional); +test_message_consumer!(C, CB, TEST_PRODUCER_C, Immediate); +test_message_consumer!(C, CD, TEST_PRODUCER_C, Transactional); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] -async fn test_best_effort_only_messages() { +async fn test_immediate_only_messages() { let message_1 = TestMessageA { body: "foo".to_string(), }; @@ -93,8 +93,8 @@ async fn test_best_effort_only_messages() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] -async fn test_durable_only_messages() { - let message_1 = TestMessageB { +async fn test_transactional_only_messages() { + let message_1: TestMessageB = TestMessageB { body: "foo".to_string(), }; 
let message_2 = TestMessageB { @@ -145,7 +145,7 @@ async fn test_durable_only_messages() { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #[test_log::test(tokio::test)] -async fn test_messages_mixed_durability() { +async fn test_messages_mixed_delivery() { let message_1 = TestMessageC { body: "foo".to_string(), }; @@ -214,7 +214,7 @@ impl DispatchingOutboxHarness { let mut b = CatalogBuilder::new(); b.add_builder( messaging_outbox::OutboxImmediateImpl::builder() - .with_consumer_filter(messaging_outbox::ConsumerFilter::BestEffortConsumers), + .with_consumer_filter(messaging_outbox::ConsumerFilter::ImmediateConsumers), ); b.add::(); b.add::(); diff --git a/src/utils/messaging-outbox/tests/tests/test_immediate_outbox_impl.rs b/src/utils/messaging-outbox/tests/tests/test_immediate_outbox_impl.rs index cfa61d0dd9..1b1167a895 100644 --- a/src/utils/messaging-outbox/tests/tests/test_immediate_outbox_impl.rs +++ b/src/utils/messaging-outbox/tests/tests/test_immediate_outbox_impl.rs @@ -29,10 +29,10 @@ test_message_type!(B); test_message_type!(C); // No consumers test_message_type!(D); -test_message_consumer!(A, A, TEST_PRODUCER_A, BestEffort); -test_message_consumer!(B, B, TEST_PRODUCER_B, BestEffort); -test_message_consumer!(D, D1, TEST_PRODUCER_D, BestEffort); -test_message_consumer!(D, D2, TEST_PRODUCER_D, BestEffort); +test_message_consumer!(A, A, TEST_PRODUCER_A, Immediate); +test_message_consumer!(B, B, TEST_PRODUCER_B, Immediate); +test_message_consumer!(D, D1, TEST_PRODUCER_D, Immediate); +test_message_consumer!(D, D2, TEST_PRODUCER_D, Immediate); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/utils/messaging-outbox/tests/tests/test_outbox_executor.rs b/src/utils/messaging-outbox/tests/tests/test_outbox_executor.rs index 5f0d5b2c27..6814c0e7fc 100644 --- a/src/utils/messaging-outbox/tests/tests/test_outbox_executor.rs +++ b/src/utils/messaging-outbox/tests/tests/test_outbox_executor.rs @@ -33,10 +33,10 @@ test_message_type!(A); test_message_type!(B); test_message_type!(C); -test_message_consumer!(A, A, TEST_PRODUCER_A, Durable); -test_message_consumer!(B, B, TEST_PRODUCER_B, Durable); -test_message_consumer!(C, C1, TEST_PRODUCER_C, Durable); -test_message_consumer!(C, C2, TEST_PRODUCER_C, Durable); +test_message_consumer!(A, A, TEST_PRODUCER_A, Transactional); +test_message_consumer!(B, B, TEST_PRODUCER_B, Transactional); +test_message_consumer!(C, C1, TEST_PRODUCER_C, Transactional); +test_message_consumer!(C, C2, TEST_PRODUCER_C, Transactional); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
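
Note on the in-memory repository pattern used earlier in this patch: `InMemoryDatasetEntryRepository` now keeps two secondary indexes (`rows_by_name`, `rows_by_owner`) in sync with the primary id map and serves paginated reads with `skip`/`take` under a plain `Mutex`. The following is a minimal standalone sketch of that pattern using only the standard library; the `Entry`/`InMemoryRepo` names and the `u64`/`String` key types are illustrative stand-ins, not the kamu types.

```rust
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::sync::Mutex;

// Illustrative stand-ins for DatasetEntry / DatasetID / AccountID.
#[derive(Clone, Debug)]
struct Entry {
    id: u64,
    owner: String,
    name: String,
}

#[derive(Default)]
struct State {
    rows: HashMap<u64, Entry>,                     // primary index: by id
    rows_by_name: BTreeMap<String, Entry>,         // secondary index: ordered by name
    rows_by_owner: HashMap<String, BTreeSet<u64>>, // secondary index: ids per owner
}

struct InMemoryRepo {
    state: Mutex<State>,
}

impl InMemoryRepo {
    fn new() -> Self {
        Self { state: Mutex::new(State::default()) }
    }

    // Insert keeps all three indexes in sync (duplicate handling omitted).
    fn save(&self, entry: Entry) {
        let mut s = self.state.lock().unwrap();
        s.rows.insert(entry.id, entry.clone());
        s.rows_by_name.insert(entry.name.clone(), entry.clone());
        s.rows_by_owner.entry(entry.owner.clone()).or_default().insert(entry.id);
    }

    // Rename in two steps to satisfy the borrow checker, then mirror the
    // change in the name index.
    fn rename(&self, id: u64, new_name: &str) {
        let mut s = self.state.lock().unwrap();
        let old_name = match s.rows.get_mut(&id) {
            Some(e) => std::mem::replace(&mut e.name, new_name.to_string()),
            None => return,
        };
        if let Some(mut named) = s.rows_by_name.remove(&old_name) {
            named.name = new_name.to_string();
            s.rows_by_name.insert(new_name.to_string(), named);
        }
    }

    // Removal must clean up the secondary indexes as well.
    fn delete(&self, id: u64) {
        let mut s = self.state.lock().unwrap();
        if let Some(removed) = s.rows.remove(&id) {
            s.rows_by_name.remove(&removed.name);
            if let Some(ids) = s.rows_by_owner.get_mut(&removed.owner) {
                ids.remove(&id);
            }
        }
    }

    // Paginated listing in name order (the BTreeMap gives stable ordering).
    fn list_page(&self, offset: usize, limit: usize) -> Vec<Entry> {
        let s = self.state.lock().unwrap();
        s.rows_by_name.values().skip(offset).take(limit).cloned().collect()
    }
}

fn main() {
    let repo = InMemoryRepo::new();
    repo.save(Entry { id: 1, owner: "alice".into(), name: "b-dataset".into() });
    repo.save(Entry { id: 2, owner: "alice".into(), name: "a-dataset".into() });
    repo.rename(1, "c-dataset");
    assert_eq!(repo.list_page(0, 10).len(), 2);
    repo.delete(2);
    assert_eq!(repo.list_page(0, 10).len(), 1);
}
```

The key design point is that every mutation (save, rename, delete) updates all indexes inside the same lock scope, so readers never observe a name or owner index that disagrees with the primary map.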
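Note on the streaming signatures: both SQL-backed repositories in this patch return a boxed stream (`DatasetEntryStream`) built with `async_stream` and consumed with `futures::TryStreamExt`, instead of collecting a `Vec` up front. A minimal sketch of that shape follows, assuming the `async-stream`, `futures`, and `tokio` crates; the `u32` rows, `String` errors, and `fetch_page` helper are illustrative placeholders for the sqlx `LIMIT/OFFSET` query used in the real code.

```rust
use std::pin::Pin;

use futures::Stream;

// Illustrative stand-ins: u32 rows and String errors instead of
// DatasetEntry / InternalError.
type EntryStream = Pin<Box<dyn Stream<Item = Result<u32, String>>>>;

// Pretend page fetch; in the repositories this is an sqlx query with
// `LIMIT $1 OFFSET $2`.
async fn fetch_page(offset: usize, limit: usize) -> Result<Vec<u32>, String> {
    let all: Vec<u32> = (0..10).collect();
    Ok(all.into_iter().skip(offset).take(limit).collect())
}

// Wrap the async fetch into a boxed stream, yielding one Result per row.
fn stream_entries(offset: usize, limit: usize) -> EntryStream {
    Box::pin(async_stream::stream! {
        match fetch_page(offset, limit).await {
            Ok(rows) => {
                for row in rows {
                    yield Ok(row);
                }
            }
            Err(e) => yield Err(e),
        }
    })
}

#[tokio::main]
async fn main() -> Result<(), String> {
    use futures::TryStreamExt;

    let page: Vec<u32> = stream_entries(2, 3).try_collect().await?;
    assert_eq!(page, vec![2, 3, 4]);
    Ok(())
}
```

Returning the boxed stream from a non-async method, as the repositories do, lets callers decide whether to collect, paginate further, or abort early without the repository buffering the whole result set.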
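Note on the SQLite `IN (...)` workaround: the ToDo in `get_multiple_dataset_entries` points at the sqlx FAQ because the `query_as!` macro cannot yet bind a slice on SQLite, so the patch builds the placeholder list by hand and binds each id in a loop. A generic sketch of that workaround is below, assuming sqlx with the `sqlite` and `runtime-tokio-rustls` features plus tokio; the `items(id, name)` table and `fetch_by_ids` function are hypothetical, not part of the kamu schema.

```rust
use sqlx::{Connection, Row, SqliteConnection};

// Fetch rows whose id is in `ids`, binding each value separately.
async fn fetch_by_ids(
    conn: &mut SqliteConnection,
    ids: &[String],
) -> Result<Vec<(String, String)>, sqlx::Error> {
    if ids.is_empty() {
        return Ok(vec![]);
    }

    // One "?" placeholder per requested id, e.g. "?, ?, ?".
    let placeholders = ids.iter().map(|_| "?").collect::<Vec<_>>().join(", ");
    let query_str = format!("SELECT id, name FROM items WHERE id IN ({placeholders})");

    let mut query = sqlx::query(&query_str);
    for id in ids {
        query = query.bind(id.as_str());
    }

    let rows = query.fetch_all(conn).await?;
    Ok(rows
        .into_iter()
        .map(|row| (row.get::<String, _>("id"), row.get::<String, _>("name")))
        .collect())
}

#[tokio::main]
async fn main() -> Result<(), sqlx::Error> {
    let mut conn = SqliteConnection::connect("sqlite::memory:").await?;
    sqlx::query("CREATE TABLE items (id TEXT PRIMARY KEY, name TEXT NOT NULL)")
        .execute(&mut conn)
        .await?;
    sqlx::query("INSERT INTO items (id, name) VALUES ('a', 'first'), ('b', 'second')")
        .execute(&mut conn)
        .await?;

    let found = fetch_by_ids(&mut conn, &["a".into(), "missing".into()]).await?;
    assert_eq!(found, vec![("a".into(), "first".into())]);
    Ok(())
}
```

Ids that match no row simply do not come back, which is why the repository then diffs the requested ids against the returned ones to fill `unresolved_entries`.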
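Note on the outbox rename: "durability" becomes a delivery mechanism, where `Transactional` consumers receive messages later through the durable outbox executor and `Immediate` consumers are invoked in-process at post time; `OutboxDispatchingImpl` classifies producers by scanning consumer metadata. A dependency-free sketch of that classification follows; the types and the `ConsumerA`/`ProducerX` names are illustrative, not the kamu API.

```rust
use std::collections::HashSet;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MessageDeliveryMechanism {
    Transactional, // delivered later via the durable outbox loop, inside a transaction
    Immediate,     // dispatched in-process at the moment the message is posted
}

#[allow(dead_code)]
struct ConsumerMeta {
    consumer_name: &'static str,
    feeding_producers: &'static [&'static str],
    delivery: MessageDeliveryMechanism,
}

// Split producers by the delivery mechanism their consumers declared.
fn classify(metas: &[ConsumerMeta]) -> (HashSet<&'static str>, HashSet<&'static str>) {
    let mut transactional = HashSet::new();
    let mut immediate = HashSet::new();
    for meta in metas {
        for producer in meta.feeding_producers {
            match meta.delivery {
                MessageDeliveryMechanism::Transactional => transactional.insert(*producer),
                MessageDeliveryMechanism::Immediate => immediate.insert(*producer),
            };
        }
    }
    (transactional, immediate)
}

fn main() {
    let metas = [
        ConsumerMeta {
            consumer_name: "ConsumerA",
            feeding_producers: &["ProducerX"],
            delivery: MessageDeliveryMechanism::Transactional,
        },
        ConsumerMeta {
            consumer_name: "ConsumerB",
            feeding_producers: &["ProducerX", "ProducerY"],
            delivery: MessageDeliveryMechanism::Immediate,
        },
    ];

    let (transactional, immediate) = classify(&metas);

    // A producer can land on both routes; the dispatcher then posts the
    // message transactionally and also invokes immediate consumers.
    assert!(transactional.contains("ProducerX"));
    assert!(immediate.contains("ProducerX"));
    assert!(immediate.contains("ProducerY"));
}
```

This mirrors the intent of the rename: the enum now names when and how a message reaches the consumer, rather than a vague durability guarantee.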