From 1200dcb265fda64b0a6df9f6fb27c10b8334b023 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 15:16:14 -0800 Subject: [PATCH 1/8] [nexus] Support Bundle API Skeleton (#7008) PR 1 / ??? Adds support bundles to the experimental Nexus API. All methods exist, but return "not implemented" errors. These methods will be implemented in a later PR: https://github.com/oxidecomputer/omicron/pull/7187 --- clients/sled-agent-client/src/lib.rs | 1 + nexus/external-api/output/nexus_tags.txt | 9 + nexus/external-api/src/lib.rs | 103 +++++ nexus/src/external_api/http_entrypoints.rs | 207 +++++++++ .../output/uncovered-authz-endpoints.txt | 9 + nexus/types/src/external_api/params.rs | 10 + nexus/types/src/external_api/shared.rs | 40 ++ openapi/nexus.json | 432 +++++++++++++++++- 8 files changed, 799 insertions(+), 12 deletions(-) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 6a9c721587..f0dfbe289e 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -83,6 +83,7 @@ progenitor::generate_api!( TypedUuidForOmicronZoneKind = omicron_uuid_kinds::OmicronZoneUuid, TypedUuidForPropolisKind = omicron_uuid_kinds::PropolisUuid, TypedUuidForSledKind = omicron_uuid_kinds::SledUuid, + TypedUuidForSupportBundleKind = omicron_uuid_kinds::SupportBundleUuid, TypedUuidForZpoolKind = omicron_uuid_kinds::ZpoolUuid, Vni = omicron_common::api::external::Vni, ZpoolKind = omicron_common::zpool_name::ZpoolKind, diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt index a979a9804b..4fc92b18d8 100644 --- a/nexus/external-api/output/nexus_tags.txt +++ b/nexus/external-api/output/nexus_tags.txt @@ -30,6 +30,15 @@ probe_create POST /experimental/v1/probes probe_delete DELETE /experimental/v1/probes/{probe} probe_list GET /experimental/v1/probes probe_view GET /experimental/v1/probes/{probe} +support_bundle_create POST /experimental/v1/system/support-bundles +support_bundle_delete DELETE /experimental/v1/system/support-bundles/{support_bundle} +support_bundle_download GET /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_download_file GET /experimental/v1/system/support-bundles/{support_bundle}/download/{file} +support_bundle_head HEAD /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_head_file HEAD /experimental/v1/system/support-bundles/{support_bundle}/download/{file} +support_bundle_index GET /experimental/v1/system/support-bundles/{support_bundle}/index +support_bundle_list GET /experimental/v1/system/support-bundles +support_bundle_view GET /experimental/v1/system/support-bundles/{support_bundle} timeseries_query POST /v1/timeseries/query API operations found with tag "images" diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index e2b53a7e6f..54ba3ab34b 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -2780,6 +2780,109 @@ pub trait NexusExternalApi { path_params: Path, ) -> Result; + // Support bundles (experimental) + + /// List all support bundles + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_list( + rqctx: RequestContext, + query_params: Query, + ) -> Result>, HttpError>; + + /// View a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}", + tags = ["hidden"], // 
system/support-bundles: only one tag is allowed + }] + async fn support_bundle_view( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the index of a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}/index", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_index( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the contents of a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_download( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download a file within a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_download_file( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the metadata of a support bundle + #[endpoint { + method = HEAD, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_head( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the metadata of a file within the support bundle + #[endpoint { + method = HEAD, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_head_file( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Create a new support bundle + #[endpoint { + method = POST, + path = "/experimental/v1/system/support-bundles", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_create( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Delete an existing support bundle + /// + /// May also be used to cancel a support bundle which is currently being + /// collected, or to remove metadata for a support bundle that has failed. 
+ #[endpoint { + method = DELETE, + path = "/experimental/v1/system/support-bundles/{support_bundle}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_delete( + rqctx: RequestContext, + path_params: Path, + ) -> Result; + // Probes (experimental) /// List instrumentation probes diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 740895b7e4..cfc9f99851 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6026,6 +6026,213 @@ impl NexusExternalApi for NexusExternalApiImpl { .await } + async fn support_bundle_list( + rqctx: RequestContext, + _query_params: Query, + ) -> Result>, HttpError> + { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_view( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_index( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_download( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_download_file( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_head( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + 
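+    // NOTE: Like the handlers above, `support_bundle_head_file` below is a
+    // stub: it instruments request latency and then returns an
+    // "unimplemented" error until the real implementation lands in a
+    // follow-up PR.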
async fn support_bundle_head_file( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_create( + rqctx: RequestContext, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_delete( + rqctx: RequestContext, + _path_params: Path, + ) -> Result { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + async fn probe_list( rqctx: RequestContext, query_params: Query>, diff --git a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index c5091c5a3b..0a9e62707f 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -1,13 +1,22 @@ API endpoints with no coverage in authz tests: probe_delete (delete "/experimental/v1/probes/{probe}") +support_bundle_delete (delete "/experimental/v1/system/support-bundles/{support_bundle}") probe_list (get "/experimental/v1/probes") probe_view (get "/experimental/v1/probes/{probe}") +support_bundle_list (get "/experimental/v1/system/support-bundles") +support_bundle_view (get "/experimental/v1/system/support-bundles/{support_bundle}") +support_bundle_download (get "/experimental/v1/system/support-bundles/{support_bundle}/download") +support_bundle_download_file (get "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") +support_bundle_index (get "/experimental/v1/system/support-bundles/{support_bundle}/index") ping (get "/v1/ping") networking_switch_port_status (get "/v1/system/hardware/switch-port/{port}/status") +support_bundle_head (head "/experimental/v1/system/support-bundles/{support_bundle}/download") +support_bundle_head_file (head "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") probe_create (post "/experimental/v1/probes") +support_bundle_create (post "/experimental/v1/system/support-bundles") login_saml (post "/login/{silo_name}/saml/{provider_name}") login_local (post "/v1/login/{silo_name}/local") logout (post "/v1/logout") diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 9b4c2474ad..4e616e698f 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -90,6 +90,7 @@ path_param!(AddressLotPath, address_lot, "address lot"); path_param!(ProbePath, probe, "probe"); 
path_param!(CertificatePath, certificate, "certificate"); +id_path_param!(SupportBundlePath, support_bundle, "support bundle"); id_path_param!(GroupPath, group_id, "group"); // TODO: The hardware resources should be represented by its UUID or a hardware @@ -142,6 +143,15 @@ impl From for SiloSelector { } } +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct SupportBundleFilePath { + #[serde(flatten)] + pub bundle: SupportBundlePath, + + /// The file within the bundle to download + pub file: String, +} + #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq)] pub struct OptionalSiloSelector { /// Name or ID of the silo diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 9bfa9c8358..9e9bc26ffe 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -6,8 +6,11 @@ use std::net::IpAddr; +use chrono::DateTime; +use chrono::Utc; use omicron_common::api::external::Name; use omicron_common::api::internal::shared::NetworkInterface; +use omicron_uuid_kinds::SupportBundleUuid; use parse_display::FromStr; use schemars::JsonSchema; use serde::de::Error as _; @@ -414,6 +417,43 @@ mod test { } } +#[derive(Debug, Clone, Copy, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SupportBundleState { + /// Support Bundle still actively being collected. + /// + /// This is the initial state for a Support Bundle, and it will + /// automatically transition to either "Failing" or "Active". + /// + /// If a user no longer wants to access a Support Bundle, they can + /// request cancellation, which will transition to the "Destroying" state. + Collecting, + + /// Support Bundle is being destroyed. + /// + /// Once backing storage has been freed, this bundle is destroyed. + Destroying, + + /// Support Bundle was not created successfully, or was created and has lost + /// backing storage. + /// + /// The record of the bundle still exists for readability, but the only + /// valid operation on these bundles is to destroy them. + Failed, + + /// Support Bundle has been processed, and is ready for usage. 
+ Active, +} + +#[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] +pub struct SupportBundleInfo { + pub id: SupportBundleUuid, + pub time_created: DateTime, + pub reason_for_creation: String, + pub reason_for_failure: Option, + pub state: SupportBundleState, +} + #[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] pub struct ProbeInfo { pub id: Uuid, diff --git a/openapi/nexus.json b/openapi/nexus.json index c0b6a96fcf..bc043059dd 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -301,6 +301,329 @@ } } }, + "/experimental/v1/system/support-bundles": { + "get": { + "tags": [ + "hidden" + ], + "summary": "List all support bundles", + "operationId": "support_bundle_list", + "parameters": [ + { + "in": "query", + "name": "limit", + "description": "Maximum number of items returned by a single call", + "schema": { + "nullable": true, + "type": "integer", + "format": "uint32", + "minimum": 1 + } + }, + { + "in": "query", + "name": "page_token", + "description": "Token returned by previous call to retrieve the subsequent page", + "schema": { + "nullable": true, + "type": "string" + } + }, + { + "in": "query", + "name": "sort_by", + "schema": { + "$ref": "#/components/schemas/IdSortMode" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfoResultsPage" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + }, + "x-dropshot-pagination": { + "required": [] + } + }, + "post": { + "tags": [ + "hidden" + ], + "summary": "Create a new support bundle", + "operationId": "support_bundle_create", + "responses": { + "201": { + "description": "successful creation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}": { + "get": { + "tags": [ + "hidden" + ], + "summary": "View a support bundle", + "operationId": "support_bundle_view", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "tags": [ + "hidden" + ], + "summary": "Delete an existing support bundle", + "description": "May also be used to cancel a support bundle which is currently being collected, or to remove metadata for a support bundle that has failed.", + "operationId": "support_bundle_delete", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "204": { + "description": "successful deletion" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/download": { + "get": { + "tags": [ + "hidden" + ], + 
"summary": "Download the contents of a support bundle", + "operationId": "support_bundle_download", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + }, + "head": { + "tags": [ + "hidden" + ], + "summary": "Download the metadata of a support bundle", + "operationId": "support_bundle_head", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download a file within a support bundle", + "operationId": "support_bundle_download_file", + "parameters": [ + { + "in": "path", + "name": "file", + "description": "The file within the bundle to download", + "required": true, + "schema": { + "type": "string" + } + }, + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + }, + "head": { + "tags": [ + "hidden" + ], + "summary": "Download the metadata of a file within the support bundle", + "operationId": "support_bundle_head_file", + "parameters": [ + { + "in": "path", + "name": "file", + "description": "The file within the bundle to download", + "required": true, + "schema": { + "type": "string" + } + }, + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/index": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download the index of a support bundle", + "operationId": "support_bundle_index", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + } + }, "/login/{silo_name}/saml/{provider_name}": { "post": { "tags": [ @@ -20059,6 +20382,87 @@ "items" ] }, + "SupportBundleInfo": { + "type": "object", + "properties": { + "id": { + "$ref": "#/components/schemas/TypedUuidForSupportBundleKind" + }, + "reason_for_creation": { + "type": "string" + }, + "reason_for_failure": { + "nullable": true, + "type": "string" + }, + "state": { + "$ref": "#/components/schemas/SupportBundleState" + }, + "time_created": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "id", + "reason_for_creation", + "state", + "time_created" + ] + }, + "SupportBundleInfoResultsPage": { + "description": "A single page of results", + "type": "object", + "properties": { + "items": { + "description": "list of items on this page of results", + "type": "array", + "items": { + "$ref": 
"#/components/schemas/SupportBundleInfo" + } + }, + "next_page": { + "nullable": true, + "description": "token used to fetch the next page of results (if any)", + "type": "string" + } + }, + "required": [ + "items" + ] + }, + "SupportBundleState": { + "oneOf": [ + { + "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failing\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Destroying\" state.", + "type": "string", + "enum": [ + "collecting" + ] + }, + { + "description": "Support Bundle is being destroyed.\n\nOnce backing storage has been freed, this bundle is destroyed.", + "type": "string", + "enum": [ + "destroying" + ] + }, + { + "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to destroy them.", + "type": "string", + "enum": [ + "failed" + ] + }, + { + "description": "Support Bundle has been processed, and is ready for usage.", + "type": "string", + "enum": [ + "active" + ] + } + ] + }, "Switch": { "description": "An operator's view of a Switch.", "type": "object", @@ -21073,6 +21477,10 @@ } } }, + "TypedUuidForSupportBundleKind": { + "type": "string", + "format": "uuid" + }, "UninitializedSled": { "description": "A sled that has not been added to an initialized rack yet", "type": "object", @@ -22509,6 +22917,18 @@ } ] }, + "IdSortMode": { + "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.", + "oneOf": [ + { + "description": "sort in increasing order of \"id\"", + "type": "string", + "enum": [ + "id_ascending" + ] + } + ] + }, "DiskMetricName": { "type": "string", "enum": [ @@ -22528,18 +22948,6 @@ "descending" ] }, - "IdSortMode": { - "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.", - "oneOf": [ - { - "description": "sort in increasing order of \"id\"", - "type": "string", - "enum": [ - "id_ascending" - ] - } - ] - }, "SystemMetricName": { "type": "string", "enum": [ From 94d76b894f3dced18e71465bc6e5f4b15a9cb0e7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 15:16:29 -0800 Subject: [PATCH 2/8] [nexus] Support Bundles Datastore methods (#7021) PR 2 / ??? Adds Support Bundles to the database, along with datastore-specific tests. 
These implementations are necessary to implement both the Support Bundle background task, in https://github.com/oxidecomputer/omicron/pull/7063 , as well as the HTTP interface for support bundles, in https://github.com/oxidecomputer/omicron/pull/7187 --- nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/schema.rs | 15 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/support_bundle.rs | 133 ++ nexus/db-queries/src/db/datastore/dataset.rs | 25 +- nexus/db-queries/src/db/datastore/mod.rs | 30 + .../src/db/datastore/support_bundle.rs | 1412 +++++++++++++++++ schema/crdb/dbinit.sql | 62 +- schema/crdb/support-bundles/up01.sql | 28 + schema/crdb/support-bundles/up02.sql | 14 + schema/crdb/support-bundles/up03.sql | 4 + schema/crdb/support-bundles/up04.sql | 4 + 12 files changed, 1706 insertions(+), 26 deletions(-) create mode 100644 nexus/db-model/src/support_bundle.rs create mode 100644 nexus/db-queries/src/db/datastore/support_bundle.rs create mode 100644 schema/crdb/support-bundles/up01.sql create mode 100644 schema/crdb/support-bundles/up02.sql create mode 100644 schema/crdb/support-bundles/up03.sql create mode 100644 schema/crdb/support-bundles/up04.sql diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 66723e0b18..b6aea15c89 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -96,6 +96,7 @@ mod sled_state; mod sled_underlay_subnet_allocation; mod snapshot; mod ssh_key; +mod support_bundle; mod switch; mod tuf_repo; mod typed_uuid; @@ -204,6 +205,7 @@ pub use sled_state::*; pub use sled_underlay_subnet_allocation::*; pub use snapshot::*; pub use ssh_key::*; +pub use support_bundle::*; pub use switch::*; pub use switch_interface::*; pub use switch_port::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index a54c2e3029..a8e1141db6 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1364,6 +1364,20 @@ joinable!(tuf_repo_artifact -> tuf_repo (tuf_repo_id)); // Can't specify joinable for a composite primary key (tuf_repo_artifact -> // tuf_artifact). +table! { + support_bundle { + id -> Uuid, + time_created -> Timestamptz, + reason_for_creation -> Text, + reason_for_failure -> Nullable, + state -> crate::SupportBundleStateEnum, + zpool_id -> Uuid, + dataset_id -> Uuid, + + assigned_nexus -> Nullable, + } +} + /* hardware inventory */ table! { @@ -2034,6 +2048,7 @@ allow_tables_to_appear_in_same_query!( console_session, sled, sled_resource, + support_bundle, router_route, vmm, volume, diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 8b663d2549..4542283aac 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(117, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(118, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(118, "support-bundles"), KnownVersion::new(117, "add-completing-and-new-region-volume"), KnownVersion::new(116, "bp-physical-disk-disposition"), KnownVersion::new(115, "inv-omicron-physical-disks-generation"), diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs new file mode 100644 index 0000000000..a4b14d363b --- /dev/null +++ b/nexus/db-model/src/support_bundle.rs @@ -0,0 +1,133 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use crate::schema::support_bundle; +use crate::typed_uuid::DbTypedUuid; + +use chrono::{DateTime, Utc}; +use nexus_types::external_api::shared::SupportBundleInfo as SupportBundleView; +use nexus_types::external_api::shared::SupportBundleState as SupportBundleStateView; +use omicron_uuid_kinds::DatasetKind; +use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::OmicronZoneKind; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SupportBundleKind; +use omicron_uuid_kinds::SupportBundleUuid; +use omicron_uuid_kinds::ZpoolKind; +use omicron_uuid_kinds::ZpoolUuid; +use serde::{Deserialize, Serialize}; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "support_bundle_state", schema = "public"))] + pub struct SupportBundleStateEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = SupportBundleStateEnum)] + pub enum SupportBundleState; + + // Enum values + Collecting => b"collecting" + Active => b"active" + Destroying => b"destroying" + Failing => b"failing" + Failed => b"failed" +); + +impl SupportBundleState { + /// Returns the list of valid prior states. + /// + /// This is used to confirm that state updates are performed legally, + /// and defines the possible state transitions. + pub fn valid_old_states(&self) -> Vec { + use SupportBundleState::*; + + match self { + Collecting => vec![], + Active => vec![Collecting], + // The "Destroying" state is terminal. + Destroying => vec![Active, Collecting, Failing], + Failing => vec![Collecting, Active], + // The "Failed" state is terminal. + Failed => vec![Active, Collecting, Failing], + } + } +} + +impl From for SupportBundleStateView { + fn from(state: SupportBundleState) -> Self { + use SupportBundleState::*; + + match state { + Collecting => SupportBundleStateView::Collecting, + Active => SupportBundleStateView::Active, + Destroying => SupportBundleStateView::Destroying, + // The distinction between "failing" and "failed" should not be + // visible to end-users. This is internal book-keeping to decide + // whether or not the bundle record can be safely deleted. + // + // Either way, it should be possible to delete the bundle. 
+ // If a user requests that we delete a bundle in these states: + // - "Failing" bundles will become "Destroying" + // - "Failed" bundles can be deleted immediately + Failing => SupportBundleStateView::Failed, + Failed => SupportBundleStateView::Failed, + } + } +} + +#[derive( + Queryable, + Insertable, + Debug, + Clone, + Selectable, + Deserialize, + Serialize, + PartialEq, +)] +#[diesel(table_name = support_bundle)] +pub struct SupportBundle { + pub id: DbTypedUuid, + pub time_created: DateTime, + pub reason_for_creation: String, + pub reason_for_failure: Option, + pub state: SupportBundleState, + pub zpool_id: DbTypedUuid, + pub dataset_id: DbTypedUuid, + pub assigned_nexus: Option>, +} + +impl SupportBundle { + pub fn new( + reason_for_creation: &'static str, + zpool_id: ZpoolUuid, + dataset_id: DatasetUuid, + nexus_id: OmicronZoneUuid, + ) -> Self { + Self { + id: SupportBundleUuid::new_v4().into(), + time_created: Utc::now(), + reason_for_creation: reason_for_creation.to_string(), + reason_for_failure: None, + state: SupportBundleState::Collecting, + zpool_id: zpool_id.into(), + dataset_id: dataset_id.into(), + assigned_nexus: Some(nexus_id.into()), + } + } +} + +impl From for SupportBundleView { + fn from(bundle: SupportBundle) -> Self { + Self { + id: bundle.id.into(), + time_created: bundle.time_created, + reason_for_creation: bundle.reason_for_creation, + reason_for_failure: bundle.reason_for_failure, + state: bundle.state.into(), + } + } +} diff --git a/nexus/db-queries/src/db/datastore/dataset.rs b/nexus/db-queries/src/db/datastore/dataset.rs index 992db3705c..938927ff5e 100644 --- a/nexus/db-queries/src/db/datastore/dataset.rs +++ b/nexus/db-queries/src/db/datastore/dataset.rs @@ -352,14 +352,13 @@ impl DataStore { #[cfg(test)] mod test { use super::*; + use crate::db::datastore::test::bp_insert_and_make_target; use crate::db::pub_test_utils::TestDatabase; use nexus_db_model::Generation; use nexus_db_model::SledBaseboard; use nexus_db_model::SledSystemHardware; use nexus_db_model::SledUpdate; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; - use nexus_types::deployment::Blueprint; - use nexus_types::deployment::BlueprintTarget; use omicron_common::api::internal::shared::DatasetKind as ApiDatasetKind; use omicron_test_utils::dev; use omicron_uuid_kinds::DatasetUuid; @@ -523,28 +522,6 @@ mod test { logctx.cleanup_successful(); } - async fn bp_insert_and_make_target( - opctx: &OpContext, - datastore: &DataStore, - bp: &Blueprint, - ) { - datastore - .blueprint_insert(opctx, bp) - .await - .expect("inserted blueprint"); - datastore - .blueprint_target_set_current( - opctx, - BlueprintTarget { - target_id: bp.id, - enabled: true, - time_made_target: Utc::now(), - }, - ) - .await - .expect("made blueprint the target"); - } - fn new_dataset_on(zpool_id: ZpoolUuid) -> Dataset { Dataset::new( DatasetUuid::new_v4(), diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index d20f24e773..3b19677fe8 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -93,6 +93,7 @@ mod sled; mod sled_instance; mod snapshot; mod ssh_key; +mod support_bundle; mod switch; mod switch_interface; mod switch_port; @@ -464,6 +465,8 @@ mod test { use nexus_db_fixed_data::silo::DEFAULT_SILO; use nexus_db_model::IpAttachState; use nexus_db_model::{to_db_typed_uuid, Generation}; + use nexus_types::deployment::Blueprint; + use nexus_types::deployment::BlueprintTarget; use 
nexus_types::external_api::params; use nexus_types::silo::DEFAULT_SILO_ID; use omicron_common::api::external::{ @@ -504,6 +507,33 @@ mod test { } } + /// Inserts a blueprint in the DB and forcibly makes it the target + /// + /// WARNING: This makes no attempts to validate the blueprint relative to + /// parents -- this is just a test-only helper to make testing + /// blueprint-specific checks easier. + pub async fn bp_insert_and_make_target( + opctx: &OpContext, + datastore: &DataStore, + bp: &Blueprint, + ) { + datastore + .blueprint_insert(opctx, bp) + .await + .expect("inserted blueprint"); + datastore + .blueprint_target_set_current( + opctx, + BlueprintTarget { + target_id: bp.id, + enabled: true, + time_made_target: Utc::now(), + }, + ) + .await + .expect("made blueprint the target"); + } + #[tokio::test] async fn test_project_creation() { let logctx = dev::test_setup_log("test_project_creation"); diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs new file mode 100644 index 0000000000..d616b4356b --- /dev/null +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -0,0 +1,1412 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`DataStore`] methods on [`SupportBundle`]s. + +use super::DataStore; +use crate::authz; +use crate::context::OpContext; +use crate::db; +use crate::db::error::public_error_from_diesel; +use crate::db::error::ErrorHandler; +use crate::db::model::Dataset; +use crate::db::model::DatasetKind; +use crate::db::model::SupportBundle; +use crate::db::model::SupportBundleState; +use crate::db::pagination::paginated; +use crate::db::update_and_check::{UpdateAndCheck, UpdateStatus}; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; +use diesel::prelude::*; +use futures::FutureExt; +use nexus_types::identity::Asset; +use omicron_common::api::external; +use omicron_common::api::external::CreateResult; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::Error; +use omicron_common::api::external::ListResultVec; +use omicron_common::api::external::LookupResult; +use omicron_uuid_kinds::BlueprintUuid; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SupportBundleUuid; +use omicron_uuid_kinds::ZpoolUuid; +use uuid::Uuid; + +const CANNOT_ALLOCATE_ERR_MSG: &'static str = +"Current policy limits support bundle creation to 'one per external disk', and \ + no disks are available. You must delete old support bundles before new ones \ + can be created"; + +const FAILURE_REASON_NO_DATASET: &'static str = + "Allocated dataset no longer exists"; +const FAILURE_REASON_NO_NEXUS: &'static str = + "Nexus managing this bundle no longer exists"; + +/// Provides a report on how many bundle were expunged, and why. +#[derive(Default, Debug, Clone, PartialEq)] +pub struct SupportBundleExpungementReport { + /// Bundles marked "failed" because the datasets storing them have been + /// expunged. + pub bundles_failed_missing_datasets: usize, + /// Bundles already in the "destroying" state that have been deleted because + /// the datasets storing them have been expunged. + pub bundles_deleted_missing_datasets: usize, + + /// Bundles marked "destroying" because the nexuses managing them have been + /// expunged. 
+ /// + /// These bundles should be re-assigned to a different nexus for cleanup. + pub bundles_failing_missing_nexus: usize, + + /// Bundles which had a new Nexus assigned to them. + pub bundles_reassigned: usize, +} + +impl DataStore { + /// Creates a new support bundle. + /// + /// Requires that the UUID of the calling Nexus be supplied as input - + /// this particular Zone is responsible for the collection process. + /// + /// Note that really any functioning Nexus would work as the "assignee", + /// but it's clear that our instance will work, because we're currently + /// running. + pub async fn support_bundle_create( + &self, + opctx: &OpContext, + reason_for_creation: &'static str, + this_nexus_id: OmicronZoneUuid, + ) -> CreateResult { + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + let conn = self.pool_connection_authorized(opctx).await?; + + #[derive(Debug)] + enum SupportBundleError { + TooManyBundles, + } + + let err = OptionalError::new(); + self.transaction_retry_wrapper("support_bundle_create") + .transaction(&conn, |conn| { + let err = err.clone(); + + async move { + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::support_bundle::dsl as support_bundle_dsl; + + // Observe all "non-deleted, debug datasets". + // + // Return the first one we find that doesn't already + // have a support bundle allocated to it. + let free_dataset = dataset_dsl::dataset + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::kind.eq(DatasetKind::Debug)) + .left_join(support_bundle_dsl::support_bundle.on( + dataset_dsl::id.eq(support_bundle_dsl::dataset_id), + )) + .filter(support_bundle_dsl::dataset_id.is_null()) + .select(Dataset::as_select()) + .first_async(&conn) + .await + .optional()?; + + let Some(dataset) = free_dataset else { + return Err( + err.bail(SupportBundleError::TooManyBundles) + ); + }; + + // We could check that "this_nexus_id" is not expunged, but + // we have some evidence that it is valid: this Nexus is + // currently running! + // + // Besides, we COULD be expunged immediately after inserting + // the SupportBundle. In this case, we'd fall back to the + // case of "clean up a bundle which is managed by an + // expunged Nexus" anyway. 
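+                    // The new record starts in the "Collecting" state and
+                    // points at the free debug dataset selected above.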
+ + let bundle = SupportBundle::new( + reason_for_creation, + ZpoolUuid::from_untyped_uuid(dataset.pool_id), + dataset.id(), + this_nexus_id, + ); + + diesel::insert_into(support_bundle_dsl::support_bundle) + .values(bundle.clone()) + .execute_async(&conn) + .await?; + + Ok(bundle) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + SupportBundleError::TooManyBundles => { + return external::Error::insufficient_capacity( + CANNOT_ALLOCATE_ERR_MSG, + "Support Bundle storage exhausted", + ); + } + } + } + public_error_from_diesel(e, ErrorHandler::Server) + }) + } + + /// Looks up a single support bundle + pub async fn support_bundle_get( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + ) -> LookupResult { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + dsl::support_bundle + .filter(dsl::id.eq(id.into_untyped_uuid())) + .select(SupportBundle::as_select()) + .first_async::(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Lists one page of support bundles + pub async fn support_bundle_list( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + paginated(dsl::support_bundle, dsl::id, pagparams) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Lists one page of support bundles in a particular state, assigned to + /// a particular Nexus. + pub async fn support_bundle_list_assigned_to_nexus( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + nexus_id: OmicronZoneUuid, + states: Vec, + ) -> ListResultVec { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + paginated(dsl::support_bundle, dsl::id, pagparams) + .filter(dsl::assigned_nexus.eq(nexus_id.into_untyped_uuid())) + .filter(dsl::state.eq_any(states)) + .order(dsl::time_created.asc()) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Marks support bundles as failed if their assigned Nexus or backing + /// dataset has been destroyed. 
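+    ///
+    /// Returns a [`SupportBundleExpungementReport`] summarizing how many
+    /// bundles were failed, deleted, or re-assigned.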
+ pub async fn support_bundle_fail_expunged( + &self, + opctx: &OpContext, + blueprint: &nexus_types::deployment::Blueprint, + ) -> Result { + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + + // For this blueprint: The set of all expunged Nexus zones + let invalid_nexus_zones = blueprint + .all_omicron_zones( + nexus_types::deployment::BlueprintZoneFilter::Expunged, + ) + .filter_map(|(_sled, zone)| { + if matches!( + zone.zone_type, + nexus_types::deployment::BlueprintZoneType::Nexus(_) + ) { + Some(zone.id.into_untyped_uuid()) + } else { + None + } + }) + .collect::>(); + let valid_nexus_zones = blueprint + .all_omicron_zones( + nexus_types::deployment::BlueprintZoneFilter::ShouldBeRunning, + ) + .filter_map(|(_sled, zone)| { + if matches!( + zone.zone_type, + nexus_types::deployment::BlueprintZoneType::Nexus(_) + ) { + Some(zone.id.into_untyped_uuid()) + } else { + None + } + }) + .collect::>(); + + // For this blueprint: The set of expunged debug datasets + let invalid_datasets = blueprint + .all_omicron_datasets( + nexus_types::deployment::BlueprintDatasetFilter::Expunged, + ) + .filter_map(|(_sled_id, dataset_config)| { + if matches!( + dataset_config.kind, + omicron_common::api::internal::shared::DatasetKind::Debug + ) { + Some(dataset_config.id.into_untyped_uuid()) + } else { + None + } + }) + .collect::>(); + + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_if_current_blueprint_is( + &conn, + "support_bundle_fail_expunged", + opctx, + BlueprintUuid::from_untyped_uuid(blueprint.id), + |conn| { + let invalid_nexus_zones = invalid_nexus_zones.clone(); + let valid_nexus_zones = valid_nexus_zones.clone(); + let invalid_datasets = invalid_datasets.clone(); + async move { + use db::schema::support_bundle::dsl; + + // Find all bundles without backing storage. + let bundles_with_bad_datasets = dsl::support_bundle + .filter(dsl::dataset_id.eq_any(invalid_datasets)) + .select(SupportBundle::as_select()) + .load_async(conn) + .await?; + + // Split these bundles into two categories: + // - Ones that are being destroyed anyway, and that can be + // fully deleted. + // - Ones that are NOT being destroyed, and should be marked + // failed so the end-user has visibility into their + // destruction. + let (bundles_to_delete, bundles_to_fail): (Vec<_>, Vec<_>) = + bundles_with_bad_datasets.into_iter().partition( + |bundle| bundle.state == SupportBundleState::Destroying + ); + let bundles_to_delete = bundles_to_delete.into_iter().map(|b| b.id).collect::>(); + let bundles_to_fail = bundles_to_fail.into_iter().map(|b| b.id).collect::>(); + + // Find all non-destroying bundles on datasets that no + // longer exist, and mark them "failed". They skip the + // "failing" state because there is no remaining storage to + // be cleaned up. + let state = SupportBundleState::Failed; + let bundles_failed_missing_datasets = + diesel::update(dsl::support_bundle) + .filter(dsl::state.eq_any(state.valid_old_states())) + .filter(dsl::id.eq_any(bundles_to_fail)) + .set(( + dsl::state.eq(state), + dsl::reason_for_failure.eq(FAILURE_REASON_NO_DATASET), + )) + .execute_async(conn) + .await?; + // For bundles that are in the process of being destroyed, + // the dataset expungement speeds up the process. 
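+                    // Their rows can be deleted outright rather than being
+                    // marked "failed" first.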
+ let bundles_deleted_missing_datasets = + diesel::delete(dsl::support_bundle) + .filter(dsl::id.eq_any(bundles_to_delete)) + // This check should be redundant (we already + // partitioned above based on this state) but out of + // an abundance of caution we don't auto-delete a + // bundle in any other state. + .filter(dsl::state.eq(SupportBundleState::Destroying)) + .execute_async(conn) + .await?; + + let Some(arbitrary_valid_nexus) = + valid_nexus_zones.get(0).cloned() + else { + return Err(external::Error::internal_error( + "No valid Nexuses, we cannot re-assign this support bundle", + ) + .into()); + }; + + // Find all bundles on nexuses that no longer exist. + let bundles_with_bad_nexuses = dsl::support_bundle + .filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones)) + .select(SupportBundle::as_select()) + .load_async(conn) + .await?; + + let bundles_to_mark_failing = bundles_with_bad_nexuses.iter() + .map(|b| b.id).collect::>(); + let bundles_to_reassign = bundles_with_bad_nexuses.iter() + .filter_map(|bundle| { + if bundle.state != SupportBundleState::Failed { + Some(bundle.id) + } else { + None + } + }).collect::>(); + + // Mark these support bundles as failing, and assign then + // to a nexus that should still exist. + // + // This should lead to their storage being freed, if it + // exists. + let state = SupportBundleState::Failing; + let bundles_failing_missing_nexus = diesel::update(dsl::support_bundle) + .filter(dsl::state.eq_any(state.valid_old_states())) + .filter(dsl::id.eq_any(bundles_to_mark_failing)) + .set(( + dsl::state.eq(state), + dsl::reason_for_failure.eq(FAILURE_REASON_NO_NEXUS), + )) + .execute_async(conn) + .await?; + let bundles_reassigned = diesel::update(dsl::support_bundle) + .filter(dsl::id.eq_any(bundles_to_reassign)) + .set(dsl::assigned_nexus.eq(arbitrary_valid_nexus)) + .execute_async(conn) + .await?; + + Ok(SupportBundleExpungementReport { + bundles_failed_missing_datasets, + bundles_deleted_missing_datasets, + bundles_failing_missing_nexus, + bundles_reassigned, + }) + } + .boxed() + }, + ) + .await + } + + /// Updates the state of a support bundle. + /// + /// Returns: + /// - "Ok" if the bundle was updated successfully. + /// - "Err::InvalidRequest" if the bundle exists, but could not be updated + /// because the state transition is invalid. + pub async fn support_bundle_update( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + state: SupportBundleState, + ) -> Result<(), Error> { + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + + use db::schema::support_bundle::dsl; + let conn = self.pool_connection_authorized(opctx).await?; + let result = diesel::update(dsl::support_bundle) + .filter(dsl::id.eq(id.into_untyped_uuid())) + .filter(dsl::state.eq_any(state.valid_old_states())) + .set(dsl::state.eq(state)) + .check_if_exists::(id.into_untyped_uuid()) + .execute_and_check(&conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + Err(Error::invalid_request(format!( + "Cannot update support bundle state from {:?} to {:?}", + result.found.state, state + ))) + } + } + } + + /// Deletes a support bundle. + /// + /// This should only be invoked after all storage for the support bundle has + /// been cleared. 
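+    ///
+    /// Only bundles in the "Destroying" or "Failed" states are removed; the
+    /// delete is a no-op for bundles in any other state.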
+ pub async fn support_bundle_delete( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + ) -> Result<(), Error> { + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + diesel::delete(dsl::support_bundle) + .filter( + dsl::state + .eq(SupportBundleState::Destroying) + .or(dsl::state.eq(SupportBundleState::Failed)), + ) + .filter(dsl::id.eq(id.into_untyped_uuid())) + .execute_async(&*conn) + .await + .map(|_rows_modified| ()) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::db::datastore::test::bp_insert_and_make_target; + use crate::db::pub_test_utils::TestDatabase; + use nexus_db_model::Generation; + use nexus_db_model::SledBaseboard; + use nexus_db_model::SledSystemHardware; + use nexus_db_model::SledUpdate; + use nexus_db_model::Zpool; + use nexus_reconfigurator_planning::example::ExampleSystemBuilder; + use nexus_reconfigurator_planning::example::SimRngState; + use nexus_types::deployment::Blueprint; + use nexus_types::deployment::BlueprintDatasetDisposition; + use nexus_types::deployment::BlueprintDatasetFilter; + use nexus_types::deployment::BlueprintZoneDisposition; + use nexus_types::deployment::BlueprintZoneFilter; + use nexus_types::deployment::BlueprintZoneType; + use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; + use omicron_test_utils::dev; + use omicron_uuid_kinds::DatasetUuid; + use omicron_uuid_kinds::PhysicalDiskUuid; + use omicron_uuid_kinds::SledUuid; + use rand::Rng; + + // Pool/Dataset pairs, for debug datasets only. + struct TestPool { + pool: ZpoolUuid, + dataset: DatasetUuid, + } + + // Sleds and their pools, with a focus on debug datasets only. + struct TestSled { + sled: SledUuid, + pools: Vec, + } + + impl TestSled { + fn new_with_pool_count(pool_count: usize) -> Self { + Self { + sled: SledUuid::new_v4(), + pools: (0..pool_count) + .map(|_| TestPool { + pool: ZpoolUuid::new_v4(), + dataset: DatasetUuid::new_v4(), + }) + .collect(), + } + } + + fn new_from_blueprint(blueprint: &Blueprint) -> Vec { + let mut sleds = vec![]; + for (sled, datasets) in &blueprint.blueprint_datasets { + let pools = datasets + .datasets + .values() + .filter_map(|dataset| { + if !matches!(dataset.kind, DebugDatasetKind) + || !dataset + .disposition + .matches(BlueprintDatasetFilter::InService) + { + return None; + }; + + Some(TestPool { + pool: dataset.pool.id(), + dataset: dataset.id, + }) + }) + .collect(); + + sleds.push(TestSled { sled: *sled, pools }); + } + sleds + } + + async fn create_database_records( + &self, + datastore: &DataStore, + opctx: &OpContext, + ) { + let rack_id = Uuid::new_v4(); + let sled = SledUpdate::new( + *self.sled.as_untyped_uuid(), + "[::1]:0".parse().unwrap(), + SledBaseboard { + serial_number: format!( + "test-{}", + rand::thread_rng().gen::() + ), + part_number: "test-pn".to_string(), + revision: 0, + }, + SledSystemHardware { + is_scrimlet: false, + usable_hardware_threads: 128, + usable_physical_ram: (64 << 30).try_into().unwrap(), + reservoir_size: (16 << 30).try_into().unwrap(), + }, + rack_id, + Generation::new(), + ); + datastore.sled_upsert(sled).await.expect("failed to upsert sled"); + + // Create fake zpools that back our fake datasets. 
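+            // Each zpool also gets a single debug dataset, which is the
+            // storage that support bundle allocation draws from.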
+ for pool in &self.pools { + let zpool = Zpool::new( + *pool.pool.as_untyped_uuid(), + *self.sled.as_untyped_uuid(), + PhysicalDiskUuid::new_v4(), + ); + datastore + .zpool_insert(opctx, zpool) + .await + .expect("failed to upsert zpool"); + + let dataset = Dataset::new( + pool.dataset, + pool.pool.into_untyped_uuid(), + None, + DebugDatasetKind, + ); + datastore + .dataset_upsert(dataset) + .await + .expect("failed to upsert dataset"); + } + } + } + + // Creates a fake sled with `pool_count` zpools, and a debug dataset on each + // zpool. + async fn create_sled_and_zpools( + datastore: &DataStore, + opctx: &OpContext, + pool_count: usize, + ) -> TestSled { + let sled = TestSled::new_with_pool_count(pool_count); + sled.create_database_records(&datastore, &opctx).await; + sled + } + + async fn support_bundle_create_expect_no_capacity( + datastore: &DataStore, + opctx: &OpContext, + this_nexus_id: OmicronZoneUuid, + ) { + let err = datastore + .support_bundle_create(&opctx, "for tests", this_nexus_id) + .await + .expect_err("Shouldn't provision bundle without datasets"); + let Error::InsufficientCapacity { message } = err else { + panic!("Unexpected error: {err:?} - we expected 'InsufficientCapacity'"); + }; + assert_eq!( + CANNOT_ALLOCATE_ERR_MSG, + message.external_message(), + "Unexpected error: {message:?}" + ); + } + + #[tokio::test] + async fn test_bundle_list_filtering() { + let logctx = dev::test_setup_log("test_bundle_create_capacity_limits"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let nexus_a = OmicronZoneUuid::new_v4(); + let nexus_b = OmicronZoneUuid::new_v4(); + + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + let pagparams = DataPageParams::max_page(); + + // No bundles exist yet, so the list should be empty + + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles"), + vec![] + ); + + // Create two bundles on "nexus A", one bundle on "nexus B" + + let bundle_a1 = datastore + .support_bundle_create(&opctx, "for the test", nexus_a) + .await + .expect("Should be able to create bundle"); + let bundle_a2 = datastore + .support_bundle_create(&opctx, "for the test", nexus_a) + .await + .expect("Should be able to create bundle"); + let bundle_b1 = datastore + .support_bundle_create(&opctx, "for the test", nexus_b) + .await + .expect("Should be able to create bundle"); + + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a1.id, bundle_a2.id,] + ); + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_b, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_b1.id,] + ); + + // When we update the state of the bundles, the list results + // should also be filtered. + datastore + .support_bundle_update( + &opctx, + bundle_a1.id.into(), + SupportBundleState::Active, + ) + .await + .expect("Should have been able to update state"); + + // "bundle_a1" is no longer collecting, so it won't appear here. 
+ assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a2.id,] + ); + + // ... but if we ask for enough states, it'll show up + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![ + SupportBundleState::Active, + SupportBundleState::Collecting + ] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a1.id, bundle_a2.id,] + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_create_capacity_limits() { + let logctx = dev::test_setup_log("test_bundle_create_capacity_limits"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let this_nexus_id = OmicronZoneUuid::new_v4(); + + // No sleds, no datasets. Allocation should fail. + + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // Create a sled with a couple pools. Allocation should succeed. + + const POOL_COUNT: usize = 2; + let _test_sled = + create_sled_and_zpools(&datastore, &opctx, POOL_COUNT).await; + let mut bundles = vec![]; + for _ in 0..POOL_COUNT { + bundles.push( + datastore + .support_bundle_create( + &opctx, + "for the test", + this_nexus_id, + ) + .await + .expect("Should be able to create bundle"), + ); + } + + // If we try to allocate any more bundles, we'll run out of capacity. + + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // If we destroy a bundle, it isn't deleted (yet). + // This operation should signify that we can start to free up + // storage on the dataset, but that needs to happen outside the + // database. + // + // We should still expect to hit capacity limits. + + datastore + .support_bundle_update( + &opctx, + bundles[0].id.into(), + SupportBundleState::Destroying, + ) + .await + .expect("Should be able to destroy this bundle"); + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // If we delete a bundle, it should be gone. This means we can + // re-allocate from that dataset which was just freed up. 
+ + datastore + .support_bundle_delete(&opctx, bundles[0].id.into()) + .await + .expect("Should be able to destroy this bundle"); + datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_crud_operations() { + let logctx = dev::test_setup_log("test_crud_operations"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let test_sled = create_sled_and_zpools(&datastore, &opctx, 1).await; + let reason = "Bundle for test"; + let this_nexus_id = OmicronZoneUuid::new_v4(); + + // Create the bundle, then observe it through the "getter" APIs + + let mut bundle = datastore + .support_bundle_create(&opctx, reason, this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.reason_for_creation, reason); + assert_eq!(bundle.reason_for_failure, None); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.zpool_id, test_sled.pools[0].pool.into()); + assert_eq!(bundle.dataset_id, test_sled.pools[0].dataset.into()); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just created"); + // Overwrite this column; it is modified slightly upon database insertion. + bundle.time_created = observed_bundle.time_created; + assert_eq!(bundle, observed_bundle); + + let pagparams = DataPageParams::max_page(); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to get bundle we just created"); + assert_eq!(1, observed_bundles.len()); + assert_eq!(bundle, observed_bundles[0]); + + // Destroy the bundle, observe the new state + + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Destroying, + ) + .await + .expect("Should be able to destroy our bundle"); + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just created"); + assert_eq!(SupportBundleState::Destroying, observed_bundle.state); + + // Delete the bundle, observe that it's gone + + datastore + .support_bundle_delete(&opctx, bundle.id.into()) + .await + .expect("Should be able to destroy our bundle"); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to query when no bundles exist"); + assert!(observed_bundles.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + fn get_nexuses_from_blueprint( + bp: &Blueprint, + filter: BlueprintZoneFilter, + ) -> Vec { + bp.blueprint_zones + .values() + .flat_map(|zones_config| { + let mut nexus_zones = vec![]; + for zone in &zones_config.zones { + if matches!(zone.zone_type, BlueprintZoneType::Nexus(_)) + && zone.disposition.matches(filter) + { + nexus_zones.push(zone.id); + } + } + nexus_zones + }) + .collect() + } + + fn get_debug_datasets_from_blueprint( + bp: &Blueprint, + filter: BlueprintDatasetFilter, + ) -> Vec { + bp.blueprint_datasets + .values() + .flat_map(|datasets_config| { + let mut debug_datasets = vec![]; + for dataset in datasets_config.datasets.values() { + if matches!(dataset.kind, DebugDatasetKind) + && dataset.disposition.matches(filter) + { + debug_datasets.push(dataset.id); + } + } + debug_datasets + }) 
+ .collect() + } + + fn expunge_dataset_for_bundle(bp: &mut Blueprint, bundle: &SupportBundle) { + for datasets in bp.blueprint_datasets.values_mut() { + for dataset in datasets.datasets.values_mut() { + if dataset.id == bundle.dataset_id.into() { + dataset.disposition = BlueprintDatasetDisposition::Expunged; + } + } + } + } + + fn expunge_nexus_for_bundle(bp: &mut Blueprint, bundle: &SupportBundle) { + for zones in bp.blueprint_zones.values_mut() { + for zone in &mut zones.zones { + if zone.id == bundle.assigned_nexus.unwrap().into() { + zone.disposition = BlueprintZoneDisposition::Expunged; + } + } + } + } + + #[tokio::test] + async fn test_bundle_failed_from_expunged_dataset() { + static TEST_NAME: &str = "test_bundle_failed_from_expunged_dataset"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + // Weirdly, the "ExampleSystemBuilder" blueprint has a parent blueprint, + // but which isn't exposed through the API. Since we're only able to see + // the blueprint it emits, that means we can't actually make it the + // target because "the parent blueprint is not the current target". + // + // Instead of dealing with that, we lie: claim this is the primordial + // blueprint, with no parent. + // + // Regardless, make this starter blueprint our target. + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let this_nexus_id = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ) + .get(0) + .map(|id| *id) + .expect("There should be a Nexus in the example blueprint"); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // If we try to "fail support bundles" from expunged datasets/nexuses, + // we should see a no-op. Nothing has been expunged yet! 
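The no-op expectation here rests on `support_bundle_fail_expunged` only acting on bundles whose dataset or assigned Nexus no longer appears in the current target blueprint. A sketch of that predicate under that assumption; `is_orphaned` is a hypothetical helper rather than the datastore's actual implementation:

    fn is_orphaned(
        bundle_dataset: DatasetUuid,
        bundle_nexus: OmicronZoneUuid,
        in_service_datasets: &[DatasetUuid],
        running_nexuses: &[OmicronZoneUuid],
    ) -> bool {
        !in_service_datasets.contains(&bundle_dataset)
            || !running_nexuses.contains(&bundle_nexus)
    }

With bp1 still the target, every bundle's dataset and Nexus remain in service, so the report that follows should come back empty.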
+ let report = + datastore.support_bundle_fail_expunged(&opctx, &bp1).await.expect( + "Should have been able to perform no-op support bundle failure", + ); + assert_eq!(SupportBundleExpungementReport::default(), report); + + // Expunge the bundle's dataset (manually) + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + datastore + .support_bundle_fail_expunged(&opctx, &bp1) + .await + .expect_err("bp1 is no longer the target; this should fail"); + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_failed_missing_datasets: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_deleted_from_expunged_dataset() { + static TEST_NAME: &str = "test_bundle_deleted_from_expunged_dataset"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + // Weirdly, the "ExampleSystemBuilder" blueprint has a parent blueprint, + // but which isn't exposed through the API. Since we're only able to see + // the blueprint it emits, that means we can't actually make it the + // target because "the parent blueprint is not the current target". + // + // Instead of dealing with that, we lie: claim this is the primordial + // blueprint, with no parent. + // + // Regardless, make this starter blueprint our target. + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let this_nexus_id = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ) + .get(0) + .map(|id| *id) + .expect("There should be a Nexus in the example blueprint"); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. 
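Unlike the previous test, this one moves the bundle to `Destroying` before its dataset is expunged, and the expected outcome changes from "mark it Failed" to "delete the row". A sketch of the assumed outcome table, mirroring the report fields asserted in these two tests; `on_dataset_expunged` is illustrative only:

    fn on_dataset_expunged(state: SupportBundleState) -> &'static str {
        match state {
            // Already being torn down: the record can simply be removed
            // (counted as bundles_deleted_missing_datasets).
            SupportBundleState::Destroying => "delete the row",
            // Otherwise the underlying storage is gone, so the bundle is
            // marked Failed (counted as bundles_failed_missing_datasets).
            _ => "mark Failed",
        }
    }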
+ let bundle = datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Start the deletion of this bundle + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Destroying, + ) + .await + .expect("Should have been able to update state"); + + // If we try to "fail support bundles" from expunged datasets/nexuses, + // we should see a no-op. Nothing has been expunged yet! + let report = + datastore.support_bundle_fail_expunged(&opctx, &bp1).await.expect( + "Should have been able to perform no-op support bundle failure", + ); + assert_eq!(SupportBundleExpungementReport::default(), report); + + // Expunge the bundle's dataset (manually) + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + datastore + .support_bundle_fail_expunged(&opctx, &bp1) + .await + .expect_err("bp1 is no longer the target; this should fail"); + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_deleted_missing_datasets: 1, + ..Default::default() + }, + report + ); + + // Should observe no bundles (it should have been deleted) + let pagparams = DataPageParams::max_page(); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to query when no bundles exist"); + assert!(observed_bundles.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_failed_from_expunged_nexus_no_reassign() { + static TEST_NAME: &str = + "test_bundle_failed_from_expunged_nexus_no_reassign"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let nexus_ids = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. 
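The bp2/bp3 constructions in this test repeat a pattern used throughout this module: clone the current target blueprint, give the clone a fresh id with the old target as its parent, mutate it, then insert it and make it the new target. A small helper capturing the first half of that dance might look like the following; `child_blueprint` is hypothetical, and the tests simply inline it:

    fn child_blueprint(parent: &Blueprint) -> Blueprint {
        let mut child = parent.clone();
        child.id = Uuid::new_v4();
        child.parent_blueprint_id = Some(parent.id);
        child
    }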
+ let bundle = datastore + .support_bundle_create(&opctx, "for the test", nexus_ids[0]) + .await + .expect("Should be able to create bundle"); + + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.assigned_nexus, Some(nexus_ids[0].into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Expunge the bundle's dataset. This marks it as "failed", and + // is a prerequisite for the bundle not later being re-assigned. + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_failed_missing_datasets: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + // Expunge the bundle's Nexus + let bp3 = { + let mut bp3 = bp2.clone(); + bp3.id = Uuid::new_v4(); + bp3.parent_blueprint_id = Some(bp2.id); + expunge_nexus_for_bundle(&mut bp3, &bundle); + bp3 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp3).await; + + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp3) + .await + .expect("Should have been able to mark bundle state as failed"); + + // Although the record for this bundle already exists, it is not + // re-assigned, and the original reason for it failing (dataset loss) is + // preserved. + assert_eq!(SupportBundleExpungementReport::default(), report); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + datastore + .support_bundle_delete(&opctx, bundle.id.into()) + .await + .expect("Should have been able to delete support bundle"); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_failed_from_expunged_nexus_with_reassign() { + static TEST_NAME: &str = + "test_bundle_failed_from_expunged_nexus_with_reassign"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. 
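This test and the previous one together pin down how an expunged Nexus is assumed to be handled, depending on the bundle's state at that moment. A sketch of that mapping; `on_nexus_expunged` is illustrative, and states not exercised by these tests are left unspecified:

    fn on_nexus_expunged(state: SupportBundleState) -> &'static str {
        match state {
            // Storage still exists and some Nexus must tear it down, so the
            // bundle is marked Failing and handed to a live Nexus (counted
            // as bundles_failing_missing_nexus and bundles_reassigned).
            SupportBundleState::Active => "mark Failing and reassign",
            // Already failed (its dataset was lost first): left untouched,
            // preserving the original reason_for_failure.
            SupportBundleState::Failed => "leave as-is",
            // Not exercised by these tests.
            _ => "unspecified here",
        }
    }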
+ let nexus_ids = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", nexus_ids[0]) + .await + .expect("Should be able to create bundle"); + + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.assigned_nexus, Some(nexus_ids[0].into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Update the bundle's state. + // + // This is what we would do when we finish collecting, and + // provisioned storage on a sled. + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Active, + ) + .await + .expect("Should have been able to update state"); + + // Expunge the bundle's Nexus (manually) + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_nexus_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as destroying"); + + assert_eq!( + SupportBundleExpungementReport { + bundles_failing_missing_nexus: 1, + bundles_reassigned: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failing, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_NEXUS)); + + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 67b8ab0041..ce6764bd17 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2395,6 +2395,66 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo_artifact ( /*******************************************************************/ +/* + * Support Bundles + */ + + +CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( + -- The bundle is currently being created. + -- + -- It might have storage that is partially allocated on a sled. + 'collecting', + + -- The bundle has been collected successfully, and has storage on + -- a particular sled. + 'active', + + -- The user has explicitly requested that a bundle be destroyed. + -- We must ensure that storage backing that bundle is gone before + -- it is automatically deleted. + 'destroying', + + -- The support bundle is failing. + -- This happens when Nexus is expunged partway through collection. + -- + -- A different Nexus must ensure that storage is gone before the + -- bundle can be marked "failed". + 'failing', + + -- The bundle has finished failing. + -- + -- The only action that can be taken on this bundle is to delete it. 
+ 'failed' +); + +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( + id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + reason_for_creation TEXT NOT NULL, + reason_for_failure TEXT, + state omicron.public.support_bundle_state NOT NULL, + zpool_id UUID NOT NULL, + dataset_id UUID NOT NULL, + + -- The Nexus which is in charge of collecting the support bundle, + -- and later managing its storage. + assigned_nexus UUID +); + +-- The "UNIQUE" part of this index helps enforce that we allow one support bundle +-- per debug dataset. This constraint can be removed, if the query responsible +-- for allocation changes to allocate more intelligently. +CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.support_bundle ( + dataset_id +); + +CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle ( + assigned_nexus +); + +/*******************************************************************/ + /* * DNS Propagation * @@ -4697,7 +4757,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '117.0.0', NULL) + (TRUE, NOW(), NOW(), '118.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/support-bundles/up01.sql b/schema/crdb/support-bundles/up01.sql new file mode 100644 index 0000000000..1260e79e18 --- /dev/null +++ b/schema/crdb/support-bundles/up01.sql @@ -0,0 +1,28 @@ +CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( + -- The bundle is currently being created. + -- + -- It might have storage that is partially allocated on a sled. + 'collecting', + + -- The bundle has been collected successfully, and has storage on + -- a particular sled. + 'active', + + -- The user has explicitly requested that a bundle be destroyed. + -- We must ensure that storage backing that bundle is gone before + -- it is automatically deleted. + 'destroying', + + -- The support bundle is failing. + -- This happens when Nexus is expunged partway through collection. + -- + -- A different Nexus must ensure that storage is gone before the + -- bundle can be marked "failed". + 'failing', + + -- The bundle has finished failing. + -- + -- The only action that can be taken on this bundle is to delete it. + 'failed' +); + diff --git a/schema/crdb/support-bundles/up02.sql b/schema/crdb/support-bundles/up02.sql new file mode 100644 index 0000000000..bc61e70480 --- /dev/null +++ b/schema/crdb/support-bundles/up02.sql @@ -0,0 +1,14 @@ +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( + id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + reason_for_creation TEXT NOT NULL, + reason_for_failure TEXT, + state omicron.public.support_bundle_state NOT NULL, + zpool_id UUID NOT NULL, + dataset_id UUID NOT NULL, + + -- The Nexus which is in charge of collecting the support bundle, + -- and later managing its storage. 
+ assigned_nexus UUID +); + diff --git a/schema/crdb/support-bundles/up03.sql b/schema/crdb/support-bundles/up03.sql new file mode 100644 index 0000000000..8d4bdb47ca --- /dev/null +++ b/schema/crdb/support-bundles/up03.sql @@ -0,0 +1,4 @@ +CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.support_bundle ( + dataset_id +); + diff --git a/schema/crdb/support-bundles/up04.sql b/schema/crdb/support-bundles/up04.sql new file mode 100644 index 0000000000..58b903e500 --- /dev/null +++ b/schema/crdb/support-bundles/up04.sql @@ -0,0 +1,4 @@ +CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle ( + assigned_nexus +); + From 12b65a3078be13343e4ac3be1498f25ad6505c85 Mon Sep 17 00:00:00 2001 From: Rain Date: Mon, 6 Jan 2025 16:29:08 -0800 Subject: [PATCH 3/8] [3/n] [omicron-package] update omicron-zone-package (#7296) This pulls in some API changes, particularly around newtypes for package and service names. --- Cargo.lock | 69 +++++++++++------------------- Cargo.toml | 2 +- clients/dpd-client/build.rs | 6 ++- dev-tools/releng/src/main.rs | 66 ++++++++++++++-------------- package/src/bin/omicron-package.rs | 28 ++++++------ package/src/cargo_plan.rs | 12 +++--- package/src/config.rs | 16 +++---- package/src/dot.rs | 13 +++--- package/src/lib.rs | 5 ++- package/src/target.rs | 16 +++---- workspace-hack/Cargo.toml | 8 ++-- 11 files changed, 116 insertions(+), 125 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fe8ac9704f..5d2b7392af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -718,7 +718,7 @@ dependencies = [ "bitflags 2.6.0", "cexpr", "clang-sys", - "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "lazycell", "log", @@ -3162,7 +3162,7 @@ dependencies = [ "futures-core", "futures-sink", "nanorand", - "spin 0.9.8", + "spin", ] [[package]] @@ -3792,7 +3792,7 @@ dependencies = [ "atomic-polyfill", "hash32 0.2.1", "rustc_version 0.4.1", - "spin 0.9.8", + "spin", "stable_deref_trait", ] @@ -5051,7 +5051,7 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" dependencies = [ - "spin 0.9.8", + "spin", ] [[package]] @@ -5153,7 +5153,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -5622,7 +5622,7 @@ dependencies = [ "httparse", "memchr", "mime", - "spin 0.9.8", + "spin", "version_check", ] @@ -7057,7 +7057,7 @@ dependencies = [ "ref-cast", "regex", "reqwest", - "ring 0.17.8", + "ring", "rustls 0.22.4", "rustls-pemfile 2.2.0", "samael", @@ -7178,7 +7178,7 @@ dependencies = [ "petgraph", "rayon", "reqwest", - "ring 0.17.8", + "ring", "semver 1.0.23", "serde", "shell-words", @@ -7391,7 +7391,7 @@ dependencies = [ "rcgen", "regex", "reqwest", - "ring 0.17.8", + "ring", "rustls 0.22.4", "serde", "slog", @@ -7468,7 +7468,6 @@ dependencies = [ "getrandom", "group", "hashbrown 0.15.1", - "heck 0.4.1", "hex", "hickory-proto", "hmac", @@ -7479,6 +7478,7 @@ dependencies = [ "indicatif", "inout", "itertools 0.10.5", + "itertools 0.12.1", "lalrpop-util", "lazy_static", "libc", @@ -7526,7 +7526,7 @@ dependencies = [ "similar", "slog", "smallvec 1.13.2", - "spin 0.9.8", + "spin", "string_cache", "subtle", "syn 1.0.109", @@ -7557,9 +7557,9 @@ dependencies = [ [[package]] name = "omicron-zone-package" -version = "0.11.1" +version = "0.12.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5cd917080e98f2e325ad3d803432be83fbe7eb9ccabb0f0f20390072982f550" +checksum = "8c74f14c5ac8ea6fe525e496c3d4b984e658857a1a34c758bc7a16a632ec9235" dependencies = [ "anyhow", "async-trait", @@ -7573,11 +7573,11 @@ dependencies = [ "futures-util", "hex", "reqwest", - "ring 0.16.20", "semver 1.0.23", "serde", "serde_derive", "serde_json", + "sha2", "slog", "tar", "thiserror 1.0.69", @@ -9242,7 +9242,7 @@ checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" dependencies = [ "bytes", "rand", - "ring 0.17.8", + "ring", "rustc-hash 2.0.0", "rustls 0.23.19", "slab", @@ -9425,7 +9425,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" dependencies = [ "pem", - "ring 0.17.8", + "ring", "time", "yasna", ] @@ -9700,21 +9700,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "ring" -version = "0.16.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin 0.5.2", - "untrusted 0.7.1", - "web-sys", - "winapi", -] - [[package]] name = "ring" version = "0.17.8" @@ -9725,7 +9710,7 @@ dependencies = [ "cfg-if", "getrandom", "libc", - "spin 0.9.8", + "spin", "untrusted 0.9.0", "windows-sys 0.52.0", ] @@ -10024,7 +10009,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", - "ring 0.17.8", + "ring", "rustls-webpki 0.101.7", "sct", ] @@ -10036,7 +10021,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", - "ring 0.17.8", + "ring", "rustls-pki-types", "rustls-webpki 0.102.8", "subtle", @@ -10052,7 +10037,7 @@ dependencies = [ "aws-lc-rs", "log", "once_cell", - "ring 0.17.8", + "ring", "rustls-pki-types", "rustls-webpki 0.102.8", "subtle", @@ -10102,7 +10087,7 @@ version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "ring 0.17.8", + "ring", "untrusted 0.9.0", ] @@ -10113,7 +10098,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ "aws-lc-rs", - "ring 0.17.8", + "ring", "rustls-pki-types", "untrusted 0.9.0", ] @@ -10325,7 +10310,7 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.17.8", + "ring", "untrusted 0.9.0", ] @@ -11096,7 +11081,7 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.87", @@ -11134,12 +11119,6 @@ dependencies = [ "toml 0.8.19", ] -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - [[package]] name = "spin" version = "0.9.8" diff --git a/Cargo.toml b/Cargo.toml index 535afb94b7..8ea13cce5b 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -499,7 +499,7 @@ omicron-rpaths = { path = "rpaths" } omicron-sled-agent = { path = "sled-agent" } omicron-test-utils = { path = "test-utils" } omicron-workspace-hack = "0.1.0" -omicron-zone-package = "0.11.1" +omicron-zone-package = "0.12.0" oxide-client = { path = "clients/oxide-client" } oxide-vpc = { git = "https://github.com/oxidecomputer/opte", rev = "b56afeeb14e0042cbd7bda85b166ed86ee17820e", features = [ "api", "std" ] } oxlog = { path = "dev-tools/oxlog" } diff --git a/clients/dpd-client/build.rs b/clients/dpd-client/build.rs index 313c1a452f..3f14fe7cc7 100644 --- a/clients/dpd-client/build.rs +++ b/clients/dpd-client/build.rs @@ -14,12 +14,16 @@ use anyhow::bail; use anyhow::Context; use anyhow::Result; use omicron_zone_package::config::Config; +use omicron_zone_package::config::PackageName; use omicron_zone_package::package::PackageSource; use quote::quote; use std::env; use std::fs; use std::path::Path; +const DENDRITE_ASIC_PACKAGE: PackageName = + PackageName::new_const("dendrite-asic"); + fn main() -> Result<()> { // Find the current dendrite repo commit from our package manifest. let manifest = fs::read_to_string("../../package-manifest.toml") @@ -31,7 +35,7 @@ fn main() -> Result<()> { let dendrite = config .packages - .get("dendrite-asic") + .get(&DENDRITE_ASIC_PACKAGE) .context("missing dendrite package in ../../package-manifest.toml")?; let local_path = match &dendrite.source { diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index 9f95344862..b20b7729fb 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -19,6 +19,7 @@ use chrono::Utc; use clap::Parser; use fs_err::tokio as fs; use omicron_zone_package::config::Config; +use omicron_zone_package::config::PackageName; use once_cell::sync::Lazy; use semver::Version; use slog::debug; @@ -56,34 +57,33 @@ enum InstallMethod { } /// Packages to install or bundle in the host OS image. -const HOST_IMAGE_PACKAGES: [(&str, InstallMethod); 8] = [ - ("mg-ddm-gz", InstallMethod::Install), - ("omicron-sled-agent", InstallMethod::Install), - ("overlay", InstallMethod::Bundle), - ("oxlog", InstallMethod::Install), - ("propolis-server", InstallMethod::Bundle), - ("pumpkind-gz", InstallMethod::Install), - ("crucible-dtrace", InstallMethod::Install), - ("switch-asic", InstallMethod::Bundle), +const HOST_IMAGE_PACKAGES: [(&PackageName, InstallMethod); 8] = [ + (&PackageName::new_const("mg-ddm-gz"), InstallMethod::Install), + (&PackageName::new_const("omicron-sled-agent"), InstallMethod::Install), + (&PackageName::new_const("overlay"), InstallMethod::Bundle), + (&PackageName::new_const("oxlog"), InstallMethod::Install), + (&PackageName::new_const("propolis-server"), InstallMethod::Bundle), + (&PackageName::new_const("pumpkind-gz"), InstallMethod::Install), + (&PackageName::new_const("crucible-dtrace"), InstallMethod::Install), + (&PackageName::new_const("switch-asic"), InstallMethod::Bundle), ]; /// Packages to install or bundle in the recovery (trampoline) OS image. -const RECOVERY_IMAGE_PACKAGES: [(&str, InstallMethod); 2] = [ - ("installinator", InstallMethod::Install), - ("mg-ddm-gz", InstallMethod::Install), +const RECOVERY_IMAGE_PACKAGES: [(&PackageName, InstallMethod); 2] = [ + (&PackageName::new_const("installinator"), InstallMethod::Install), + (&PackageName::new_const("mg-ddm-gz"), InstallMethod::Install), ]; -/// Packages to ship with the TUF repo. 
-const TUF_PACKAGES: [&str; 11] = [ - "clickhouse_keeper", - "clickhouse", - "cockroachdb", - "crucible-pantry-zone", - "crucible-zone", - "external-dns", - "internal-dns", - "nexus", - "ntp", - "oximeter", - "probe", +const TUF_PACKAGES: [&PackageName; 11] = [ + &PackageName::new_const("clickhouse_keeper"), + &PackageName::new_const("clickhouse"), + &PackageName::new_const("cockroachdb"), + &PackageName::new_const("crucible-pantry-zone"), + &PackageName::new_const("crucible-zone"), + &PackageName::new_const("external-dns"), + &PackageName::new_const("internal-dns"), + &PackageName::new_const("nexus"), + &PackageName::new_const("ntp"), + &PackageName::new_const("oximeter"), + &PackageName::new_const("probe"), ]; const HELIOS_REPO: &str = "https://pkg.oxide.computer/helios/2/dev/"; @@ -430,7 +430,7 @@ async fn main() -> Result<()> { "--artifacts", $target.artifacts_path(&args).as_str(), "stamp", - package, + package.as_str(), &version_str, ]) .env_remove("CARGO_MANIFEST_DIR"), @@ -655,14 +655,16 @@ impl Target { } } - fn proto_packages(self) -> &'static [(&'static str, InstallMethod)] { + fn proto_packages( + self, + ) -> &'static [(&'static PackageName, InstallMethod)] { match self { Target::Host => &HOST_IMAGE_PACKAGES, Target::Recovery => &RECOVERY_IMAGE_PACKAGES, } } - fn proto_package_names(self) -> impl Iterator { + fn proto_package_names(self) -> impl Iterator { self.proto_packages().iter().map(|(name, _)| *name) } @@ -694,7 +696,7 @@ impl std::fmt::Display for Target { async fn build_proto_area( mut package_dir: Utf8PathBuf, proto_dir: Utf8PathBuf, - packages: &'static [(&'static str, InstallMethod)], + packages: &'static [(&'static PackageName, InstallMethod)], manifest: Arc, ) -> Result<()> { let opt_oxide = proto_dir.join("root/opt/oxide"); @@ -709,7 +711,7 @@ async fn build_proto_area( manifest.packages.get(package_name).expect("checked in preflight"); match method { InstallMethod::Install => { - let path = opt_oxide.join(&package.service_name); + let path = opt_oxide.join(package.service_name.as_str()); fs::create_dir(&path).await?; let cloned_path = path.clone(); @@ -717,7 +719,7 @@ async fn build_proto_area( tokio::task::spawn_blocking(move || -> Result<()> { let mut archive = tar::Archive::new(std::fs::File::open( cloned_package_dir - .join(package_name) + .join(package_name.as_str()) .with_extension("tar"), )?); archive.unpack(cloned_path).with_context(|| { @@ -733,7 +735,7 @@ async fn build_proto_area( fs::rename( smf_manifest, manifest_site - .join(&package.service_name) + .join(package.service_name.as_str()) .with_extension("xml"), ) .await?; diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index 2cb0512169..a22312c42a 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -14,10 +14,10 @@ use omicron_package::cargo_plan::build_cargo_plan; use omicron_package::config::{Config, ConfigArgs}; use omicron_package::target::{target_command_help, KnownTarget}; use omicron_package::{parse, BuildCommand, DeployCommand, TargetCommand}; -use omicron_zone_package::config::Config as PackageConfig; +use omicron_zone_package::config::{Config as PackageConfig, PackageName}; use omicron_zone_package::package::{Package, PackageOutput, PackageSource}; use omicron_zone_package::progress::Progress; -use omicron_zone_package::target::Target; +use omicron_zone_package::target::TargetMap; use rayon::prelude::*; use ring::digest::{Context as DigestContext, Digest, SHA256}; use 
sled_hardware::cleanup::cleanup_networking_resources; @@ -32,6 +32,9 @@ use std::time::Duration; use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader}; use tokio::process::Command; +const OMICRON_SLED_AGENT: PackageName = + PackageName::new_const("omicron-sled-agent"); + /// All packaging subcommands. #[derive(Debug, Subcommand)] enum SubCommand { @@ -181,7 +184,7 @@ async fn do_target( )?; let (name, path) = get_required_named_target(&target_dir, name)?; - tokio::fs::write(&path, Target::from(target).to_string()) + tokio::fs::write(&path, TargetMap::from(target).to_string()) .await .with_context(|| { format!("failed to write target to {}", path) @@ -266,7 +269,7 @@ async fn replace_active_link( let dst = target_dir.join(Config::ACTIVE); if !target_dir.join(src).exists() { - bail!("Target file {} does not exist", src); + bail!("TargetMap file {} does not exist", src); } let _ = tokio::fs::remove_file(&dst).await; tokio::fs::symlink(src, &dst).await.with_context(|| { @@ -301,7 +304,7 @@ async fn get_sha256_digest(path: &Utf8PathBuf) -> Result { async fn download_prebuilt( progress: &PackageProgress, - package_name: &str, + package_name: &PackageName, repo: &str, commit: &str, expected_digest: &Vec, @@ -368,7 +371,7 @@ async fn download_prebuilt( async fn ensure_package( config: &Config, ui: &Arc, - package_name: &String, + package_name: &PackageName, package: &Package, output_directory: &Utf8Path, disable_cache: bool, @@ -497,7 +500,7 @@ async fn do_package( async fn do_stamp( config: &Config, output_directory: &Utf8Path, - package_name: &str, + package_name: &PackageName, version: &semver::Version, ) -> Result<()> { // Find the package which should be stamped @@ -506,7 +509,7 @@ async fn do_stamp( .packages_to_deploy(config.target()) .0 .into_iter() - .find(|(name, _pkg)| name.as_str() == package_name) + .find(|(name, _pkg)| *name == package_name) .ok_or_else(|| anyhow!("Package {package_name} not found"))?; // Stamp it @@ -533,8 +536,7 @@ async fn do_unpack( |(package_name, package)| -> Result<()> { let tarfile = package.get_output_path(&package_name, artifact_dir); let src = tarfile.as_path(); - let dst = - package.get_output_path(&package.service_name, install_dir); + let dst = package.get_output_path_for_service(install_dir); info!( config.log(), "Installing service"; @@ -566,7 +568,7 @@ async fn do_unpack( for service_name in global_zone_service_names { let tar_path = install_dir.join(format!("{}.tar", service_name)); - let service_path = install_dir.join(service_name); + let service_path = install_dir.join(service_name.as_str()); info!( config.log(), "Unpacking service tarball"; @@ -588,10 +590,10 @@ fn do_activate(config: &Config, install_dir: &Utf8Path) -> Result<()> { // Install the bootstrap service, which itself extracts and // installs other services. 
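Most of the churn in this patch is mechanical: bare `&str`/`String` package identifiers become the `PackageName` newtype from omicron-zone-package 0.12, with `.as_str()` used wherever a plain string is still required (paths, tarball names, cargo arguments). A minimal sketch of the pattern, using only constructors that already appear in this diff; `tarball_name` is a hypothetical helper:

    fn tarball_name(package: &PackageName) -> String {
        format!("{}.tar", package.as_str())
    }

    // e.g. tarball_name(&OMICRON_SLED_AGENT) == "omicron-sled-agent.tar"

The `if let` just below follows the same pattern, looking up the sled agent package via the typed `OMICRON_SLED_AGENT` constant instead of a string literal.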
if let Some(package) = - config.package_config().packages.get("omicron-sled-agent") + config.package_config().packages.get(&OMICRON_SLED_AGENT) { let manifest_path = install_dir - .join(&package.service_name) + .join(package.service_name.as_str()) .join("pkg") .join("manifest.xml"); info!( diff --git a/package/src/cargo_plan.rs b/package/src/cargo_plan.rs index 1a32b199fb..88702452a2 100644 --- a/package/src/cargo_plan.rs +++ b/package/src/cargo_plan.rs @@ -11,6 +11,7 @@ use anyhow::Context; use anyhow::Result; use cargo_metadata::Metadata; use omicron_zone_package::config::PackageMap; +use omicron_zone_package::config::PackageName; use omicron_zone_package::package::PackageSource; use slog::info; use slog::Logger; @@ -46,9 +47,10 @@ pub fn build_cargo_plan<'a>( // Add the package name to the plan plan.packages.insert(name); // Get the package metadata - let metadata = workspace_pkgs.get(name).with_context(|| { - format!("package '{name}' is not a workspace package") - })?; + let metadata = + workspace_pkgs.get(name.as_str()).with_context(|| { + format!("package '{name}' is not a workspace package") + })?; // Add the binaries we want to build to the plan let bins = metadata .targets @@ -92,7 +94,7 @@ impl CargoPlan<'_> { #[derive(Debug)] pub struct CargoTargets<'a> { pub kind: BuildKind, - pub packages: BTreeSet<&'a String>, + pub packages: BTreeSet<&'a PackageName>, pub bins: BTreeSet<&'a String>, pub features: BTreeSet<&'a String>, } @@ -120,7 +122,7 @@ impl CargoTargets<'_> { // --package, and without any --package options Cargo unifies features // across all workspace default members. See rust-lang/cargo#8157. for package in &self.packages { - cmd.arg("--package").arg(package); + cmd.arg("--package").arg(package.as_str()); } for bin in &self.bins { cmd.arg("--bin").arg(bin); diff --git a/package/src/config.rs b/package/src/config.rs index 800af7a0de..1efd523496 100644 --- a/package/src/config.rs +++ b/package/src/config.rs @@ -6,9 +6,9 @@ use anyhow::{bail, Result}; use camino::Utf8Path; use clap::Args; use omicron_zone_package::{ - config::{Config as PackageConfig, PackageMap}, + config::{Config as PackageConfig, PackageMap, PackageName}, package::PackageSource, - target::Target, + target::TargetMap, }; use slog::{debug, Logger}; use std::{collections::BTreeMap, io::Write, str::FromStr, time::Duration}; @@ -50,9 +50,9 @@ pub struct Config { // Description of all possible packages. package_config: PackageConfig, // Description of the target we're trying to operate on. - target: Target, + target: TargetMap, // The list of packages the user wants us to build (all, if empty) - only: Vec, + only: Vec, // True if we should skip confirmations for destructive operations. force: bool, // Number of times to retry failed downloads. @@ -91,7 +91,7 @@ impl Config { target_help_str() ); })?; - let target: Target = KnownTarget::from_str(&raw_target) + let target: TargetMap = KnownTarget::from_str(&raw_target) .inspect_err(|_| { eprintln!( "Failed to parse {} as target\n{}", @@ -115,7 +115,7 @@ impl Config { /// Sets the `only` field. #[inline] - pub fn set_only(&mut self, only: Vec) -> &mut Self { + pub fn set_only(&mut self, only: Vec) -> &mut Self { self.only = only; self } @@ -128,7 +128,7 @@ impl Config { /// Returns the target currently being operated on. 
#[inline] - pub fn target(&self) -> &Target { + pub fn target(&self) -> &TargetMap { &self.target } @@ -212,7 +212,7 @@ impl Config { output ) }); - if dep_name.as_str() == package_name { + if *dep_name == package_name { panic!("'{}' depends on itself", package_name); } // if we've seen this package already, it will be in diff --git a/package/src/dot.rs b/package/src/dot.rs index 141adcf368..875c626702 100644 --- a/package/src/dot.rs +++ b/package/src/dot.rs @@ -4,7 +4,7 @@ use anyhow::anyhow; use omicron_zone_package::config::Config; use omicron_zone_package::package::PackageOutput; use omicron_zone_package::package::PackageSource; -use omicron_zone_package::target::Target; +use omicron_zone_package::target::TargetMap; use petgraph::dot::Dot; use petgraph::graph::EdgeReference; use petgraph::graph::NodeIndex; @@ -70,7 +70,7 @@ impl std::fmt::Display for GraphNode { // Returns a string that can be passed to dot(1) to visualize a package manifest pub fn do_dot( - target: &Target, + target: &TargetMap, package_config: &Config, ) -> anyhow::Result { let packages = &package_config.packages; @@ -173,9 +173,10 @@ pub fn do_dot( let dep_node = pkg_for_output.get(dependency).ok_or_else(|| { anyhow!( - "package {:?} has dependency on {:?}, which does \ - not correspond to the output of any package", - pkgname, dependency) + "package \"{}\" has dependency on {:?}, which does \ + not correspond to the output of any package", + pkgname, dependency, + ) })?; graph.add_edge(*pkg_node, **dep_node, "merge"); } @@ -295,7 +296,7 @@ mod test { fn dot_output_for(raw_toml: &str) -> Result { let package_config = parse_manifest(raw_toml).expect("test toml was invalid"); - do_dot(&Target::default(), &package_config) + do_dot(&TargetMap::default(), &package_config) } #[test] diff --git a/package/src/lib.rs b/package/src/lib.rs index 8ef9a4c951..969cf6db21 100644 --- a/package/src/lib.rs +++ b/package/src/lib.rs @@ -2,6 +2,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; +use omicron_zone_package::config::PackageName; use serde::de::DeserializeOwned; use thiserror::Error; @@ -122,12 +123,12 @@ pub enum BuildCommand { disable_cache: bool, /// Limit to building only these packages #[clap(long)] - only: Vec, + only: Vec, }, /// Stamps semver versions onto packages within a manifest Stamp { /// The name of the artifact to be stamped. - package_name: String, + package_name: PackageName, /// The version to be stamped onto the package. version: semver::Version, diff --git a/package/src/target.rs b/package/src/target.rs index d56f7e87c5..bf2c151443 100644 --- a/package/src/target.rs +++ b/package/src/target.rs @@ -6,7 +6,7 @@ use anyhow::{bail, Result}; use clap::ValueEnum; -use omicron_zone_package::target::Target; +use omicron_zone_package::target::TargetMap; use std::collections::BTreeMap; /// Type of OS image to build @@ -74,7 +74,7 @@ pub enum ClickhouseTopology { SingleNode, } -/// A strongly-typed variant of [Target]. +/// A strongly-typed variant of [`TargetMap`]. 
#[derive(Clone, Debug)] pub struct KnownTarget { image: Image, @@ -123,8 +123,8 @@ impl Default for KnownTarget { } } -impl From for Target { - fn from(kt: KnownTarget) -> Target { +impl From for TargetMap { + fn from(kt: KnownTarget) -> TargetMap { let mut map = BTreeMap::new(); map.insert("image".to_string(), kt.image.to_string()); if let Some(machine) = kt.machine { @@ -138,13 +138,13 @@ impl From for Target { "clickhouse-topology".to_string(), kt.clickhouse_topology.to_string(), ); - Target(map) + TargetMap(map) } } impl std::fmt::Display for KnownTarget { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let target: Target = self.clone().into(); + let target: TargetMap = self.clone().into(); target.fmt(f) } } @@ -153,7 +153,7 @@ impl std::str::FromStr for KnownTarget { type Err = anyhow::Error; fn from_str(s: &str) -> Result { - let target = Target::from_str(s)?; + let target = TargetMap::from_str(s)?; let mut image = Self::default().image; let mut machine = None; @@ -181,7 +181,7 @@ impl std::str::FromStr for KnownTarget { _ => { bail!( "Unknown target key {k}\nValid keys include: [{}]", - Target::from(Self::default()) + TargetMap::from(Self::default()) .0 .keys() .cloned() diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 9b5de946f9..0a9313b75a 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -63,14 +63,14 @@ generic-array = { version = "0.14.7", default-features = false, features = ["mor getrandom = { version = "0.2.15", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] } hashbrown = { version = "0.15.1" } -heck = { version = "0.4.1" } hex = { version = "0.4.3", features = ["serde"] } hickory-proto = { version = "0.24.1", features = ["text-parsing"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "1.5.0", features = ["full"] } indexmap = { version = "2.6.0", features = ["serde"] } inout = { version = "0.1.3", default-features = false, features = ["std"] } -itertools = { version = "0.10.5" } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } lalrpop-util = { version = "0.19.12" } lazy_static = { version = "1.5.0", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2.162", features = ["extra_traits"] } @@ -184,14 +184,14 @@ generic-array = { version = "0.14.7", default-features = false, features = ["mor getrandom = { version = "0.2.15", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] } hashbrown = { version = "0.15.1" } -heck = { version = "0.4.1" } hex = { version = "0.4.3", features = ["serde"] } hickory-proto = { version = "0.24.1", features = ["text-parsing"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "1.5.0", features = ["full"] } indexmap = { version = "2.6.0", features = ["serde"] } inout = { version = "0.1.3", default-features = false, features = ["std"] } -itertools = { version = "0.10.5" } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } lalrpop-util = { version = "0.19.12" } lazy_static = { version = "1.5.0", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2.162", 
features = ["extra_traits"] } From 390ac60378fa3aee5aae4546fa4e70cd2d063833 Mon Sep 17 00:00:00 2001 From: Rain Date: Mon, 6 Jan 2025 18:26:15 -0800 Subject: [PATCH 4/8] [4/n] [omicron-package] add and use target presets (#7288) The overall goal of this change is to ensure that all release-related configuration is present in `package-manifest.toml`. This will allow linting and SBOM generation based on this config, rather than the knowledge being scattered across omicron-package and the releng tool. --- .github/buildomat/jobs/package.sh | 2 +- .github/workflows/rust.yml | 2 +- Cargo.lock | 1 + dev-tools/releng/src/main.rs | 23 +--- docs/how-to-run.adoc | 51 +++++-- package-manifest.toml | 42 +++++- package/Cargo.toml | 1 + package/src/bin/omicron-package.rs | 84 ++++-------- package/src/cargo_plan.rs | 105 ++++++++++++++- package/src/config.rs | 206 +++++++++++++++++++++++++++-- package/src/lib.rs | 62 ++++----- package/src/target.rs | 128 +++++++++++------- 12 files changed, 531 insertions(+), 176 deletions(-) diff --git a/.github/buildomat/jobs/package.sh b/.github/buildomat/jobs/package.sh index 7a2cc2c369..53b2e960da 100755 --- a/.github/buildomat/jobs/package.sh +++ b/.github/buildomat/jobs/package.sh @@ -28,7 +28,7 @@ ptime -m cargo xtask download softnpu # Build the test target export CARGO_INCREMENTAL=0 ptime -m cargo run --locked --release --bin omicron-package -- \ - -t test target create -i standard -m non-gimlet -s softnpu -r single-sled + -t test target create -p dev ptime -m cargo run --locked --release --bin omicron-package -- \ -t test package mapfile -t packages \ diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 28a0342b39..deec372d85 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -53,7 +53,7 @@ jobs: - name: Install Pre-Requisites run: ./tools/install_builder_prerequisites.sh -y - name: Set default target - run: cargo run --bin omicron-package -- -t default target create -r single-sled + run: cargo run --bin omicron-package -- -t default target create --preset dev - name: Check build of deployed Omicron packages run: cargo run --bin omicron-package -- -t default check diff --git a/Cargo.lock b/Cargo.lock index 5d2b7392af..b49c4bc63b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7172,6 +7172,7 @@ dependencies = [ "futures", "hex", "illumos-utils", + "indent_write", "indicatif", "omicron-workspace-hack", "omicron-zone-package", diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index b20b7729fb..2f0508c41a 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -454,8 +454,13 @@ async fn main() -> Result<()> { artifacts_path.as_str(), "target", "create", + "--preset", + target.as_str(), ]) - .args(target.target_args()) + // Note: Do not override the preset by adding arguments like + // `-m`/`--machine` here, or anywhere else in the releng + // tooling! All release targets must be configured entirely via + // the `target.preset` table in `package-manifest.toml`. 
.env_remove("CARGO_MANIFEST_DIR"), ) .after("omicron-package"); @@ -639,22 +644,6 @@ impl Target { } } - fn target_args(self) -> &'static [&'static str] { - match self { - Target::Host => &[ - "--image", - "standard", - "--machine", - "gimlet", - "--switch", - "asic", - "--rack-topology", - "multi-sled", - ], - Target::Recovery => &["--image", "trampoline"], - } - } - fn proto_packages( self, ) -> &'static [(&'static PackageName, InstallMethod)] { diff --git a/docs/how-to-run.adoc b/docs/how-to-run.adoc index 35efe8cafa..799a447496 100644 --- a/docs/how-to-run.adoc +++ b/docs/how-to-run.adoc @@ -332,57 +332,88 @@ In some configurations (not the one described here), it may be necessary to upda The `omicron-package` tool builds Omicron and bundles all required files into _packages_ that can be copied to another system (if necessary) and installed there. This tool acts on `package-manifest.toml`, which describes the contents of the packages. -Packages have a notion of "build targets", which are used to select between different variants of certain components. A build target is composed of an image type, a machine type, and a switch type: +Packages have a notion of "build targets", which are used to select between different variants of certain components. For example, the Sled Agent can be built for a real Oxide system, for a standalone Gimlet, or for a non-Gimlet system. This choice is represented by the `--machine` setting here: [source,console] ---- -$ cargo run --release --bin omicron-package -- target create -h - Finished release [optimized] target(s) in 0.70s - Running `target/release/omicron-package target create -h` +$ cargo run --release --bin omicron-package -- target create --help + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.55s + Running `target/release/omicron-package target create --help` Error: Creates a new build target, and sets it as "active" -Usage: omicron-package target create [OPTIONS] +Usage: omicron-package target create [OPTIONS] --preset Options: + -p, --preset + The preset to use as part of the build (use `dev` for development). + + Presets are defined in the `target.preset` section of the config. The other configurations are layered on top of + the preset. + -i, --image - [default: standard] + The image to use for the target. + + If specified, this configuration is layered on top of the preset. Possible values: - standard: A typical host OS image - trampoline: A recovery host OS image, intended to bootstrap a Standard image -m, --machine + The kind of machine to build for + Possible values: - gimlet: Use sled agent configuration for a Gimlet - gimlet-standalone: Use sled agent configuration for a Gimlet running in isolation - non-gimlet: Use sled agent configuration for a device emulating a Gimlet -s, --switch + The switch to use for the target + Possible values: - asic: Use the "real" Dendrite, that attempts to interact with the Tofino - stub: Use a "stub" Dendrite that does not require any real hardware - softnpu: Use a "softnpu" Dendrite that uses the SoftNPU asic emulator -r, --rack-topology + Specify whether nexus will run in a single-sled or multi-sled environment. + + Set single-sled for dev purposes when you're running a single sled-agent. Set multi-sled if you're running with + multiple sleds. Currently this only affects the crucible disk allocation strategy- VM disks will require 3 + distinct sleds with `multi-sled`, which will fail in a single-sled environment. `single-sled` relaxes this + requirement. 
+ Possible values: - multi-sled: Use configurations suitable for a multi-sled deployment, such as dogfood and production racks - single-sled: Use configurations suitable for a single-sled deployment, such as CI and dev machines + -c, --clickhouse-topology + Specify whether clickhouse will be deployed as a replicated cluster or single-node configuration. + + Replicated cluster configuration is an experimental feature to be used only for testing. + + Possible values: + - replicated-cluster: Use configurations suitable for a replicated ClickHouse cluster deployment + - single-node: Use configurations suitable for a single-node ClickHouse deployment + -h, --help Print help (see a summary with '-h') - ---- -To set up a build target for a non-Gimlet machine with simulated (but fully functional) external networking, you would run: +Setting up a target is typically done by selecting a **preset**. Presets are defined in `package-manifest.toml` under `[target.preset]`. + +For development purposes, the recommended preset is `dev`. This preset sets up a build target for a non-Gimlet machine with simulated (but fully functional) external networking: [source,console] ---- -$ cargo run --release --bin omicron-package -- -t default target create -i standard -m non-gimlet -s softnpu -r single-sled +$ cargo run --release --bin omicron-package -- -t default target create -p dev Finished release [optimized] target(s) in 0.66s - Running `target/release/omicron-package -t default target create -i standard -m non-gimlet -s softnpu -r single-sled` + Running `target/release/omicron-package -t default target create -p dev` Created new build target 'default' and set it as active ---- +To customize the target beyond the preset, use the other options (for example, `--image`). These options will override the settings in the preset. + NOTE: The `target create` command will set the new target as active and thus let you omit the `-t` flag in subsequent commands. 
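One way to read the preset-plus-overrides model: the preset fills in every target key, and any explicit flag replaces just that one key. For instance, to start from the `dev` preset but exercise the multi-sled disk allocation strategy, an invocation along these lines should work (flags as shown in the help text above; output omitted):

[source,console]
----
$ cargo run --release --bin omicron-package -- -t default target create -p dev -r multi-sled
----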
To kick off the build and package everything up, you can run: diff --git a/package-manifest.toml b/package-manifest.toml index 8140ce43af..5c1dd1879d 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -797,7 +797,7 @@ output.intermediate_only = true # To package and install the asic variant of the switch, do: # -# $ cargo run --release --bin omicron-package -- -t default target create -i standard -m gimlet -s asic +# $ cargo run --release --bin omicron-package -- -t default target create -p dev -m gimlet -s asic # $ cargo run --release --bin omicron-package -- package # $ pfexec ./target/release/omicron-package install [package.switch-asic] @@ -825,7 +825,7 @@ output.type = "zone" # To package and install the stub variant of the switch, do the following: # -# $ cargo run --release --bin omicron-package -- -t default target create -i standard -m -s stub +# $ cargo run --release --bin omicron-package -- -t default target create -p dev -m -s stub # $ cargo run --release --bin omicron-package -- package # $ pfexec ./target/release/omicron-package install [package.switch-stub] @@ -851,7 +851,7 @@ output.type = "zone" # To package and install the softnpu variant of the switch, do the following: # -# $ cargo run --release --bin omicron-package -- -t default target create -i standard -m -s softnpu +# $ cargo run --release --bin omicron-package -- -t default target create -p dev -m # $ cargo run --release --bin omicron-package -- package # $ pfexec ./target/release/omicron-package install [package.switch-softnpu] @@ -934,4 +934,38 @@ source.type = "local" source.rust.binary_names = ["clickana"] source.rust.release = true output.type = "zone" -output.intermediate_only = true \ No newline at end of file +output.intermediate_only = true + +# Target configuration +# -------------------- +# +# This section defines "targets" built by Omicron. A target is a map of keys and +# values that are used to filter out packages (via `only_for_targets`) and for +# other purposes. +# +# For what the individual keys mean, see the definition for `TargetCommand` in +# `package/src/lib.rs`. + +# A preset for the host image built during release. +[target.preset.host] +image = "standard" +machine = "gimlet" +switch = "asic" +rack-topology = "multi-sled" +clickhouse-topology = "single-node" + +# A preset for the recovery image built during release. +[target.preset.recovery] +image = "trampoline" +# The trampoline image doesn't execute sled-agent and doesn't contain the switch +# zone, so neither "machine" nor "switch" are defined. +rack-topology = "single-sled" +clickhouse-topology = "single-node" + +# A preset for development. 
+[target.preset.dev] +image = "standard" +machine = "non-gimlet" +switch = "softnpu" +rack-topology = "single-sled" +clickhouse-topology = "single-node" diff --git a/package/Cargo.toml b/package/Cargo.toml index ccc768bb8e..35cfaaaa9f 100644 --- a/package/Cargo.toml +++ b/package/Cargo.toml @@ -16,6 +16,7 @@ clap.workspace = true futures.workspace = true hex.workspace = true illumos-utils.workspace = true +indent_write.workspace = true indicatif.workspace = true omicron-workspace-hack.workspace = true omicron-zone-package.workspace = true diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index a22312c42a..2b2d415284 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -10,11 +10,14 @@ use clap::{Parser, Subcommand}; use futures::stream::{self, StreamExt, TryStreamExt}; use illumos_utils::{zfs, zone}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; -use omicron_package::cargo_plan::build_cargo_plan; -use omicron_package::config::{Config, ConfigArgs}; -use omicron_package::target::{target_command_help, KnownTarget}; -use omicron_package::{parse, BuildCommand, DeployCommand, TargetCommand}; -use omicron_zone_package::config::{Config as PackageConfig, PackageName}; +use omicron_package::cargo_plan::{ + build_cargo_plan, do_show_cargo_commands_for_config, + do_show_cargo_commands_for_presets, +}; +use omicron_package::config::{BaseConfig, Config, ConfigArgs}; +use omicron_package::target::target_command_help; +use omicron_package::{BuildCommand, DeployCommand, TargetCommand}; +use omicron_zone_package::config::PackageName; use omicron_zone_package::package::{Package, PackageOutput, PackageSource}; use omicron_zone_package::progress::Progress; use omicron_zone_package::target::TargetMap; @@ -30,7 +33,6 @@ use std::fs::create_dir_all; use std::sync::{Arc, OnceLock}; use std::time::Duration; use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader}; -use tokio::process::Command; const OMICRON_SLED_AGENT: PackageName = PackageName::new_const("omicron-sled-agent"); @@ -70,49 +72,6 @@ struct Args { subcommand: SubCommand, } -async fn do_show_cargo_commands(config: &Config) -> Result<()> { - let metadata = cargo_metadata::MetadataCommand::new().no_deps().exec()?; - let features = config.cargo_features(); - let cargo_plan = - build_cargo_plan(&metadata, config.packages_to_build(), &features)?; - - let release_command = cargo_plan.release.build_command("build"); - let debug_command = cargo_plan.debug.build_command("build"); - - print!("release command: "); - if let Some(command) = release_command { - println!("{}", command_to_string(&command)); - } else { - println!("(none)"); - } - - print!("debug command: "); - if let Some(command) = debug_command { - println!("{}", command_to_string(&command)); - } else { - println!("(none)"); - } - - Ok(()) -} - -fn command_to_string(command: &Command) -> String { - // Use shell-words to join the command and arguments into a single string. 
- let mut v = vec![command - .as_std() - .get_program() - .to_str() - .expect("program is valid UTF-8")]; - v.extend( - command - .as_std() - .get_args() - .map(|arg| arg.to_str().expect("argument is valid UTF-8")), - ); - - shell_words::join(&v) -} - async fn do_for_all_rust_packages( config: &Config, command: &str, @@ -159,6 +118,7 @@ async fn do_list_outputs( } async fn do_target( + base_config: &BaseConfig, artifact_dir: &Utf8Path, name: Option<&str>, subcommand: &TargetCommand, @@ -169,13 +129,15 @@ async fn do_target( })?; match subcommand { TargetCommand::Create { + preset, image, machine, switch, rack_topology, clickhouse_topology, } => { - let target = KnownTarget::new( + let preset_target = base_config.get_preset(preset)?; + let target = preset_target.with_overrides( image.clone(), machine.clone(), switch.clone(), @@ -269,7 +231,7 @@ async fn replace_active_link( let dst = target_dir.join(Config::ACTIVE); if !target_dir.join(src).exists() { - bail!("TargetMap file {} does not exist", src); + bail!("Target file {} does not exist", src); } let _ = tokio::fs::remove_file(&dst).await; tokio::fs::symlink(src, &dst).await.with_context(|| { @@ -869,7 +831,9 @@ impl Progress for PackageProgress { #[tokio::main] async fn main() -> Result<()> { let args = Args::try_parse()?; - let package_config = parse::<_, PackageConfig>(&args.manifest)?; + let base_config = BaseConfig::load(&args.manifest).with_context(|| { + format!("failed to load base config from {:?}", args.manifest) + })?; let mut open_options = std::fs::OpenOptions::new(); open_options.write(true).create(true).truncate(true); @@ -883,9 +847,9 @@ async fn main() -> Result<()> { let log = Logger::root(drain, o!()); let get_config = || -> Result { - Config::get_config( + Config::load( &log, - package_config, + base_config.package_config(), &args.config_args, &args.artifact_dir, ) @@ -903,6 +867,7 @@ async fn main() -> Result<()> { match args.subcommand { SubCommand::Build(BuildCommand::Target { subcommand }) => { do_target( + &base_config, &args.artifact_dir, args.config_args.target.as_deref(), &subcommand, @@ -930,8 +895,15 @@ async fn main() -> Result<()> { ) .await?; } - SubCommand::Build(BuildCommand::ShowCargoCommands) => { - do_show_cargo_commands(&get_config()?).await?; + SubCommand::Build(BuildCommand::ShowCargoCommands { presets }) => { + // If presets is empty, show the commands from the + // default configuration, otherwise show the commands + // for the specified presets. + if let Some(presets) = presets { + do_show_cargo_commands_for_presets(&base_config, &presets)?; + } else { + do_show_cargo_commands_for_config(&get_config()?)?; + } } SubCommand::Build(BuildCommand::Check) => { do_check(&get_config()?).await? 
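The `package/src/cargo_plan.rs` changes below print one cargo plan per preset, indenting each plan with the `indent_write` crate that this patch adds to `package/Cargo.toml`. A small sketch of that output pattern, assuming only the `IndentWriter::new(prefix, writer)` usage visible in the diff:

```
use std::io::Write;

use indent_write::io::IndentWriter;

fn main() -> std::io::Result<()> {
    let stdout = std::io::stdout();
    println!("for preset 'dev':");
    // Every line written through the wrapper is prefixed with "  * ", which
    // is how the per-preset release/debug commands are grouped in the output.
    let mut writer = IndentWriter::new("  * ", stdout.lock());
    writeln!(writer, "release command: cargo build --release ...")?;
    writeln!(writer, "debug command: (none)")?;
    writer.flush()
}
```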
diff --git a/package/src/cargo_plan.rs b/package/src/cargo_plan.rs index 88702452a2..1788d24ba1 100644 --- a/package/src/cargo_plan.rs +++ b/package/src/cargo_plan.rs @@ -4,12 +4,15 @@ use std::collections::BTreeMap; use std::collections::BTreeSet; +use std::fmt; +use std::io::Write; use anyhow::bail; use anyhow::ensure; use anyhow::Context; use anyhow::Result; use cargo_metadata::Metadata; +use indent_write::io::IndentWriter; use omicron_zone_package::config::PackageMap; use omicron_zone_package::config::PackageName; use omicron_zone_package::package::PackageSource; @@ -17,6 +20,11 @@ use slog::info; use slog::Logger; use tokio::process::Command; +use crate::config::cargo_features_for_target; +use crate::config::BaseConfig; +use crate::config::Config; +use crate::config::MultiPresetArg; + /// For a configuration, build a plan: the set of packages, binaries, and /// features to operate on in release and debug modes. pub fn build_cargo_plan<'a>( @@ -82,12 +90,18 @@ pub struct CargoPlan<'a> { pub debug: CargoTargets<'a>, } -impl CargoPlan<'_> { +impl<'a> CargoPlan<'a> { pub async fn run(&self, command: &str, log: &Logger) -> Result<()> { self.release.run(command, log).await?; self.debug.run(command, log).await?; Ok(()) } + + /// Displays a `CargoPlan` in a human-readable format with the provided + /// command name. + pub fn display_human(&'a self, command: &'a str) -> DisplayCargoPlan<'_> { + DisplayCargoPlan { plan: self, command } + } } /// A set of packages, binaries, and features to operate on. @@ -172,3 +186,92 @@ pub enum BuildKind { Release, Debug, } + +/// Show the Cargo commands that would be run for a configuration. +pub fn do_show_cargo_commands_for_config(config: &Config) -> Result<()> { + let metadata = cargo_metadata::MetadataCommand::new().no_deps().exec()?; + let features = config.cargo_features(); + let cargo_plan = + build_cargo_plan(&metadata, config.packages_to_build(), &features)?; + print!("{}", cargo_plan.display_human("build")); + + Ok(()) +} + +/// Show the Cargo commands that would be run for a set of presets. +pub fn do_show_cargo_commands_for_presets( + base_config: &BaseConfig, + presets: &MultiPresetArg, +) -> Result<()> { + let presets = base_config.get_presets(presets)?; + let metadata = cargo_metadata::MetadataCommand::new().no_deps().exec()?; + + for (preset, target) in presets { + let target_map = target.clone().into(); + let features = cargo_features_for_target(&target_map); + + // Build the cargo plan for this preset. + let cargo_plan = build_cargo_plan( + &metadata, + base_config.package_config().packages_to_build(&target_map), + &features, + ) + .with_context(|| { + format!("failed to build cargo plan for preset '{preset}'") + })?; + + // Print out the plan for this preset. + println!("for preset '{}':", preset); + let mut writer = IndentWriter::new(" * ", std::io::stdout().lock()); + writeln!(writer, "{}", cargo_plan.display_human("build"))?; + + writer.flush()?; + } + + Ok(()) +} + +/// A human-readable display of a `CargoPlan`. +/// +/// Created by calling [`CargoPlan::display_human`]. 
+pub struct DisplayCargoPlan<'a> { + plan: &'a CargoPlan<'a>, + command: &'a str, +} + +impl fmt::Display for DisplayCargoPlan<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "release command: ")?; + if let Some(command) = &self.plan.release.build_command(self.command) { + writeln!(f, "{}", command_to_string(&command))?; + } else { + writeln!(f, "(none)")?; + } + + write!(f, "debug command: ")?; + if let Some(command) = &self.plan.debug.build_command(self.command) { + writeln!(f, "{}", command_to_string(&command))?; + } else { + writeln!(f, "(none)")?; + } + + Ok(()) + } +} + +fn command_to_string(command: &Command) -> String { + // Use shell-words to join the command and arguments into a single string. + let mut v = vec![command + .as_std() + .get_program() + .to_str() + .expect("program is valid UTF-8")]; + v.extend( + command + .as_std() + .get_args() + .map(|arg| arg.to_str().expect("argument is valid UTF-8")), + ); + + shell_words::join(&v) +} diff --git a/package/src/config.rs b/package/src/config.rs index 1efd523496..9aad0bdef5 100644 --- a/package/src/config.rs +++ b/package/src/config.rs @@ -2,11 +2,14 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use anyhow::{bail, Result}; +use anyhow::{bail, Context, Result}; use camino::Utf8Path; use clap::Args; use omicron_zone_package::{ - config::{Config as PackageConfig, PackageMap, PackageName}, + config::{ + Config as PackageConfig, PackageMap, PackageName, PresetName, + TargetConfig, + }, package::PackageSource, target::TargetMap, }; @@ -44,6 +47,157 @@ fn parse_duration_ms(arg: &str) -> Result { Ok(Duration::from_millis(ms)) } +/// A specification for zero or more presets over the command line. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum MultiPresetArg { + /// A list of presets. + List(Vec), + /// All presets. + All, +} + +impl MultiPresetArg { + /// Returns true if there are no presets specified. + #[inline] + pub fn is_empty(&self) -> bool { + match self { + Self::List(list) => list.is_empty(), + Self::All => false, + } + } +} + +impl FromStr for MultiPresetArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + if s == "all" { + return Ok(Self::All); + } + + // TODO: it would be nice to collect all errors here. + let list = s + .split(',') + .map(|x| { + let p = x.parse::()?; + + // Ensure that p isn't "all". + if p.as_str() == "all" { + bail!("'all' must not be specified with other presets"); + } + + Ok(p) + }) + .collect::>>()?; + Ok(Self::List(list)) + } +} + +/// A base configuration. +/// +/// This is a thin wrapper around `PackageConfig` that has also validated that +/// the defined presets are valid. +#[derive(Debug)] +pub struct BaseConfig { + package_config: PackageConfig, + presets: BTreeMap, +} + +impl BaseConfig { + /// Loads the base config and ensures that all presets are valid. + pub fn load(manifest: &Utf8Path) -> Result { + let package_config = crate::parse::<_, PackageConfig>(manifest)?; + let presets = build_presets(&package_config.target)?; + + Ok(Self { package_config, presets }) + } + + /// Returns the package configuration. + #[inline] + pub fn package_config(&self) -> &PackageConfig { + &self.package_config + } + + /// Gets the map of all presets. + #[inline] + pub fn presets(&self) -> &BTreeMap { + &self.presets + } + + /// Gets a list of available presets as a string. 
+ pub fn available_presets_str(&self) -> String { + self.presets.keys().map(|x| x.as_str()).collect::>().join(", ") + } + + /// Gets the preset with the given name. + pub fn get_preset(&self, name: &PresetName) -> Result<&KnownTarget> { + self.presets.get(name).with_context(|| { + format!( + "preset '{name}' not found\n(available presets: {})", + self.available_presets_str(), + ) + }) + } + + /// Resolves the specified presets, returning an error if any of them are + /// invalid. + /// + /// Presets are returned in the order they were specified (keeping the order + /// the user specified on the command line). + pub fn get_presets( + &self, + presets: &MultiPresetArg, + ) -> Result> { + match presets { + MultiPresetArg::List(list) => { + let mut valid = Vec::new(); + // Ensure that all specified presets are found in the base config. + let mut missing = Vec::new(); + + for preset in list { + if let Some((preset, target)) = + self.presets.get_key_value(preset) + { + valid.push((preset, target)); + } else { + missing.push(preset); + } + } + + if !missing.is_empty() { + let names = + missing.iter().map(|x| x.as_str()).collect::>(); + bail!( + "presets not found in base config: {}\n(available presets: {})", + names.join(", "), + self.available_presets_str(), + ); + } + + Ok(valid) + } + MultiPresetArg::All => Ok(self.presets.iter().collect()), + } + } +} + +fn build_presets( + config: &TargetConfig, +) -> Result> { + let mut presets = BTreeMap::new(); + + for (name, value) in &config.presets { + // TODO: it would be nice to collect all errors here. + let target = + KnownTarget::from_target_map(value).with_context(|| { + format!("error parsing config for preset '{name}'") + })?; + presets.insert(name.clone(), target); + } + + Ok(presets) +} + #[derive(Debug)] pub struct Config { log: Logger, @@ -66,9 +220,9 @@ impl Config { pub const ACTIVE: &str = "active"; /// Builds a new configuration. - pub fn get_config( + pub fn load( log: &Logger, - package_config: PackageConfig, + package_config: &PackageConfig, args: &ConfigArgs, artifact_dir: &Utf8Path, ) -> Result { @@ -104,7 +258,7 @@ impl Config { Ok(Config { log: log.clone(), - package_config, + package_config: package_config.clone(), target, only: Vec::new(), force: args.force, @@ -234,10 +388,42 @@ impl Config { /// Out of these, the features that actually get requested are determined by /// which features are available for the list of packages being built. pub fn cargo_features(&self) -> Vec { - self.target - .0 - .iter() - .map(|(name, value)| format!("{name}-{value}")) - .collect::>() + cargo_features_for_target(&self.target) + } +} + +/// Return a list of all possible Cargo features that could be requested for the +/// given target. +/// +/// Out of these, the features that actually get requested are determined by +/// which features are available for the list of packages being built. 
+pub fn cargo_features_for_target(target: &TargetMap) -> Vec { + target.0.iter().map(|(name, value)| format!("{name}-{value}")).collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn multi_preset_arg() { + let all = MultiPresetArg::from_str("all").unwrap(); + assert_eq!(all, MultiPresetArg::All); + + let list = MultiPresetArg::from_str("a,b,c").unwrap(); + assert_eq!( + list, + MultiPresetArg::List(vec![ + "a".parse().unwrap(), + "b".parse().unwrap(), + "c".parse().unwrap(), + ]) + ); + + let error = MultiPresetArg::from_str("a,b,all").unwrap_err(); + assert_eq!( + error.to_string(), + "'all' must not be specified with other presets" + ); } } diff --git a/package/src/lib.rs b/package/src/lib.rs index 969cf6db21..2f1d64372e 100644 --- a/package/src/lib.rs +++ b/package/src/lib.rs @@ -2,7 +2,8 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; -use omicron_zone_package::config::PackageName; +use config::MultiPresetArg; +use omicron_zone_package::config::{PackageName, PresetName}; use serde::de::DeserializeOwned; use thiserror::Error; @@ -36,48 +37,40 @@ pub fn parse, C: DeserializeOwned>( pub enum TargetCommand { /// Creates a new build target, and sets it as "active". Create { - #[clap(short, long, default_value = "standard")] - image: crate::target::Image, + /// The preset to use as part of the build (use `dev` for development). + /// + /// Presets are defined in the `target.preset` section of the config. + /// The other configurations are layered on top of the preset. + #[clap(short, long)] + preset: PresetName, + + /// The image to use for the target. + /// + /// If specified, this configuration is layered on top of the preset. + #[clap(short, long)] + image: Option, - #[clap( - short, - long, - default_value_if("image", "standard", "non-gimlet") - )] + /// The kind of machine to build for. + #[clap(short, long, help_heading = "Preset overrides")] machine: Option, - #[clap(short, long, default_value_if("image", "standard", "stub"))] + /// The switch to use for the target. + #[clap(short, long, help_heading = "Preset overrides")] switch: Option, - #[clap( - short, - long, - default_value_if("image", "trampoline", Some("single-sled")), - - // This opt is required, and clap will enforce that even with - // `required = false`, since it's not an Option. But the - // default_value_if only works if we set `required` to false. It's - // jank, but it is what it is. - // https://github.com/clap-rs/clap/issues/4086 - required = false - )] + #[clap(short, long, help_heading = "Preset overrides")] /// Specify whether nexus will run in a single-sled or multi-sled /// environment. /// /// Set single-sled for dev purposes when you're running a single - /// sled-agent. Set multi-sled if you're running with mulitple sleds. + /// sled-agent. Set multi-sled if you're running with multiple sleds. /// Currently this only affects the crucible disk allocation strategy- /// VM disks will require 3 distinct sleds with `multi-sled`, which will /// fail in a single-sled environment. `single-sled` relaxes this /// requirement. - rack_topology: crate::target::RackTopology, + rack_topology: Option, - #[clap( - short, - long, - default_value = Some("single-node"), - required = false - )] + #[clap(short, long, help_heading = "Preset overrides")] // TODO (https://github.com/oxidecomputer/omicron/issues/4148): Remove // once single-node functionality is removed. 
/// Specify whether clickhouse will be deployed as a replicated cluster @@ -85,7 +78,7 @@ pub enum TargetCommand { /// /// Replicated cluster configuration is an experimental feature to be /// used only for testing. - clickhouse_topology: crate::target::ClickhouseTopology, + clickhouse_topology: Option, }, /// List all existing targets List, @@ -134,7 +127,14 @@ pub enum BuildCommand { version: semver::Version, }, /// Show the Cargo commands that would be run to build the packages. - ShowCargoCommands, + ShowCargoCommands { + /// Show cargo commands for the specified presets, or `all` for all + /// presets. + /// + /// If not specified, the active target's preset is used. + #[clap(short, long = "preset")] + presets: Option, + }, /// Checks the packages specified in a manifest, without building them. Check, } diff --git a/package/src/target.rs b/package/src/target.rs index bf2c151443..19a5a832a1 100644 --- a/package/src/target.rs +++ b/package/src/target.rs @@ -85,7 +85,88 @@ pub struct KnownTarget { } impl KnownTarget { - pub fn new( + /// Creates a new `KnownTarget` from a [`TargetMap`] defined in configuration. + /// + /// Real `KnownTarget` instances might have overrides applied to them via + /// the command line. + pub fn from_target_map(target: &TargetMap) -> Result { + let mut image = Self::default().image; + let mut machine = None; + let mut switch = None; + let mut rack_topology = None; + let mut clickhouse_topology = None; + + for (k, v) in &target.0 { + match k.as_str() { + "image" => { + image = v.parse()?; + } + "machine" => { + machine = Some(v.parse()?); + } + "switch" => { + switch = Some(v.parse()?); + } + "rack-topology" => { + rack_topology = Some(v.parse()?); + } + "clickhouse-topology" => { + clickhouse_topology = Some(v.parse()?); + } + _ => { + bail!( + "Unknown target key {k}\nValid keys include: [{}]", + TargetMap::from(Self::default()) + .0 + .keys() + .cloned() + .collect::>() + .join(", "), + ) + } + } + } + + Self::validate_and_create( + image, + machine, + switch, + rack_topology.unwrap_or(RackTopology::MultiSled), + clickhouse_topology.unwrap_or(ClickhouseTopology::SingleNode), + ) + } + + /// Applies overrides to the target, returning a new `KnownTarget` with the + /// parameters applied. + /// + /// Errors if the new target does not satisfy target constraints. + pub fn with_overrides( + &self, + image: Option, + machine: Option, + switch: Option, + rack_topology: Option, + clickhouse_topology: Option, + ) -> Result { + let image = image.unwrap_or(self.image.clone()); + let machine = machine.or(self.machine.clone()); + let switch = switch.or(self.switch.clone()); + let rack_topology = rack_topology.unwrap_or(self.rack_topology.clone()); + let clickhouse_topology = + clickhouse_topology.unwrap_or(self.clickhouse_topology.clone()); + + Self::validate_and_create( + image, + machine, + switch, + rack_topology, + clickhouse_topology, + ) + } + + /// Creates a new `KnownTarget` from the given parameters, validating + /// constraints. 
+ fn validate_and_create( image: Image, machine: Option, switch: Option, @@ -154,50 +235,7 @@ impl std::str::FromStr for KnownTarget { fn from_str(s: &str) -> Result { let target = TargetMap::from_str(s)?; - - let mut image = Self::default().image; - let mut machine = None; - let mut switch = None; - let mut rack_topology = None; - let mut clickhouse_topology = None; - - for (k, v) in target.0.into_iter() { - match k.as_str() { - "image" => { - image = v.parse()?; - } - "machine" => { - machine = Some(v.parse()?); - } - "switch" => { - switch = Some(v.parse()?); - } - "rack-topology" => { - rack_topology = Some(v.parse()?); - } - "clickhouse-topology" => { - clickhouse_topology = Some(v.parse()?); - } - _ => { - bail!( - "Unknown target key {k}\nValid keys include: [{}]", - TargetMap::from(Self::default()) - .0 - .keys() - .cloned() - .collect::>() - .join(", "), - ) - } - } - } - KnownTarget::new( - image, - machine, - switch, - rack_topology.unwrap_or(RackTopology::MultiSled), - clickhouse_topology.unwrap_or(ClickhouseTopology::SingleNode), - ) + Self::from_target_map(&target) } } From 0603f0b2d5abf673c2adc381c8c733a3ac734b4e Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 7 Jan 2025 15:48:35 -0500 Subject: [PATCH 5/8] [RSS] Initial blueprint: only set `address` for Crucible datasets (#7310) This should prevent #7299 from occurring on new invocations of RSS; I'll add a followup PR (with more testing) that adds the `CHECK` constraint mentioned in that issue and a migration that does cleanup of any bad dataset address rows. While I was in here, fixed #7115. It was smaller than I thought; should've done that a while ago. --- Cargo.lock | 1 + common/src/disk.rs | 4 +- nexus/reconfigurator/blippy/src/blippy.rs | 30 +++ nexus/reconfigurator/blippy/src/checks.rs | 175 +++++++++++++++++- .../planning/src/blueprint_builder/builder.rs | 2 +- .../src/blueprint_editor/sled_editor.rs | 2 +- .../blueprint_editor/sled_editor/datasets.rs | 4 +- sled-agent/Cargo.toml | 1 + sled-agent/src/artifact_store.rs | 2 +- sled-agent/src/rack_setup/plan/sled.rs | 2 +- sled-agent/src/rack_setup/service.rs | 117 +++++++++--- sled-agent/src/services.rs | 2 +- sled-agent/src/sim/server.rs | 12 +- sled-storage/src/manager.rs | 2 +- 14 files changed, 312 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b49c4bc63b..86d7de4669 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7303,6 +7303,7 @@ dependencies = [ "mg-admin-client", "nexus-client", "nexus-config", + "nexus-reconfigurator-blippy", "nexus-sled-agent-shared", "nexus-types", "omicron-common", diff --git a/common/src/disk.rs b/common/src/disk.rs index df6efe3196..c2e341f4d6 100644 --- a/common/src/disk.rs +++ b/common/src/disk.rs @@ -112,9 +112,7 @@ impl DatasetName { &self.pool_name } - // TODO(https://github.com/oxidecomputer/omicron/issues/7115): Rename - // this to "kind? 
- pub fn dataset(&self) -> &DatasetKind { + pub fn kind(&self) -> &DatasetKind { &self.kind } diff --git a/nexus/reconfigurator/blippy/src/blippy.rs b/nexus/reconfigurator/blippy/src/blippy.rs index 9e9cc84b32..4e105fa9ca 100644 --- a/nexus/reconfigurator/blippy/src/blippy.rs +++ b/nexus/reconfigurator/blippy/src/blippy.rs @@ -19,6 +19,7 @@ use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; use std::collections::BTreeSet; use std::net::IpAddr; +use std::net::SocketAddrV6; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Note { @@ -169,6 +170,17 @@ pub enum SledKind { OrphanedDataset { dataset: BlueprintDatasetConfig }, /// A dataset claims to be on a zpool that does not exist. DatasetOnNonexistentZpool { dataset: BlueprintDatasetConfig }, + /// A Crucible dataset does not have its `address` set to its corresponding + /// Crucible zone. + CrucibleDatasetWithIncorrectAddress { + dataset: BlueprintDatasetConfig, + expected_address: SocketAddrV6, + }, + /// A non-Crucible dataset has an address. + NonCrucibleDatasetWithAddress { + dataset: BlueprintDatasetConfig, + address: SocketAddrV6, + }, } impl fmt::Display for SledKind { @@ -352,6 +364,24 @@ impl fmt::Display for SledKind { dataset.kind, dataset.id, dataset.pool ) } + SledKind::CrucibleDatasetWithIncorrectAddress { + dataset, + expected_address, + } => { + write!( + f, + "Crucible dataset {} has bad address {:?} (expected {})", + dataset.id, dataset.address, expected_address, + ) + } + SledKind::NonCrucibleDatasetWithAddress { dataset, address } => { + write!( + f, + "non-Crucible dataset ({:?} {}) has an address: {} \ + (only Crucible datasets should have addresses)", + dataset.kind, dataset.id, address, + ) + } } } } diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index f5673cb77c..faf456c73a 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -6,10 +6,12 @@ use crate::blippy::Blippy; use crate::blippy::Severity; use crate::blippy::SledKind; use nexus_sled_agent_shared::inventory::ZoneKind; +use nexus_types::deployment::blueprint_zone_type; use nexus_types::deployment::BlueprintDatasetConfig; use nexus_types::deployment::BlueprintDatasetFilter; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneFilter; +use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::OmicronZoneExternalIp; use omicron_common::address::DnsSubnet; use omicron_common::address::Ipv6Subnet; @@ -413,6 +415,10 @@ fn check_datasets(blippy: &mut Blippy<'_>) { } } + // In a check below, we want to look up Crucible zones by zpool; build that + // map as we perform the next set of checks. + let mut crucible_zone_by_zpool = BTreeMap::new(); + // There should be a dataset for every dataset referenced by a running zone // (filesystem or durable). 
for (sled_id, zone_config) in blippy @@ -431,7 +437,7 @@ fn check_datasets(blippy: &mut Blippy<'_>) { Some(dataset) => { match sled_datasets .get(&dataset.pool().id()) - .and_then(|by_zpool| by_zpool.get(dataset.dataset())) + .and_then(|by_zpool| by_zpool.get(dataset.kind())) { Some(dataset) => { expected_datasets.insert(dataset.id); @@ -471,6 +477,21 @@ fn check_datasets(blippy: &mut Blippy<'_>) { ); } } + + if dataset.kind == DatasetKind::Crucible { + match &zone_config.zone_type { + BlueprintZoneType::Crucible(crucible_zone_config) => { + crucible_zone_by_zpool.insert( + dataset.dataset.pool_name.id(), + crucible_zone_config, + ); + } + _ => unreachable!( + "zone_type.durable_dataset() returned Crucible for \ + non-Crucible zone type" + ), + } + } } } @@ -533,6 +554,50 @@ fn check_datasets(blippy: &mut Blippy<'_>) { continue; } } + + // All Crucible datasets should have their address set to the address of the + // Crucible zone on their same zpool, and all non-Crucible datasets should + // not have addresses. + for (sled_id, dataset) in blippy + .blueprint() + .all_omicron_datasets(BlueprintDatasetFilter::InService) + { + match dataset.kind { + DatasetKind::Crucible => { + let Some(blueprint_zone_type::Crucible { address, .. }) = + crucible_zone_by_zpool.get(&dataset.pool.id()) + else { + // We already checked above that all datasets have + // corresponding zones, so a failure to find the zone for + // this dataset would have already produced a note; just + // skip it. + continue; + }; + if dataset.address != Some(*address) { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::CrucibleDatasetWithIncorrectAddress { + dataset: dataset.clone(), + expected_address: *address, + }, + ); + } + } + _ => { + if let Some(address) = dataset.address { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::NonCrucibleDatasetWithAddress { + dataset: dataset.clone(), + address, + }, + ); + } + } + } + } } #[cfg(test)] @@ -1294,7 +1359,7 @@ mod tests { == durable_zone.zone_type.durable_dataset().unwrap().kind); let root_dataset = root_zone.filesystem_dataset().unwrap(); let matches_root = (dataset.pool == *root_dataset.pool()) - && (dataset.kind == *root_dataset.dataset()); + && (dataset.kind == *root_dataset.kind()); !matches_durable && !matches_root }); @@ -1465,7 +1530,7 @@ mod tests { if (dataset.pool == durable_dataset.dataset.pool_name && dataset.kind == durable_dataset.kind) || (dataset.pool == *root_dataset.pool() - && dataset.kind == *root_dataset.dataset()) + && dataset.kind == *root_dataset.kind()) { Some(Note { severity: Severity::Fatal, @@ -1561,4 +1626,108 @@ mod tests { logctx.cleanup_successful(); } + + #[test] + fn test_dataset_with_bad_address() { + static TEST_NAME: &str = "test_dataset_with_bad_address"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + let crucible_addr_by_zpool = blueprint + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + .filter_map(|(_, z)| match z.zone_type { + BlueprintZoneType::Crucible( + blueprint_zone_type::Crucible { address, .. 
}, + ) => { + let zpool_id = z + .zone_type + .durable_zpool() + .expect("crucible zone has durable zpool") + .id(); + Some((zpool_id, address)) + } + _ => None, + }) + .collect::>(); + + // We have three ways a dataset address can be wrong: + // + // * A Crucible dataset has no address + // * A Crucible dataset has an address but it doesn't match its zone + // * A non-Crucible dataset has an address + // + // Make all three kinds of modifications to three different datasets and + // ensure we see all three noted. + let mut cleared_crucible_addr = false; + let mut changed_crucible_addr = false; + let mut set_non_crucible_addr = false; + let mut expected_notes = Vec::new(); + + for (sled_id, datasets_config) in + blueprint.blueprint_datasets.iter_mut() + { + for dataset in datasets_config.datasets.values_mut() { + match dataset.kind { + DatasetKind::Crucible => { + let bad_address = if !cleared_crucible_addr { + cleared_crucible_addr = true; + None + } else if !changed_crucible_addr { + changed_crucible_addr = true; + Some("[1234:5678:9abc::]:0".parse().unwrap()) + } else { + continue; + }; + + dataset.address = bad_address; + let expected_address = *crucible_addr_by_zpool + .get(&dataset.pool.id()) + .expect("found crucible zone for zpool"); + expected_notes.push(Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: *sled_id, + kind: SledKind::CrucibleDatasetWithIncorrectAddress { + dataset: dataset.clone(), + expected_address, + }, + }, + }); + } + _ => { + if !set_non_crucible_addr { + set_non_crucible_addr = true; + let address = "[::1]:0".parse().unwrap(); + dataset.address = Some(address); + expected_notes.push(Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: *sled_id, + kind: SledKind::NonCrucibleDatasetWithAddress { + dataset: dataset.clone(), + address, + }, + }, + }); + } + } + } + } + } + + // We should have modified 3 datasets. 
+ assert_eq!(expected_notes.len(), 3); + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } } diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 0aaadb624d..09a6de84ef 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -2790,7 +2790,7 @@ pub mod test { { for dataset in dataset_configs { let id = dataset.id; - let kind = dataset.name.dataset(); + let kind = dataset.name.kind(); let by_kind: &mut BTreeMap<_, _> = input_dataset_ids.entry(*zpool_id).or_default(); let prev = by_kind.insert(kind.clone(), id); diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs index 995cc306a5..3355390dc9 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs @@ -460,7 +460,7 @@ impl ActiveSledEditor { } if let Some(dataset) = config.filesystem_dataset() { - self.datasets.expunge(&dataset.pool().id(), dataset.dataset())?; + self.datasets.expunge(&dataset.pool().id(), dataset.kind())?; } if let Some(dataset) = config.zone_type.durable_dataset() { self.datasets diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs index 211af59aab..d164a26a84 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs @@ -70,7 +70,7 @@ impl DatasetIdsBackfillFromDb { let iter = resources.all_datasets(ZpoolFilter::InService).flat_map( |(&zpool_id, configs)| { configs.iter().map(move |config| { - (zpool_id, config.name.dataset().clone(), config.id) + (zpool_id, config.name.kind().clone(), config.id) }) }, ); @@ -162,7 +162,7 @@ impl PartialDatasetConfig { pub fn for_transient_zone(name: DatasetName) -> Self { assert!( - matches!(name.dataset(), DatasetKind::TransientZone { .. }), + matches!(name.kind(), DatasetKind::TransientZone { .. 
}), "for_transient_zone called with incorrect dataset kind: {name:?}" ); Self { diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 88c758dc31..12efddda07 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -119,6 +119,7 @@ guppy.workspace = true hex-literal.workspace = true http.workspace = true hyper.workspace = true +nexus-reconfigurator-blippy.workspace = true omicron-test-utils.workspace = true pretty_assertions.workspace = true rcgen.workspace = true diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index 5b78c0099d..dc0331457d 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -453,7 +453,7 @@ pub(crate) fn filter_dataset_mountpoints( config .datasets .into_values() - .filter(|dataset| *dataset.name.dataset() == DatasetKind::Update) + .filter(|dataset| *dataset.name.kind() == DatasetKind::Update) .map(|dataset| dataset.name.mountpoint(root)) } diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index 3d06a91e27..eb447e605c 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -26,7 +26,7 @@ pub struct Plan { } impl Plan { - pub async fn create( + pub fn create( log: &Logger, config: &Config, bootstrap_addrs: BTreeSet, diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 7bb6bffe02..37a49b930a 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -797,7 +797,8 @@ impl ServiceInner { let blueprint = build_initial_blueprint_from_plan( &sled_configs_by_id, service_plan, - ); + ) + .map_err(SetupServiceError::ConvertPlanToBlueprint)?; info!(self.log, "Nexus address: {}", nexus_address.to_string()); @@ -834,13 +835,13 @@ impl ServiceInner { // to usage by specific zones. 
for dataset in sled_config.datasets.datasets.values() { let duplicate = datasets.insert( - (dataset.name.pool().id(), dataset.name.dataset().clone()), + (dataset.name.pool().id(), dataset.name.kind().clone()), NexusTypes::DatasetCreateRequest { zpool_id: dataset.name.pool().id().into_untyped_uuid(), dataset_id: dataset.id, request: NexusTypes::DatasetPutRequest { address: None, - kind: dataset.name.dataset().clone(), + kind: dataset.name.kind().clone(), }, }, ); @@ -1287,8 +1288,7 @@ impl ServiceInner { config, bootstrap_addrs, config.trust_quorum_peers.is_some(), - ) - .await; + ); let config = &sled_plan.config; rss_step.update(RssStep::InitTrustQuorum); @@ -1520,7 +1520,7 @@ fn build_sled_configs_by_id( fn build_initial_blueprint_from_plan( sled_configs_by_id: &BTreeMap, service_plan: &ServicePlan, -) -> Blueprint { +) -> anyhow::Result { build_initial_blueprint_from_sled_configs( sled_configs_by_id, service_plan.dns_config.generation, @@ -1530,7 +1530,7 @@ fn build_initial_blueprint_from_plan( pub(crate) fn build_initial_blueprint_from_sled_configs( sled_configs_by_id: &BTreeMap, internal_dns_version: Generation, -) -> Blueprint { +) -> anyhow::Result { let blueprint_disks: BTreeMap<_, _> = sled_configs_by_id .iter() .map(|(sled_id, sled_config)| (*sled_id, sled_config.disks.clone())) @@ -1541,17 +1541,29 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( let mut datasets = BTreeMap::new(); for d in sled_config.datasets.datasets.values() { // Only the "Crucible" dataset needs to know the address - let address = sled_config.zones.iter().find_map(|z| { - if let BlueprintZoneType::Crucible( - blueprint_zone_type::Crucible { address, dataset }, - ) = &z.zone_type - { - if &dataset.pool_name == d.name.pool() { - return Some(*address); - } - }; + let address = if *d.name.kind() == DatasetKind::Crucible { + let address = sled_config.zones.iter().find_map(|z| { + if let BlueprintZoneType::Crucible( + blueprint_zone_type::Crucible { address, dataset }, + ) = &z.zone_type + { + if &dataset.pool_name == d.name.pool() { + return Some(*address); + } + }; + None + }); + if address.is_some() { + address + } else { + bail!( + "could not find Crucible zone for zpool {}", + d.name.pool() + ) + } + } else { None - }); + }; datasets.insert( d.id, @@ -1559,7 +1571,7 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( disposition: BlueprintDatasetDisposition::InService, id: d.id, pool: d.name.pool().clone(), - kind: d.name.dataset().clone(), + kind: d.name.kind().clone(), address, compression: d.inner.compression, quota: d.inner.quota, @@ -1599,7 +1611,7 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( sled_state.insert(*sled_id, SledState::Active); } - Blueprint { + Ok(Blueprint { id: Uuid::new_v4(), blueprint_zones, blueprint_disks, @@ -1621,7 +1633,7 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( time_created: Utc::now(), creator: "RSS".to_string(), comment: "initial blueprint from rack setup".to_string(), - } + }) } /// Facilitates creating a sequence of OmicronZonesConfig objects for each sled @@ -1719,8 +1731,9 @@ impl<'a> OmicronZonesConfigGenerator<'a> { #[cfg(test)] mod test { - use super::{Config, OmicronZonesConfigGenerator}; + use super::*; use crate::rack_setup::plan::service::{Plan as ServicePlan, SledInfo}; + use nexus_reconfigurator_blippy::{Blippy, BlippyReportSortKey}; use nexus_sled_agent_shared::inventory::{ Baseboard, Inventory, InventoryDisk, OmicronZoneType, OmicronZonesConfig, SledRole, @@ -1778,9 +1791,8 @@ mod test { ) } - fn 
make_test_service_plan() -> ServicePlan { - let rss_config = Config::test_config(); - let fake_sleds = vec![ + fn make_fake_sleds() -> Vec { + vec![ make_sled_info( SledUuid::new_v4(), Ipv6Subnet::::new( @@ -1795,7 +1807,19 @@ mod test { ), 5, ), - ]; + make_sled_info( + SledUuid::new_v4(), + Ipv6Subnet::::new( + "fd00:1122:3344:103::1".parse().unwrap(), + ), + 5, + ), + ] + } + + fn make_test_service_plan() -> ServicePlan { + let rss_config = Config::test_config(); + let fake_sleds = make_fake_sleds(); let service_plan = ServicePlan::create_transient(&rss_config, fake_sleds) .expect("failed to create service plan"); @@ -1913,4 +1937,47 @@ mod test { } assert!(v6_nfound > v5_nfound); } + + #[test] + fn rss_blueprint_is_blippy_clean() { + let logctx = omicron_test_utils::dev::test_setup_log( + "rss_blueprint_is_blippy_clean", + ); + + let fake_sleds = make_fake_sleds(); + + let rss_config = Config::test_config(); + let use_trust_quorum = false; + let sled_plan = SledPlan::create( + &logctx.log, + &rss_config, + fake_sleds + .iter() + .map(|sled_info| *sled_info.sled_address.ip()) + .collect(), + use_trust_quorum, + ); + let service_plan = + ServicePlan::create_transient(&rss_config, fake_sleds) + .expect("created service plan"); + + let sled_configs_by_id = + build_sled_configs_by_id(&sled_plan, &service_plan) + .expect("built sled configs"); + let blueprint = build_initial_blueprint_from_plan( + &sled_configs_by_id, + &service_plan, + ) + .expect("built blueprint"); + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + + if !report.notes().is_empty() { + eprintln!("{}", report.display()); + panic!("RSS blueprint should have no blippy notes"); + } + + logctx.cleanup_successful(); + } } diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 7536824da3..54b0a559cd 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -3703,7 +3703,7 @@ impl ServiceManager { }; check_property("zoned", zoned, "on")?; check_property("canmount", canmount, "on")?; - if dataset.dataset().dataset_should_be_encrypted() { + if dataset.kind().dataset_should_be_encrypted() { check_property("encryption", encryption, "aes-256-gcm")?; } diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index 3833d5ca7c..0d3a682b57 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -15,7 +15,7 @@ use crate::rack_setup::{ from_ipaddr_to_external_floating_ip, from_sockaddr_to_external_floating_addr, }; -use anyhow::anyhow; +use anyhow::{anyhow, Context as _}; use crucible_agent_client::types::State as RegionState; use illumos_utils::zpool::ZpoolName; use internal_dns_types::config::DnsConfigBuilder; @@ -564,11 +564,13 @@ pub async fn run_standalone_server( }, ); + let blueprint = build_initial_blueprint_from_sled_configs( + &sled_configs, + internal_dns_version, + ) + .context("could not construct initial blueprint")?; let rack_init_request = NexusTypes::RackInitializationRequest { - blueprint: build_initial_blueprint_from_sled_configs( - &sled_configs, - internal_dns_version, - ), + blueprint, physical_disks, zpools, datasets, diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index d6ffd42d0a..f1739fe5f4 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -1054,7 +1054,7 @@ impl StorageManager { let mountpoint_root = &self.resources.disks().mount_config().root; let mountpoint_path = config.name.mountpoint(mountpoint_root); let details = DatasetCreationDetails { - zoned: 
config.name.dataset().zoned(), + zoned: config.name.kind().zoned(), mountpoint: Mountpoint::Path(mountpoint_path), full_name: config.name.full_name(), }; From d77cebea72efc5a98b53b6e192de1a112a58e5d0 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Tue, 7 Jan 2025 16:51:07 -0700 Subject: [PATCH 6/8] Add a4x2 package preset (#7317) Fixes #7316. --- package-manifest.toml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/package-manifest.toml b/package-manifest.toml index 5c1dd1879d..277052ef3d 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -969,3 +969,11 @@ machine = "non-gimlet" switch = "softnpu" rack-topology = "single-sled" clickhouse-topology = "single-node" + +# A preset for the `a4x2` testbed environment. +[target.preset.a4x2] +image = "standard" +machine = "non-gimlet" +switch = "softnpu" +rack-topology = "multi-sled" +clickhouse-topology = "single-node" From a77566a145ebb6ce5f51c270f347a3df89166287 Mon Sep 17 00:00:00 2001 From: Michael Zeller Date: Tue, 7 Jan 2025 19:01:56 -0500 Subject: [PATCH 7/8] Add support for finding oxide processes on a sled (#7320) This PR adds support for finding all oxide processes via libcontract and relying on the fact that we deploy oxide services with the fmri prefix svc:/oxide/ or svc:/system/illumos. This allows us to find every pid started via smf even if the smf script uses ctrun. You can see this for yourself by running pfexec ctstat -av on a helios/illumos based system: ``` 205 1 process owned 2492 0 - - cookie: 0 informative event set: core critical event set: hwerr empty fatal event set: hwerr parameter set: noorphan regent member processes: 2497 inherited contracts: none service fmri: svc:/oxide/wicketd:default service fmri ctid: 204 creator: ctrun aux: 249 1 process owned 1334 0 - - cookie: 0x20 informative event set: none critical event set: hwerr empty fatal event set: none parameter set: inherit regent member processes: 2794 inherited contracts: none service fmri: svc:/oxide/dendrite:default service fmri ctid: 249 creator: svc.startd aux: start 254 1 process owned 1334 0 - - cookie: 0x20 informative event set: none critical event set: hwerr empty fatal event set: none parameter set: inherit regent member processes: 2802 inherited contracts: none service fmri: svc:/oxide/lldpd:default service fmri ctid: 254 creator: svc.startd aux: start ``` This PR is on top of: - #7193 --- Cargo.lock | 4 + openapi/sled-agent.json | 42 ++++++ sled-agent/api/src/lib.rs | 16 +++ sled-agent/src/http_entrypoints.rs | 32 +++++ sled-agent/src/sim/http_entrypoints.rs | 12 ++ sled-agent/src/sled_agent.rs | 12 ++ sled-diagnostics/Cargo.toml | 4 + sled-diagnostics/src/contract.rs | 182 +++++++++++++++++++++++++ sled-diagnostics/src/contract_stub.rs | 18 +++ sled-diagnostics/src/lib.rs | 56 +++++++- sled-diagnostics/src/queries.rs | 24 +++- 11 files changed, 397 insertions(+), 5 deletions(-) create mode 100644 sled-diagnostics/src/contract.rs create mode 100644 sled-diagnostics/src/contract_stub.rs diff --git a/Cargo.lock b/Cargo.lock index 86d7de4669..459467e96d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10803,8 +10803,12 @@ dependencies = [ name = "sled-diagnostics" version = "0.1.0" dependencies = [ + "cfg-if", + "fs-err", "futures", + "libc", "omicron-workspace-hack", + "slog", "thiserror 1.0.69", "tokio", ] diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 81d9211104..9b750b0cf7 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -715,6 +715,48 @@ } } }, + "/support/pargs-info": { + "get": { + 
"operationId": "support_pargs_info", + "responses": { + "200": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/support/pstack-info": { + "get": { + "operationId": "support_pstack_info", + "responses": { + "200": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/support/zoneadm-info": { "get": { "operationId": "support_zoneadm_info", diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 56c6760be7..25b943064b 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -647,6 +647,22 @@ pub trait SledAgentApi { async fn support_dladm_info( request_context: RequestContext, ) -> Result, HttpError>; + + #[endpoint { + method = GET, + path = "/support/pargs-info", + }] + async fn support_pargs_info( + request_context: RequestContext, + ) -> Result, HttpError>; + + #[endpoint { + method = GET, + path = "/support/pstack-info", + }] + async fn support_pstack_info( + request_context: RequestContext, + ) -> Result, HttpError>; } #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index edba1c47b1..54e93aa792 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -1001,4 +1001,36 @@ impl SledAgentApi for SledAgentImpl { Ok(HttpResponseOk(FreeformBody(output.into()))) } + + async fn support_pargs_info( + request_context: RequestContext, + ) -> Result, HttpError> { + let sa = request_context.context(); + let output = sa + .support_pargs_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>() + .as_slice() + .join("\n\n"); + + Ok(HttpResponseOk(FreeformBody(output.into()))) + } + + async fn support_pstack_info( + request_context: RequestContext, + ) -> Result, HttpError> { + let sa = request_context.context(); + let output = sa + .support_pstack_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>() + .as_slice() + .join("\n\n"); + + Ok(HttpResponseOk(FreeformBody(output.into()))) + } } diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 60dcb1be31..66414fc74e 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -737,6 +737,18 @@ impl SledAgentApi for SledAgentSimImpl { ) -> Result, HttpError> { method_unimplemented() } + + async fn support_pargs_info( + _request_context: RequestContext, + ) -> Result, HttpError> { + method_unimplemented() + } + + async fn support_pstack_info( + _request_context: RequestContext, + ) -> Result, HttpError> { + method_unimplemented() + } } fn method_unimplemented() -> Result { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 37526690cb..c626957097 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -1388,6 +1388,18 @@ impl SledAgent { ) -> Vec> { sled_diagnostics::dladm_info().await } + + pub(crate) async fn support_pargs_info( + &self, + ) -> Vec> { + sled_diagnostics::pargs_oxide_processes(&self.log).await + } + + pub(crate) async fn support_pstack_info( + &self, + ) -> Vec> { + sled_diagnostics::pstack_oxide_processes(&self.log).await + } } #[derive(From, thiserror::Error, Debug)] diff --git 
a/sled-diagnostics/Cargo.toml b/sled-diagnostics/Cargo.toml index 98ac59b674..9e10dfe1c7 100644 --- a/sled-diagnostics/Cargo.toml +++ b/sled-diagnostics/Cargo.toml @@ -7,7 +7,11 @@ edition = "2021" workspace = true [dependencies] +cfg-if.workspace = true +fs-err.workspace = true futures.workspace = true +libc.workspace = true omicron-workspace-hack.workspace = true +slog.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["full"] } diff --git a/sled-diagnostics/src/contract.rs b/sled-diagnostics/src/contract.rs new file mode 100644 index 0000000000..f6bca904f9 --- /dev/null +++ b/sled-diagnostics/src/contract.rs @@ -0,0 +1,182 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// ! Bindings to libcontract(3lib). + +use fs_err as fs; +use libc::{c_char, c_int, c_void, pid_t}; +use slog::{warn, Logger}; +use thiserror::Error; + +use std::{ + collections::BTreeSet, + ffi::{CStr, CString}, + os::fd::AsRawFd, + path::Path, +}; + +const CT_ALL: &str = "/system/contract/all"; +// Most Oxide services +const OXIDE_FMRI: &str = "svc:/oxide/"; +// NB: Used for propolis zones +const ILLUMOS_FMRI: &str = "svc:/system/illumos/"; +const CTD_ALL: i32 = 2; + +#[allow(non_camel_case_types)] +type ct_stathdl_t = *mut c_void; + +#[link(name = "contract")] +extern "C" { + fn ct_status_read( + fd: c_int, + detail: c_int, + stathdlp: *mut ct_stathdl_t, + ) -> c_int; + fn ct_status_free(stathdlp: ct_stathdl_t); + fn ct_status_get_id(stathdlp: ct_stathdl_t) -> i32; + fn ct_pr_status_get_members( + stathdlp: ct_stathdl_t, + pidpp: *mut *mut pid_t, + n: *mut u32, + ) -> c_int; + fn ct_pr_status_get_svc_fmri( + stathdlp: ct_stathdl_t, + fmri: *mut *mut c_char, + ) -> c_int; +} + +#[derive(Error, Debug)] +pub enum ContractError { + #[error(transparent)] + FileIo(#[from] std::io::Error), + #[error( + "Failed to call ct_pr_status_get_svc_fmri for contract {ctid}: {error}" + )] + Fmri { ctid: i32, error: std::io::Error }, + #[error( + "Failed to call ct_pr_status_get_members for contract {ctid}: {error}" + )] + Members { ctid: i32, error: std::io::Error }, + #[error("ct_status_read returned successfully but handed back a null ptr for {0}")] + Null(std::path::PathBuf), + #[error("Failed to call ct_status_read on {path}: {error}")] + StatusRead { path: std::path::PathBuf, error: std::io::Error }, +} + +pub struct ContractStatus { + handle: ct_stathdl_t, +} + +impl Drop for ContractStatus { + fn drop(&mut self) { + unsafe { ct_status_free(self.handle) }; + } +} + +macro_rules! libcall_io { + ($fn: ident ( $($arg: expr), * $(,)*) ) => {{ + let res = unsafe { $fn($($arg, )*) }; + if res == 0 { + Ok(res) + } else { + Err(std::io::Error::last_os_error()) + } + }}; + } + +impl ContractStatus { + fn new(contract_status: &Path) -> Result { + let file = fs::File::open(contract_status)?; + let mut handle: ct_stathdl_t = std::ptr::null_mut(); + libcall_io!(ct_status_read(file.as_raw_fd(), CTD_ALL, &mut handle,)) + .map_err(|error| ContractError::StatusRead { + path: contract_status.to_path_buf(), + error, + })?; + + // We don't ever expect the system to hand back a null ptr when + // returning success but let's be extra cautious anyways. 
+ if handle.is_null() { + return Err(ContractError::Null(contract_status.to_path_buf())); + } + + Ok(Self { handle }) + } + + fn get_members(&self) -> Result<&[i32], ContractError> { + let mut numpids = 0; + let mut pids: *mut pid_t = std::ptr::null_mut(); + + let pids = { + libcall_io!(ct_pr_status_get_members( + self.handle, + &mut pids, + &mut numpids, + )) + .map_err(|error| { + let ctid = unsafe { ct_status_get_id(self.handle) }; + ContractError::Members { ctid, error } + })?; + + unsafe { + if pids.is_null() { + &[] + } else { + std::slice::from_raw_parts(pids, numpids as usize) + } + } + }; + + Ok(pids) + } + + fn get_fmri(&self) -> Result<Option<CString>, ContractError> { + // The lifetime of this string is tied to the lifetime of the status + // handle itself and will be cleaned up when the handle is freed. + let mut ptr: *mut c_char = std::ptr::null_mut(); + libcall_io!(ct_pr_status_get_svc_fmri(self.handle, &mut ptr)).map_err( + |error| { + let ctid = unsafe { ct_status_get_id(self.handle) }; + ContractError::Fmri { ctid, error } + }, + )?; + + if ptr.is_null() { + return Ok(None); + } + + let cstr = unsafe { CStr::from_ptr(ptr) }; + Ok(Some(cstr.to_owned())) + } +} + +pub fn find_oxide_pids(log: &Logger) -> Result<BTreeSet<i32>, ContractError> { + let mut pids = BTreeSet::new(); + let ents = fs::read_dir(CT_ALL)?; + for ct in ents { + let ctid = ct?; + let mut path = ctid.path(); + path.push("status"); + + let status = match ContractStatus::new(path.as_path()) { + Ok(status) => status, + Err(e) => { + // There's a race between the time we find the contracts and the + // time we attempt to read the contract's status. We can safely + // skip all of the errors for diagnostics purposes but we should + // leave a log in our wake. + warn!(log, "Failed to read contract ({:?}): {}", path, e); + continue; + } + }; + + let fmri_owned = status.get_fmri()?.unwrap_or_default(); + let fmri = fmri_owned.to_string_lossy(); + if fmri.starts_with(OXIDE_FMRI) || fmri.starts_with(ILLUMOS_FMRI) { + pids.extend(status.get_members()?); + } + } + + Ok(pids) +} diff --git a/sled-diagnostics/src/contract_stub.rs b/sled-diagnostics/src/contract_stub.rs new file mode 100644 index 0000000000..9637c3486d --- /dev/null +++ b/sled-diagnostics/src/contract_stub.rs @@ -0,0 +1,18 @@ +//! Stub implementation for platforms without libcontract(3lib). + +use std::collections::BTreeSet; + +use slog::{warn, Logger}; + +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum ContractError {} + +pub fn find_oxide_pids(log: &Logger) -> Result<BTreeSet<i32>, ContractError> { + warn!( + log, + "Unable to find oxide pids on a non illumos platform, \ + returning empty set" + ); + Ok(BTreeSet::new()) +} diff --git a/sled-diagnostics/src/lib.rs b/sled-diagnostics/src/lib.rs index 466c1ec446..cbb3a5a0d0 100644 --- a/sled-diagnostics/src/lib.rs +++ b/sled-diagnostics/src/lib.rs @@ -5,6 +5,16 @@ //! Diagnostics for an Oxide sled that exposes common support commands. use futures::{stream::FuturesUnordered, StreamExt}; +use slog::Logger; + +cfg_if::cfg_if!
{ + if #[cfg(target_os = "illumos")] { + mod contract; + } else { + mod contract_stub; + use contract_stub as contract; + } +} mod queries; pub use crate::queries::{ @@ -28,7 +38,7 @@ pub async fn ipadm_info( execute_command_with_timeout(c, DEFAULT_TIMEOUT).await }) .collect::>() - .collect::>>() + .collect::>>() .await } @@ -47,6 +57,48 @@ pub async fn dladm_info( execute_command_with_timeout(c, DEFAULT_TIMEOUT).await }) .collect::>() - .collect::>>() + .collect::>>() + .await +} + +pub async fn pargs_oxide_processes( + log: &Logger, +) -> Vec> { + // In a diagnostics context we care about looping over every pid we find, + // but on failure we should just return a single error in a vec that + // represents the entire failed operation. + let pids = match contract::find_oxide_pids(log) { + Ok(pids) => pids, + Err(e) => return vec![Err(e.into())], + }; + + pids.iter() + .map(|pid| pargs_process(*pid)) + .map(|c| async move { + execute_command_with_timeout(c, DEFAULT_TIMEOUT).await + }) + .collect::>() + .collect::>>() + .await +} + +pub async fn pstack_oxide_processes( + log: &Logger, +) -> Vec> { + // In a diagnostics context we care about looping over every pid we find, + // but on failure we should just return a single error in a vec that + // represents the entire failed operation. + let pids = match contract::find_oxide_pids(log) { + Ok(pids) => pids, + Err(e) => return vec![Err(e.into())], + }; + + pids.iter() + .map(|pid| pstack_process(*pid)) + .map(|c| async move { + execute_command_with_timeout(c, DEFAULT_TIMEOUT).await + }) + .collect::>() + .collect::>>() .await } diff --git a/sled-diagnostics/src/queries.rs b/sled-diagnostics/src/queries.rs index 9a66842cb2..2f2b135f0d 100644 --- a/sled-diagnostics/src/queries.rs +++ b/sled-diagnostics/src/queries.rs @@ -9,9 +9,17 @@ use std::{process::Command, time::Duration}; use thiserror::Error; use tokio::io::AsyncReadExt; +#[cfg(target_os = "illumos")] +use crate::contract::ContractError; + +#[cfg(not(target_os = "illumos"))] +use crate::contract_stub::ContractError; + const DLADM: &str = "/usr/sbin/dladm"; const IPADM: &str = "/usr/sbin/ipadm"; const PFEXEC: &str = "/usr/bin/pfexec"; +const PSTACK: &str = "/usr/bin/pstack"; +const PARGS: &str = "/usr/bin/pargs"; const ZONEADM: &str = "/usr/sbin/zoneadm"; pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); @@ -22,6 +30,8 @@ pub trait SledDiagnosticsCommandHttpOutput { #[derive(Error, Debug)] pub enum SledDiagnosticsCmdError { + #[error("libcontract error: {0}")] + Contract(#[from] ContractError), #[error("Failed to duplicate pipe for command [{command}]: {error}")] Dup { command: String, error: std::io::Error }, #[error("Failed to proccess output for command [{command}]: {error}")] @@ -205,9 +215,17 @@ pub fn dladm_show_linkprop() -> Command { cmd } -/* - * Public API - */ +pub fn pargs_process(pid: i32) -> Command { + let mut cmd = std::process::Command::new(PFEXEC); + cmd.env_clear().arg(PARGS).arg("-ae").arg(pid.to_string()); + cmd +} + +pub fn pstack_process(pid: i32) -> Command { + let mut cmd = std::process::Command::new(PFEXEC); + cmd.env_clear().arg(PSTACK).arg(pid.to_string()); + cmd +} #[cfg(test)] mod test { From 78ca57a8e221489cbdcc35f923439b92c1d24da8 Mon Sep 17 00:00:00 2001 From: Michael Zeller Date: Tue, 7 Jan 2025 20:58:11 -0500 Subject: [PATCH 8/8] [sled-diagnostics] Output for support bundles should be structured (#7228) This introduces the `SledDiagnosticsQueryOutput` type so support bundles can now have structured output in collected files. 
This makes it potentially easier for an operator to write scripts that search for particular commands based on various fields. This is on top of: - #7194 - #7193 --- Cargo.lock | 3 + openapi/sled-agent.json | 118 +++++++++++++++++++++---- sled-agent/api/Cargo.toml | 1 + sled-agent/api/src/lib.rs | 11 +-- sled-agent/src/http_entrypoints.rs | 88 +++++++++--------- sled-agent/src/sim/http_entrypoints.rs | 15 ++-- sled-diagnostics/Cargo.toml | 2 + sled-diagnostics/src/lib.rs | 2 +- sled-diagnostics/src/queries.rs | 62 ++++++++----- 9 files changed, 208 insertions(+), 94 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 459467e96d..ea34752d1a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10741,6 +10741,7 @@ dependencies = [ "schemars", "serde", "sled-agent-types", + "sled-diagnostics", "sled-hardware-types", "uuid", ] @@ -10808,6 +10809,8 @@ dependencies = [ "futures", "libc", "omicron-workspace-hack", + "schemars", + "serde", "slog", "thiserror 1.0.69", "tokio", diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 9b750b0cf7..90eea0504b 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -678,10 +678,16 @@ "operationId": "support_dladm_info", "responses": { "200": { - "description": "", + "description": "successful operation", "content": { - "*/*": { - "schema": {} + "application/json": { + "schema": { + "title": "Array_of_SledDiagnosticsQueryOutput", + "type": "array", + "items": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } + } } } }, @@ -699,10 +705,16 @@ "operationId": "support_ipadm_info", "responses": { "200": { - "description": "", + "description": "successful operation", "content": { - "*/*": { - "schema": {} + "application/json": { + "schema": { + "title": "Array_of_SledDiagnosticsQueryOutput", + "type": "array", + "items": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } + } } } }, @@ -720,10 +732,16 @@ "operationId": "support_pargs_info", "responses": { "200": { - "description": "", + "description": "successful operation", "content": { - "*/*": { - "schema": {} + "application/json": { + "schema": { + "title": "Array_of_SledDiagnosticsQueryOutput", + "type": "array", + "items": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } + } } } }, @@ -741,10 +759,16 @@ "operationId": "support_pstack_info", "responses": { "200": { - "description": "", + "description": "successful operation", "content": { - "*/*": { - "schema": {} + "application/json": { + "schema": { + "title": "Array_of_SledDiagnosticsQueryOutput", + "type": "array", + "items": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } + } } } }, @@ -762,10 +786,12 @@ "operationId": "support_zoneadm_info", "responses": { "200": { - "description": "", + "description": "successful operation", "content": { - "*/*": { - "schema": {} + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } } } }, @@ -5569,6 +5595,68 @@ "type": "string", "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" }, + "SledDiagnosticsQueryOutput": { + "oneOf": [ + { + "type": "object", + "properties": { + "success": { + "type": "object", + "properties": { + "command": { + "description": "The command and its arguments.", + "type": "string" + }, + "exit_code": { + "nullable": true, + "description": "The exit code if one was present when the command exited.", + "type": 
"integer", + "format": "int32" + }, + "exit_status": { + "description": "The exit status of the command. This will be the exit code (if any) and exit reason such as from a signal.", + "type": "string" + }, + "stdio": { + "description": "Any stdout/stderr produced by the command.", + "type": "string" + } + }, + "required": [ + "command", + "exit_status", + "stdio" + ] + } + }, + "required": [ + "success" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "failure": { + "type": "object", + "properties": { + "error": { + "description": "The reason the command failed to execute.", + "type": "string" + } + }, + "required": [ + "error" + ] + } + }, + "required": [ + "failure" + ], + "additionalProperties": false + } + ] + }, "SledIdentifiers": { "description": "Identifiers for a single sled.\n\nThis is intended primarily to be used in timeseries, to identify sled from which metric data originates.", "type": "object", diff --git a/sled-agent/api/Cargo.toml b/sled-agent/api/Cargo.toml index 95e9552f53..e53dfb6948 100644 --- a/sled-agent/api/Cargo.toml +++ b/sled-agent/api/Cargo.toml @@ -19,4 +19,5 @@ schemars.workspace = true serde.workspace = true sled-agent-types.workspace = true sled-hardware-types.workspace = true +sled-diagnostics.workspace = true uuid.workspace = true diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 25b943064b..18f397a51c 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -54,6 +54,7 @@ use sled_agent_types::{ ZoneBundleId, ZoneBundleMetadata, }, }; +use sled_diagnostics::SledDiagnosticsQueryOutput; use uuid::Uuid; #[dropshot::api_description] @@ -630,7 +631,7 @@ pub trait SledAgentApi { }] async fn support_zoneadm_info( request_context: RequestContext, - ) -> Result, HttpError>; + ) -> Result, HttpError>; #[endpoint { method = GET, @@ -638,7 +639,7 @@ pub trait SledAgentApi { }] async fn support_ipadm_info( request_context: RequestContext, - ) -> Result, HttpError>; + ) -> Result>, HttpError>; #[endpoint { method = GET, @@ -646,7 +647,7 @@ pub trait SledAgentApi { }] async fn support_dladm_info( request_context: RequestContext, - ) -> Result, HttpError>; + ) -> Result>, HttpError>; #[endpoint { method = GET, @@ -654,7 +655,7 @@ pub trait SledAgentApi { }] async fn support_pargs_info( request_context: RequestContext, - ) -> Result, HttpError>; + ) -> Result>, HttpError>; #[endpoint { method = GET, @@ -662,7 +663,7 @@ pub trait SledAgentApi { }] async fn support_pstack_info( request_context: RequestContext, - ) -> Result, HttpError>; + ) -> Result>, HttpError>; } #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 54e93aa792..0c7706a459 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -53,7 +53,9 @@ use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, StorageLimit, ZoneBundleId, ZoneBundleMetadata, }; -use sled_diagnostics::SledDiagnosticsCommandHttpOutput; +use sled_diagnostics::{ + SledDiagnosticsCommandHttpOutput, SledDiagnosticsQueryOutput, +}; use std::collections::BTreeMap; type SledApiDescription = ApiDescription; @@ -964,73 +966,65 @@ impl SledAgentApi for SledAgentImpl { async fn support_zoneadm_info( request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let sa = request_context.context(); let res = sa.support_zoneadm_info().await; - 
Ok(HttpResponseOk(FreeformBody(res.get_output().into()))) + Ok(HttpResponseOk(res.get_output())) } async fn support_ipadm_info( request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { let sa = request_context.context(); - let output = sa - .support_ipadm_info() - .await - .into_iter() - .map(|cmd| cmd.get_output()) - .collect::>() - .as_slice() - .join("\n\n"); - - Ok(HttpResponseOk(FreeformBody(output.into()))) + Ok(HttpResponseOk( + sa.support_ipadm_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>(), + )) } async fn support_dladm_info( request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { let sa = request_context.context(); - let output = sa - .support_dladm_info() - .await - .into_iter() - .map(|cmd| cmd.get_output()) - .collect::>() - .as_slice() - .join("\n\n"); - - Ok(HttpResponseOk(FreeformBody(output.into()))) + Ok(HttpResponseOk( + sa.support_dladm_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>(), + )) } async fn support_pargs_info( request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { let sa = request_context.context(); - let output = sa - .support_pargs_info() - .await - .into_iter() - .map(|cmd| cmd.get_output()) - .collect::>() - .as_slice() - .join("\n\n"); - - Ok(HttpResponseOk(FreeformBody(output.into()))) + Ok(HttpResponseOk( + sa.support_pargs_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>(), + )) } async fn support_pstack_info( request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { let sa = request_context.context(); - let output = sa - .support_pstack_info() - .await - .into_iter() - .map(|cmd| cmd.get_output()) - .collect::>() - .as_slice() - .join("\n\n"); - - Ok(HttpResponseOk(FreeformBody(output.into()))) + Ok(HttpResponseOk( + sa.support_pstack_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>(), + )) } } diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 66414fc74e..a0634d132a 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -58,6 +58,7 @@ use sled_agent_types::zone_bundle::CleanupContext; use sled_agent_types::zone_bundle::CleanupCount; use sled_agent_types::zone_bundle::ZoneBundleId; use sled_agent_types::zone_bundle::ZoneBundleMetadata; +use sled_diagnostics::SledDiagnosticsQueryOutput; use std::collections::BTreeMap; use std::sync::Arc; @@ -722,31 +723,35 @@ impl SledAgentApi for SledAgentSimImpl { async fn support_zoneadm_info( _request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result, HttpError> { method_unimplemented() } async fn support_ipadm_info( _request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { method_unimplemented() } async fn support_dladm_info( _request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { method_unimplemented() } async fn support_pargs_info( _request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { method_unimplemented() } async fn support_pstack_info( _request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { method_unimplemented() } } diff --git a/sled-diagnostics/Cargo.toml b/sled-diagnostics/Cargo.toml index 9e10dfe1c7..afbe642d76 100644 --- a/sled-diagnostics/Cargo.toml +++ b/sled-diagnostics/Cargo.toml @@ -12,6 +12,8 
@@ fs-err.workspace = true futures.workspace = true libc.workspace = true omicron-workspace-hack.workspace = true +schemars.workspace = true +serde.workspace = true slog.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["full"] } diff --git a/sled-diagnostics/src/lib.rs b/sled-diagnostics/src/lib.rs index cbb3a5a0d0..fb596977f1 100644 --- a/sled-diagnostics/src/lib.rs +++ b/sled-diagnostics/src/lib.rs @@ -19,7 +19,7 @@ cfg_if::cfg_if! { mod queries; pub use crate::queries::{ SledDiagnosticsCmdError, SledDiagnosticsCmdOutput, - SledDiagnosticsCommandHttpOutput, + SledDiagnosticsCommandHttpOutput, SledDiagnosticsQueryOutput, }; use queries::*; diff --git a/sled-diagnostics/src/queries.rs b/sled-diagnostics/src/queries.rs index 2f2b135f0d..2e8cf55993 100644 --- a/sled-diagnostics/src/queries.rs +++ b/sled-diagnostics/src/queries.rs @@ -4,8 +4,13 @@ //! Wrapper for command execution with timeout. -use std::{process::Command, time::Duration}; +use std::{ + process::{Command, ExitStatus}, + time::Duration, +}; + +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::io::AsyncReadExt; @@ -25,7 +30,27 @@ const ZONEADM: &str = "/usr/sbin/zoneadm"; pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); pub trait SledDiagnosticsCommandHttpOutput { - fn get_output(self) -> String; + fn get_output(self) -> SledDiagnosticsQueryOutput; +} + +#[derive(Clone, Debug, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SledDiagnosticsQueryOutput { + Success { + /// The command and its arguments. + command: String, + /// Any stdout/stderr produced by the command. + stdio: String, + /// The exit status of the command. This will be the exit code (if any) + /// and exit reason such as from a signal. + exit_status: String, + /// The exit code if one was present when the command exited. + exit_code: Option<i32>, + }, + Failure { + /// The reason the command failed to execute. + error: String, + }, } #[derive(Error, Debug)] @@ -54,24 +79,23 @@ pub enum SledDiagnosticsCmdError { pub struct SledDiagnosticsCmdOutput { pub command: String, pub stdio: String, - pub exit_status: String, -} - -impl std::fmt::Display for SledDiagnosticsCmdOutput { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "Command executed [{}]:", self.command)?; - writeln!(f, " ==== stdio ====\n{}", self.stdio)?; - writeln!(f, " ==== exit status ====\n{}", self.exit_status) - } + pub exit_status: ExitStatus, } impl SledDiagnosticsCommandHttpOutput for Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> { - fn get_output(self) -> String { + fn get_output(self) -> SledDiagnosticsQueryOutput { match self { - Ok(output) => format!("{output}"), - Err(error) => format!("{error}"), + Ok(output) => SledDiagnosticsQueryOutput::Success { + command: output.command, + stdio: output.stdio, + exit_status: output.exit_status.to_string(), + exit_code: output.exit_status.code(), + }, + Err(error) => { + SledDiagnosticsQueryOutput::Failure { error: error.to_string() } + } } } } @@ -134,13 +158,9 @@ async fn execute( } })?; - let exit_status = - child.wait().await.map(|es| format!("{es}")).map_err(|e| { - SledDiagnosticsCmdError::Wait { - command: cmd_string.clone(), - error: e, - } - })?; + let exit_status = child.wait().await.map_err(|e| { + SledDiagnosticsCmdError::Wait { command: cmd_string.clone(), error: e } + })?; Ok(SledDiagnosticsCmdOutput { command: cmd_string, stdio, exit_status }) }
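
A minimal sketch of the operator-scripting workflow described in the PATCH 8/8 message, assuming the serde (with its derive feature) and serde_json crates: it parses the kind of JSON array the reworked support_*_info endpoints now return and filters the results by structured fields. QueryOutput is a hypothetical local stand-in that mirrors SledDiagnosticsQueryOutput's wire shape (externally tagged, snake_case variant names, as in the OpenAPI schema above) rather than an import from sled-diagnostics, and the sample payload is invented for illustration.

use serde::Deserialize;

// Hypothetical mirror of SledDiagnosticsQueryOutput's serialized form; field
// names and tagging follow the regenerated sled-agent.json schema.
#[allow(dead_code)] // all fields are listed even though the example reads only a few
#[derive(Debug, Deserialize)]
#[serde(rename_all = "snake_case")]
enum QueryOutput {
    Success {
        command: String,
        stdio: String,
        exit_status: String,
        exit_code: Option<i32>,
    },
    Failure {
        error: String,
    },
}

fn main() -> Result<(), serde_json::Error> {
    // Invented sample of the array-of-results body returned by e.g.
    // support_ipadm_info after this patch.
    let body = r#"[
        {"success": {"command": "/usr/sbin/ipadm show-addr",
                     "stdio": "ADDROBJ ...",
                     "exit_status": "exit status: 0",
                     "exit_code": 0}},
        {"failure": {"error": "command timed out"}}
    ]"#;

    let outputs: Vec<QueryOutput> = serde_json::from_str(body)?;

    // Structured fields let a script filter without scraping concatenated
    // text, e.g. report anything that did not exit cleanly.
    for out in &outputs {
        match out {
            QueryOutput::Success { command, exit_code, .. }
                if *exit_code != Some(0) =>
            {
                println!("{command}: exit code {exit_code:?}");
            }
            QueryOutput::Failure { error } => println!("failed to run: {error}"),
            _ => {}
        }
    }
    Ok(())
}

Because the enum is externally tagged, each array element is keyed by its variant name ("success" or "failure"), matching the oneOf layout in the regenerated sled-agent.json.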