From f59c55e89eb7cc9897338adc7ce7b393284eb44d Mon Sep 17 00:00:00 2001 From: Rain Date: Thu, 19 Dec 2024 11:49:33 -0800 Subject: [PATCH 01/11] [omicron-package] add a way to show what Cargo commands will be run (#7268) Add a way to see what commands will be run, without actually running them. As part of this, refactor `CargoPlan` and change its meaning slightly -- it now indicates the set of debug and release packages that would be built. In the future, we'll be able to use these structures to add lints around which features get built. --- Cargo.lock | 1 + package/Cargo.toml | 1 + package/src/bin/omicron-package.rs | 174 ++++++++++------------------- package/src/cargo_plan.rs | 172 ++++++++++++++++++++++++++++ package/src/lib.rs | 3 + 5 files changed, 238 insertions(+), 113 deletions(-) create mode 100644 package/src/cargo_plan.rs diff --git a/Cargo.lock b/Cargo.lock index 4b1809d3b8..05a7082aaa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7140,6 +7140,7 @@ dependencies = [ "ring 0.17.8", "semver 1.0.23", "serde", + "shell-words", "sled-hardware", "slog", "slog-async", diff --git a/package/Cargo.toml b/package/Cargo.toml index b63a5ed96f..ccc768bb8e 100644 --- a/package/Cargo.toml +++ b/package/Cargo.toml @@ -25,6 +25,7 @@ reqwest = { workspace = true, features = [ "rustls-tls" ] } ring.workspace = true semver.workspace = true serde.workspace = true +shell-words.workspace = true sled-hardware.workspace = true slog.workspace = true slog-async.workspace = true diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index cc4050cbce..6ea10ae27b 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -4,12 +4,13 @@ //! Utility for bundling target binaries as tarfiles. -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use futures::stream::{self, StreamExt, TryStreamExt}; use illumos_utils::{zfs, zone}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; +use omicron_package::cargo_plan::build_cargo_plan; use omicron_package::target::KnownTarget; use omicron_package::{parse, BuildCommand, DeployCommand, TargetCommand}; use omicron_zone_package::config::{Config as PackageConfig, PackageMap}; @@ -24,7 +25,7 @@ use slog::o; use slog::Drain; use slog::Logger; use slog::{info, warn}; -use std::collections::{BTreeMap, BTreeSet}; +use std::collections::BTreeMap; use std::env; use std::fs::create_dir_all; use std::io::Write; @@ -105,128 +106,59 @@ struct Args { subcommand: SubCommand, } -#[derive(Debug, Default)] -struct CargoPlan<'a> { - command: &'a str, - packages: BTreeSet<&'a String>, - bins: BTreeSet<&'a String>, - features: BTreeSet<&'a String>, - release: bool, -} +async fn do_show_cargo_commands(config: &Config) -> Result<()> { + let metadata = cargo_metadata::MetadataCommand::new().no_deps().exec()?; + let features = config.cargo_features(); + let cargo_plan = + build_cargo_plan(&metadata, config.packages_to_build(), &features)?; -impl<'a> CargoPlan<'a> { - async fn run(&self, log: &Logger) -> Result<()> { - if self.bins.is_empty() { - return Ok(()); - } + let release_command = cargo_plan.release.build_command("build"); + let debug_command = cargo_plan.debug.build_command("build"); - let mut cmd = Command::new("cargo"); - // We rely on the rust-toolchain.toml file for toolchain information, - // rather than specifying one within the packaging tool. 
- cmd.arg(self.command); - // We specify _both_ --package and --bin; --bin does not imply - // --package, and without any --package options Cargo unifies features - // across all workspace default members. See rust-lang/cargo#8157. - for package in &self.packages { - cmd.arg("--package").arg(package); - } - for bin in &self.bins { - cmd.arg("--bin").arg(bin); - } - if !self.features.is_empty() { - cmd.arg("--features").arg(self.features.iter().fold( - String::new(), - |mut acc, s| { - if !acc.is_empty() { - acc.push(' '); - } - acc.push_str(s); - acc - }, - )); - } - if self.release { - cmd.arg("--release"); - } - info!(log, "running: {:?}", cmd.as_std()); - let status = cmd - .status() - .await - .context(format!("Failed to run command: ({:?})", cmd))?; - if !status.success() { - bail!("Failed to build packages"); - } + print!("release command: "); + if let Some(command) = release_command { + println!("{}", command_to_string(&command)); + } else { + println!("(none)"); + } - Ok(()) + print!("debug command: "); + if let Some(command) = debug_command { + println!("{}", command_to_string(&command)); + } else { + println!("(none)"); } + + Ok(()) +} + +fn command_to_string(command: &Command) -> String { + // Use shell-words to join the command and arguments into a single string. + let mut v = vec![command + .as_std() + .get_program() + .to_str() + .expect("program is valid UTF-8")]; + v.extend( + command + .as_std() + .get_args() + .map(|arg| arg.to_str().expect("argument is valid UTF-8")), + ); + + shell_words::join(&v) } async fn do_for_all_rust_packages( config: &Config, command: &str, ) -> Result<()> { - // Collect a map of all of the workspace packages - let workspace = cargo_metadata::MetadataCommand::new().no_deps().exec()?; - let workspace_pkgs = workspace - .packages - .into_iter() - .filter_map(|package| { - workspace - .workspace_members - .contains(&package.id) - .then_some((package.name.clone(), package)) - }) - .collect::>(); + let metadata = cargo_metadata::MetadataCommand::new().no_deps().exec()?; + let features = config.cargo_features(); + let cargo_plan = + build_cargo_plan(&metadata, config.packages_to_build(), &features)?; - // Generate a list of all features we might want to request - let features = config - .target - .0 - .iter() - .map(|(name, value)| format!("{name}-{value}")) - .collect::>(); - - // We split the packages to be built into "release" and "debug" lists - let mut release = - CargoPlan { command, release: true, ..Default::default() }; - let mut debug = CargoPlan { command, release: false, ..Default::default() }; - - for (name, pkg) in config.packages_to_build().0 { - // If this is a Rust package, `name` (the map key) is the name of the - // corresponding Rust crate. - if let PackageSource::Local { rust: Some(rust_pkg), .. 
} = &pkg.source { - let plan = if rust_pkg.release { &mut release } else { &mut debug }; - // Add the package name to the plan - plan.packages.insert(name); - // Get the package metadata - let metadata = workspace_pkgs.get(name).with_context(|| { - format!("package '{name}' is not a workspace package") - })?; - // Add the binaries we want to build to the plan - let bins = metadata - .targets - .iter() - .filter_map(|target| target.is_bin().then_some(&target.name)) - .collect::>(); - for bin in &rust_pkg.binary_names { - ensure!( - bins.contains(bin), - "bin target '{bin}' does not belong to package '{name}'" - ); - plan.bins.insert(bin); - } - // Add all features we want to request to the plan - plan.features.extend( - features - .iter() - .filter(|feature| metadata.features.contains_key(*feature)), - ); - } - } - - release.run(&config.log).await?; - debug.run(&config.log).await?; - Ok(()) + cargo_plan.run(command, &config.log).await } async fn do_check(config: &Config) -> Result<()> { @@ -1051,6 +983,19 @@ impl Config { } filtered_packages } + + /// Return a list of all possible Cargo features that could be requested for + /// the packages being built. + /// + /// Out of these, the features that actually get requested are determined by + /// which features are available for the list of packages being built. + fn cargo_features(&self) -> Vec { + self.target + .0 + .iter() + .map(|(name, value)| format!("{name}-{value}")) + .collect::>() + } } #[tokio::main] @@ -1142,6 +1087,9 @@ async fn main() -> Result<()> { ) .await?; } + SubCommand::Build(BuildCommand::ShowCargoCommands) => { + do_show_cargo_commands(&get_config()?).await?; + } SubCommand::Build(BuildCommand::Check) => { do_check(&get_config()?).await? } diff --git a/package/src/cargo_plan.rs b/package/src/cargo_plan.rs new file mode 100644 index 0000000000..1a32b199fb --- /dev/null +++ b/package/src/cargo_plan.rs @@ -0,0 +1,172 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::collections::BTreeMap; +use std::collections::BTreeSet; + +use anyhow::bail; +use anyhow::ensure; +use anyhow::Context; +use anyhow::Result; +use cargo_metadata::Metadata; +use omicron_zone_package::config::PackageMap; +use omicron_zone_package::package::PackageSource; +use slog::info; +use slog::Logger; +use tokio::process::Command; + +/// For a configuration, build a plan: the set of packages, binaries, and +/// features to operate on in release and debug modes. +pub fn build_cargo_plan<'a>( + metadata: &Metadata, + package_map: PackageMap<'a>, + features: &'a [String], +) -> Result> { + // Collect a map of all of the workspace packages + let workspace_pkgs = metadata + .packages + .iter() + .filter_map(|package| { + metadata + .workspace_members + .contains(&package.id) + .then_some((package.name.clone(), package)) + }) + .collect::>(); + + let mut release = CargoTargets::new(BuildKind::Release); + let mut debug = CargoTargets::new(BuildKind::Debug); + + for (name, pkg) in package_map.0 { + // If this is a Rust package, `name` (the map key) is the name of the + // corresponding Rust crate. + if let PackageSource::Local { rust: Some(rust_pkg), .. 
} = &pkg.source { + let plan = if rust_pkg.release { &mut release } else { &mut debug }; + // Add the package name to the plan + plan.packages.insert(name); + // Get the package metadata + let metadata = workspace_pkgs.get(name).with_context(|| { + format!("package '{name}' is not a workspace package") + })?; + // Add the binaries we want to build to the plan + let bins = metadata + .targets + .iter() + .filter_map(|target| target.is_bin().then_some(&target.name)) + .collect::>(); + for bin in &rust_pkg.binary_names { + ensure!( + bins.contains(bin), + "bin target '{bin}' does not belong to package '{name}'" + ); + plan.bins.insert(bin); + } + // Add all features we want to request to the plan + plan.features.extend( + features + .iter() + .filter(|feature| metadata.features.contains_key(*feature)), + ); + } + } + + Ok(CargoPlan { release, debug }) +} + +#[derive(Debug)] +pub struct CargoPlan<'a> { + pub release: CargoTargets<'a>, + pub debug: CargoTargets<'a>, +} + +impl CargoPlan<'_> { + pub async fn run(&self, command: &str, log: &Logger) -> Result<()> { + self.release.run(command, log).await?; + self.debug.run(command, log).await?; + Ok(()) + } +} + +/// A set of packages, binaries, and features to operate on. +#[derive(Debug)] +pub struct CargoTargets<'a> { + pub kind: BuildKind, + pub packages: BTreeSet<&'a String>, + pub bins: BTreeSet<&'a String>, + pub features: BTreeSet<&'a String>, +} + +impl CargoTargets<'_> { + fn new(kind: BuildKind) -> Self { + Self { + kind, + packages: BTreeSet::new(), + bins: BTreeSet::new(), + features: BTreeSet::new(), + } + } + + pub fn build_command(&self, command: &str) -> Option { + if self.bins.is_empty() { + return None; + } + + let mut cmd = Command::new("cargo"); + // We rely on the rust-toolchain.toml file for toolchain information, + // rather than specifying one within the packaging tool. + cmd.arg(command); + // We specify _both_ --package and --bin; --bin does not imply + // --package, and without any --package options Cargo unifies features + // across all workspace default members. See rust-lang/cargo#8157. + for package in &self.packages { + cmd.arg("--package").arg(package); + } + for bin in &self.bins { + cmd.arg("--bin").arg(bin); + } + if !self.features.is_empty() { + cmd.arg("--features").arg(self.features.iter().fold( + String::new(), + |mut acc, s| { + if !acc.is_empty() { + acc.push(' '); + } + acc.push_str(s); + acc + }, + )); + } + match self.kind { + BuildKind::Release => { + cmd.arg("--release"); + } + BuildKind::Debug => {} + } + + Some(cmd) + } + + pub async fn run(&self, command: &str, log: &Logger) -> Result<()> { + let Some(mut cmd) = self.build_command(command) else { + return Ok(()); + }; + + info!(log, "running: {:?}", cmd.as_std()); + let status = cmd + .status() + .await + .context(format!("Failed to run command: ({:?})", cmd))?; + if !status.success() { + bail!("Failed to build packages"); + } + + Ok(()) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum BuildKind { + Release, + Debug, +} diff --git a/package/src/lib.rs b/package/src/lib.rs index b37c1774fd..9d58f476b2 100644 --- a/package/src/lib.rs +++ b/package/src/lib.rs @@ -5,6 +5,7 @@ use clap::Subcommand; use serde::de::DeserializeOwned; use thiserror::Error; +pub mod cargo_plan; pub mod dot; pub mod target; @@ -130,6 +131,8 @@ pub enum BuildCommand { /// The version to be stamped onto the package. version: semver::Version, }, + /// Show the Cargo commands that would be run to build the packages. 
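+    ///
+    /// With clap's default kebab-case naming, this is invoked as
+    /// `omicron-package show-cargo-commands`; it prints the release and
+    /// debug `cargo build` invocations without executing them.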
+ ShowCargoCommands, /// Checks the packages specified in a manifest, without building them. Check, } From 460f038ccd57fead6a78b01ae9bf8b4fee38d54f Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 19 Dec 2024 18:25:29 -0500 Subject: [PATCH 02/11] Pick a non-expunged clone source (#7283) When performing region snapshot replacement, the associated start saga chose the request's region snapshot as the clone source, but if that region snapshot was backed by an expunged dataset then it may be gone. This commit adds logic to choose another clone source, either another region snapshot from the same snapshot, or one of the read-only regions for that snapshot. Basic sanity tests were added for ensuring that region replacements and region snapshot replacements resulting from expungement can occur. It was an oversight not to originally include these! Rn order to support these new sanity tests, the simulated pantry has to fake activating volumes in the background. This commit also refactors the simulated Pantry to have one Mutex around an "inner" struct instead of many Mutexes. Fixes #7209 --- nexus/db-queries/src/db/datastore/region.rs | 36 ++ .../src/db/datastore/region_snapshot.rs | 36 ++ .../region_snapshot_replacement_start.rs | 568 +++++++++++++++++- .../crucible_replacements.rs | 379 +++++++++++- sled-agent/src/sim/storage.rs | 97 +-- 5 files changed, 1043 insertions(+), 73 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 8e59462aa3..67bd37cf69 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -548,6 +548,42 @@ impl DataStore { Ok(records) } + + /// Find regions not on expunged disks that match a volume id + pub async fn find_non_expunged_regions( + &self, + opctx: &OpContext, + volume_id: Uuid, + ) -> LookupResult> { + let conn = self.pool_connection_authorized(opctx).await?; + + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::physical_disk::dsl as physical_disk_dsl; + use db::schema::region::dsl as region_dsl; + use db::schema::zpool::dsl as zpool_dsl; + + region_dsl::region + .filter(region_dsl::dataset_id.eq_any( + dataset_dsl::dataset + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::pool_id.eq_any( + zpool_dsl::zpool + .filter(zpool_dsl::time_deleted.is_null()) + .filter(zpool_dsl::physical_disk_id.eq_any( + physical_disk_dsl::physical_disk + .filter(physical_disk_dsl::disk_policy.eq(PhysicalDiskPolicy::InService)) + .select(physical_disk_dsl::id) + )) + .select(zpool_dsl::id) + )) + .select(dataset_dsl::id) + )) + .filter(region_dsl::volume_id.eq(volume_id)) + .select(Region::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } } #[cfg(test)] diff --git a/nexus/db-queries/src/db/datastore/region_snapshot.rs b/nexus/db-queries/src/db/datastore/region_snapshot.rs index 0129869f4f..f7a34fdb52 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot.rs @@ -120,4 +120,40 @@ impl DataStore { .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + /// Find region snapshots not on expunged disks that match a snapshot id + pub async fn find_non_expunged_region_snapshots( + &self, + opctx: &OpContext, + snapshot_id: Uuid, + ) -> LookupResult> { + let conn = self.pool_connection_authorized(opctx).await?; + + use db::schema::dataset::dsl as dataset_dsl; + use 
db::schema::physical_disk::dsl as physical_disk_dsl; + use db::schema::region_snapshot::dsl as region_snapshot_dsl; + use db::schema::zpool::dsl as zpool_dsl; + + region_snapshot_dsl::region_snapshot + .filter(region_snapshot_dsl::dataset_id.eq_any( + dataset_dsl::dataset + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::pool_id.eq_any( + zpool_dsl::zpool + .filter(zpool_dsl::time_deleted.is_null()) + .filter(zpool_dsl::physical_disk_id.eq_any( + physical_disk_dsl::physical_disk + .filter(physical_disk_dsl::disk_policy.eq(PhysicalDiskPolicy::InService)) + .select(physical_disk_dsl::id) + )) + .select(zpool_dsl::id) + )) + .select(dataset_dsl::id) + )) + .filter(region_snapshot_dsl::snapshot_id.eq(snapshot_id)) + .select(RegionSnapshot::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } } diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index bb5fd60209..b9ed75c288 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -65,6 +65,7 @@ use crate::app::{authn, db}; use nexus_types::identity::Asset; use nexus_types::identity::Resource; use omicron_common::api::external::Error; +use omicron_uuid_kinds::DatasetUuid; use serde::Deserialize; use serde::Serialize; use sled_agent_client::types::CrucibleOpts; @@ -91,6 +92,9 @@ declare_saga_actions! { + rsrss_set_saga_id - rsrss_set_saga_id_undo } + GET_CLONE_SOURCE -> "clone_source" { + + rsrss_get_clone_source + } GET_ALLOC_REGION_PARAMS -> "alloc_region_params" { + rsrss_get_alloc_region_params } @@ -194,6 +198,7 @@ impl NexusSaga for SagaRegionSnapshotReplacementStart { )); builder.append(set_saga_id_action()); + builder.append(get_clone_source_action()); builder.append(get_alloc_region_params_action()); builder.append(alloc_new_region_action()); builder.append(find_new_region_action()); @@ -265,6 +270,169 @@ async fn rsrss_set_saga_id_undo( Ok(()) } +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] +enum CloneSource { + RegionSnapshot { dataset_id: DatasetUuid, region_id: Uuid }, + Region { region_id: Uuid }, +} + +async fn rsrss_get_clone_source( + sagactx: NexusActionContext, +) -> Result { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let log = osagactx.log(); + + // Find either a region snapshot or a read-only region that is associated + // with the request snapshot that has not been expunged, and return that as + // the source to be used to populate the read-only region that will replace + // the request's region snapshot. + // + // Importantly, determine the clone source before new region alloc step in + // this saga, otherwise the query that searches for read-only region + // candidates will match the newly allocated region (that is not created + // yet!). + // + // Choose a clone source based on the following policy: + // + // - choose a region snapshot associated with the one being replaced + // + // - choose a read-only region from the associated snapshot volume + // + // - choose the region snapshot being replaced (only if it is not expunged! 
+ // if the downstairs being cloned from is on an expunged dataset, we have + // to assume that the clone will never succeed, even if the expunged + // thing is still there) + // + // The policy here prefers to choose a clone source that isn't the region + // snapshot in the request: if it's flaky, it shouldn't be used as a clone + // source! This function does not know _why_ the replacement request was + // created for that region snapshot, and assumes that there may be a problem + // with it and will choose it as a last resort (if no other candidate clone + // source is found and the request's region snapshot is not on an expunged + // dataset, then it has to be chosen as a clone source, as the alternative + // is lost data). The request's region snapshot may also be completely fine, + // for example if a scrub is being requested. + // + // Also, the policy also prefers to choose to clone from a region snapshot + // instead of a read-only region: this is an arbitrary order, there is no + // reason behind this. The region snapshots and read-only regions will have + // identical contents. + + // First, try to select another region snapshot that's part of this + // snapshot. + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let mut non_expunged_region_snapshots = osagactx + .datastore() + .find_non_expunged_region_snapshots( + &opctx, + params.request.old_snapshot_id, + ) + .await + .map_err(ActionError::action_failed)?; + + // Filter out the request's region snapshot - if there are no other + // candidates, this could be chosen later in this function. + + non_expunged_region_snapshots.retain(|rs| { + !(rs.dataset_id == params.request.old_dataset_id + && rs.region_id == params.request.old_region_id + && rs.snapshot_id == params.request.old_snapshot_id) + }); + + if let Some(candidate) = non_expunged_region_snapshots.pop() { + info!( + log, + "found another non-expunged region snapshot"; + "snapshot_id" => %params.request.old_snapshot_id, + "dataset_id" => %candidate.dataset_id, + "region_id" => %candidate.region_id, + ); + + return Ok(CloneSource::RegionSnapshot { + dataset_id: candidate.dataset_id.into(), + region_id: candidate.region_id, + }); + } + + // Next, try to select a read-only region that's associated with the + // snapshot volume + + info!( + log, + "no region snapshot clone source candidates"; + "snapshot_id" => %params.request.old_snapshot_id, + ); + + // Look up the existing snapshot + let maybe_db_snapshot = osagactx + .datastore() + .snapshot_get(&opctx, params.request.old_snapshot_id) + .await + .map_err(ActionError::action_failed)?; + + let Some(db_snapshot) = maybe_db_snapshot else { + return Err(ActionError::action_failed(Error::internal_error( + &format!( + "snapshot {} was hard deleted!", + params.request.old_snapshot_id + ), + ))); + }; + + let mut non_expunged_read_only_regions = osagactx + .datastore() + .find_non_expunged_regions(&opctx, db_snapshot.volume_id) + .await + .map_err(ActionError::action_failed)?; + + if let Some(candidate) = non_expunged_read_only_regions.pop() { + info!( + log, + "found region clone source candidate"; + "snapshot_id" => %params.request.old_snapshot_id, + "dataset_id" => %candidate.dataset_id(), + "region_id" => %candidate.id(), + ); + + return Ok(CloneSource::Region { region_id: candidate.id() }); + } + + // If no other non-expunged region snapshot or read-only region exists, then + // check if the request's region snapshot is non-expunged. 
This will use the + // region snapshot that is being replaced as a clone source, which may not + // work if there's a problem with that region snapshot that this replacement + // request is meant to fix! + + let request_dataset_on_in_service_physical_disk = osagactx + .datastore() + .dataset_physical_disk_in_service(params.request.old_dataset_id.into()) + .await + .map_err(ActionError::action_failed)?; + + if request_dataset_on_in_service_physical_disk { + // If the request region snapshot's dataset has not been expunged, it + // can be used + return Ok(CloneSource::RegionSnapshot { + dataset_id: params.request.old_dataset_id.into(), + region_id: params.request.old_region_id, + }); + } + + // If all targets of a Volume::Region are on expunged datasets, then the + // user's data is gone, and this code will fail to select a clone source. + + return Err(ActionError::action_failed(format!( + "no clone source candidate for {}!", + params.request.old_snapshot_id, + ))); +} + #[derive(Debug, Deserialize, Serialize)] struct AllocRegionParams { block_size: u64, @@ -445,46 +613,67 @@ async fn rsrss_new_region_ensure( "new_dataset_and_region", )?; - let region_snapshot = osagactx - .datastore() - .region_snapshot_get( - params.request.old_dataset_id.into(), - params.request.old_region_id, - params.request.old_snapshot_id, - ) - .await - .map_err(ActionError::action_failed)?; + let clone_source = sagactx.lookup::("clone_source")?; + + let mut source_repair_addr: SocketAddrV6 = match clone_source { + CloneSource::RegionSnapshot { dataset_id, region_id } => { + let region_snapshot = osagactx + .datastore() + .region_snapshot_get( + dataset_id, + region_id, + params.request.old_snapshot_id, + ) + .await + .map_err(ActionError::action_failed)?; - let Some(region_snapshot) = region_snapshot else { - return Err(ActionError::action_failed(format!( - "region snapshot {} {} {} deleted!", - params.request.old_dataset_id, - params.request.old_region_id, - params.request.old_snapshot_id, - ))); - }; + let Some(region_snapshot) = region_snapshot else { + return Err(ActionError::action_failed(format!( + "region snapshot {} {} {} deleted!", + dataset_id, region_id, params.request.old_snapshot_id, + ))); + }; - let (new_dataset, new_region) = new_dataset_and_region; + match region_snapshot.snapshot_addr.parse() { + Ok(addr) => addr, - // Currently, the repair port is set using a fixed offset above the - // downstairs port. Once this goes away, Nexus will require a way to query - // for the repair port! + Err(e) => { + return Err(ActionError::action_failed(format!( + "error parsing region_snapshot.snapshot_addr: {e}" + ))); + } + } + } - let mut source_repair_addr: SocketAddrV6 = - match region_snapshot.snapshot_addr.parse() { - Ok(addr) => addr, + CloneSource::Region { region_id } => { + let maybe_addr = osagactx + .datastore() + .region_addr(region_id) + .await + .map_err(ActionError::action_failed)?; - Err(e) => { - return Err(ActionError::action_failed(format!( - "error parsing region_snapshot.snapshot_addr: {e}" - ))); + match maybe_addr { + Some(addr) => addr, + + None => { + return Err(ActionError::action_failed(format!( + "region clone source {region_id} has no port!" + ))); + } } - }; + } + }; + + // Currently, the repair port is set using a fixed offset above the + // downstairs port. Once this goes away, Nexus will require a way to query + // for the repair port! 
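+    // For illustration: assuming an offset of 4000, a downstairs listening
+    // on port 19000 would be contacted for repair on port 23000.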
source_repair_addr.set_port( source_repair_addr.port() + crucible_common::REPAIR_PORT_OFFSET, ); + let (new_dataset, new_region) = new_dataset_and_region; + let ensured_region = osagactx .nexus() .ensure_region_in_dataset( @@ -945,6 +1134,7 @@ pub(crate) mod test { app::sagas::region_snapshot_replacement_start::*, app::sagas::test_helpers::test_opctx, app::RegionAllocationStrategy, }; + use nexus_db_model::PhysicalDiskPolicy; use nexus_db_model::RegionSnapshotReplacement; use nexus_db_model::RegionSnapshotReplacementState; use nexus_db_model::Volume; @@ -954,9 +1144,11 @@ pub(crate) mod test { use nexus_test_utils::resource_helpers::create_project; use nexus_test_utils::resource_helpers::create_snapshot; use nexus_test_utils::resource_helpers::DiskTest; + use nexus_test_utils::resource_helpers::DiskTestBuilder; use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::views; use nexus_types::identity::Asset; + use omicron_uuid_kinds::GenericUuid; use sled_agent_client::types::VolumeConstructionRequest; type ControlPlaneTestContext = @@ -1517,4 +1709,322 @@ pub(crate) mod test { ) .await; } + + /// Tests that the region snapshot replacement start saga will not choose + /// the request's region snapshot, but instead will choose the other + /// non-expunged one. + #[nexus_test(server = crate::Server)] + async fn test_region_snapshot_replacement_start_prefer_not_self( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create four zpools, each with one dataset. This is required for + // region and region snapshot replacement to have somewhere to move the + // data, and for this test we're doing one expungements. 
+ let sled_id = cptestctx.first_sled(); + + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(sled_id) + .with_zpool_count(4) + .build() + .await; + + // Any volumes sent to the Pantry for reconciliation should return + // active for this test + + cptestctx + .sled_agent + .pantry_server + .as_ref() + .unwrap() + .pantry + .set_auto_activate_volumes() + .await; + + // Create a disk and a snapshot + let client = &cptestctx.external_client; + let _project_id = + create_project(&client, PROJECT_NAME).await.identity.id; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snap").await; + + // Before expunging any physical disk, save some DB models + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let snapshot_allocated_regions = datastore + .get_allocated_regions(db_snapshot.volume_id) + .await + .unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + assert_eq!(snapshot_allocated_regions.len(), 0); + + // Expunge one physical disk + { + let (dataset, _) = &disk_allocated_regions[0]; + + let zpool = disk_test + .zpools() + .find(|x| *x.id.as_untyped_uuid() == dataset.pool_id) + .expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id.into(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + } + + // Request that the second region snapshot be replaced + + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_regions[1].0.id(), // dataset id + disk_allocated_regions[1].1.id(), // region id + snapshot.identity.id, + ) + .await + .unwrap() + .unwrap(); + + let request_id = datastore + .create_region_snapshot_replacement_request( + &opctx, + ®ion_snapshot, + ) + .await + .unwrap(); + + // Manually invoke the region snapshot replacement start saga + + let saga_outputs = nexus + .sagas + .saga_execute::(Params { + serialized_authn: Serialized::for_opctx(&opctx), + + request: datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, request_id, + ) + .await + .unwrap(), + + allocation_strategy: RegionAllocationStrategy::Random { + seed: None, + }, + }) + .await + .unwrap(); + + // The third region snapshot should have been selected as the clone + // source + + let selected_clone_source = saga_outputs + .lookup_node_output::("clone_source") + .unwrap(); + + assert_eq!( + selected_clone_source, + CloneSource::RegionSnapshot { + dataset_id: disk_allocated_regions[2].0.id(), + region_id: disk_allocated_regions[2].1.id(), + }, + ); + + let snapshot_allocated_regions = datastore + .get_allocated_regions(db_snapshot.volume_id) + .await + .unwrap(); + + assert_eq!(snapshot_allocated_regions.len(), 1); + assert!(snapshot_allocated_regions.iter().all(|(_, r)| r.read_only())); + } + + /// Tests that a region snapshot replacement request can select the region + /// snapshot being replaced as a clone source (but only if it is not + /// expunged!) 
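+    ///
+    /// The disks backing the two other region snapshots are expunged first,
+    /// leaving the request's own (non-expunged) region snapshot as the only
+    /// remaining clone source candidate.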
+ #[nexus_test(server = crate::Server)] + async fn test_region_snapshot_replacement_start_hail_mary( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create five zpools, each with one dataset. This is required for + // region and region snapshot replacement to have somewhere to move the + // data, and for this test we're doing two expungements. + let sled_id = cptestctx.first_sled(); + + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(sled_id) + .with_zpool_count(5) + .build() + .await; + + // Any volumes sent to the Pantry for reconciliation should return + // active for this test + + cptestctx + .sled_agent + .pantry_server + .as_ref() + .unwrap() + .pantry + .set_auto_activate_volumes() + .await; + + // Create a disk and a snapshot + let client = &cptestctx.external_client; + let _project_id = + create_project(&client, PROJECT_NAME).await.identity.id; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snap").await; + + // Before expunging any physical disk, save some DB models + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let snapshot_allocated_regions = datastore + .get_allocated_regions(db_snapshot.volume_id) + .await + .unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + assert_eq!(snapshot_allocated_regions.len(), 0); + + // Expunge two physical disks + for i in [0, 1] { + let (dataset, _) = &disk_allocated_regions[i]; + + let zpool = disk_test + .zpools() + .find(|x| *x.id.as_untyped_uuid() == dataset.pool_id) + .expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id.into(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + } + + // Request that the third region snapshot be replaced + + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_regions[2].0.id(), // dataset id + disk_allocated_regions[2].1.id(), // region id + snapshot.identity.id, + ) + .await + .unwrap() + .unwrap(); + + let request_id = datastore + .create_region_snapshot_replacement_request( + &opctx, + ®ion_snapshot, + ) + .await + .unwrap(); + + // Manually invoke the region snapshot replacement start saga + + let saga_outputs = nexus + .sagas + .saga_execute::(Params { + serialized_authn: Serialized::for_opctx(&opctx), + + request: datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, request_id, + ) + .await + .unwrap(), + + allocation_strategy: RegionAllocationStrategy::Random { + seed: None, + }, + }) + .await + .unwrap(); + + // This should have chosen the request's region snapshot as a clone + // source, and replaced it with a read-only region + + let selected_clone_source = saga_outputs + .lookup_node_output::("clone_source") + .unwrap(); + + assert_eq!( + selected_clone_source, + CloneSource::RegionSnapshot { + dataset_id: disk_allocated_regions[2].0.id(), + 
region_id: disk_allocated_regions[2].1.id(), + }, + ); + + let snapshot_allocated_regions = datastore + .get_allocated_regions(db_snapshot.volume_id) + .await + .unwrap(); + + assert_eq!(snapshot_allocated_regions.len(), 1); + assert!(snapshot_allocated_regions.iter().all(|(_, r)| r.read_only())); + } } diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index e84c8a0614..57dc624187 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -881,11 +881,36 @@ async fn test_racing_replacements_for_soft_deleted_disk_volume( .await; // Assert the region snapshot was deleted. - assert!(datastore - .region_snapshot_get(dataset.id(), region.id(), snapshot.identity.id) - .await - .unwrap() - .is_none()); + wait_for_condition( + || { + let dataset_id = dataset.id(); + let region_id = region.id(); + let snapshot_id = snapshot.identity.id; + + async move { + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap(); + + match region_snapshot { + Some(_) => { + // Region snapshot not garbage collected yet + Err(CondCheckError::<()>::NotYet) + } + + None => { + // Region snapshot garbage collected ok + Ok(()) + } + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("region snapshot garbage collected"); // Assert that the disk's volume is still only soft-deleted, because the two // other associated region snapshots still exist. @@ -959,12 +984,19 @@ async fn test_racing_replacements_for_soft_deleted_disk_volume( // The saga transitioned the request ok Ok(()) } else if state == RegionReplacementState::Driving { - // The saga is still running + // The drive saga is still running + Err(CondCheckError::<()>::NotYet) + } else if state == RegionReplacementState::Running { + // The drive saga hasn't started yet Err(CondCheckError::<()>::NotYet) } else if state == RegionReplacementState::Completing { // The saga transitioned the request ok, and it's now being // finished by the region replacement finish saga Ok(()) + } else if state == RegionReplacementState::Complete { + // The saga transitioned the request ok, and it was finished + // by the region replacement finish saga + Ok(()) } else { // Any other state is not expected panic!("unexpected state {state:?}!"); @@ -1707,3 +1739,338 @@ async fn test_delete_volume_region_snapshot_replacement_step( test_harness.assert_no_crucible_resources_leaked().await; } + +/// Tests that replacement can occur until completion +#[nexus_test] +async fn test_replacement_sanity(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Create four zpools, each with one dataset. This is required for region + // and region snapshot replacement to have somewhere to move the data. 
+ let sled_id = cptestctx.first_sled(); + + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(sled_id) + .with_zpool_count(4) + .build() + .await; + + // Create a disk and a snapshot and a disk from that snapshot + let client = &cptestctx.external_client; + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + let snapshot = create_snapshot(&client, PROJECT_NAME, "disk", "snap").await; + let _disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snap", + snapshot.identity.id, + ) + .await; + + // Before expunging the physical disk, save the DB model + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + // Next, expunge a physical disk that contains a region + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let (dataset, _) = &disk_allocated_regions[0]; + + let zpool = disk_test + .zpools() + .find(|x| *x.id.as_untyped_uuid() == dataset.pool_id) + .expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id.into(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + + // Any volumes sent to the Pantry for reconciliation should return active + // for this test + + cptestctx + .sled_agent + .pantry_server + .as_ref() + .unwrap() + .pantry + .set_auto_activate_volumes() + .await; + + // Now, run all replacement tasks to completion + let internal_client = &cptestctx.internal_client; + run_replacement_tasks_to_completion(&internal_client).await; +} + +/// Tests that multiple replacements can occur until completion +#[nexus_test] +async fn test_region_replacement_triple_sanity( + cptestctx: &ControlPlaneTestContext, +) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Create five zpools, each with one dataset. This is required for region + // and region snapshot replacement to have somewhere to move the data, and + // for this test we're doing two expungements. 
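+    // Six zpools are requested below: this test expunges each of the three
+    // original disks in turn, so three in-service pools must remain to hold
+    // the replacement regions and read-only snapshot regions.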
+ let sled_id = cptestctx.first_sled(); + + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(sled_id) + .with_zpool_count(6) + .build() + .await; + + // Any volumes sent to the Pantry for reconciliation should return active + // for this test + + cptestctx + .sled_agent + .pantry_server + .as_ref() + .unwrap() + .pantry + .set_auto_activate_volumes() + .await; + + // Create a disk and a snapshot and a disk from that snapshot + let client = &cptestctx.external_client; + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + let snapshot = create_snapshot(&client, PROJECT_NAME, "disk", "snap").await; + let _disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snap", + snapshot.identity.id, + ) + .await; + + // Before expunging any physical disk, save some DB models + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let internal_client = &cptestctx.internal_client; + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let snapshot_allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + assert_eq!(snapshot_allocated_regions.len(), 0); + + for i in disk_allocated_regions { + let (dataset, _) = &i; + + let zpool = disk_test + .zpools() + .find(|x| *x.id.as_untyped_uuid() == dataset.pool_id) + .expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id.into(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + + // Now, run all replacement tasks to completion + run_replacement_tasks_to_completion(&internal_client).await; + } + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let snapshot_allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + assert!(disk_allocated_regions.iter().all(|(_, r)| !r.read_only())); + + // Assert region snapshots replaced with three read-only regions + assert_eq!(snapshot_allocated_regions.len(), 3); + assert!(snapshot_allocated_regions.iter().all(|(_, r)| r.read_only())); +} + +/// Tests that multiple replacements can occur until completion, after expunging +/// two physical disks before any replacements occur (aka we can lose two +/// physical disks and still recover) +#[nexus_test] +async fn test_region_replacement_triple_sanity_2( + cptestctx: &ControlPlaneTestContext, +) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Create five zpools, each with one dataset. This is required for region + // and region snapshot replacement to have somewhere to move the data, and + // for this test we're doing two expungements. 
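+    // Six zpools are requested below: this test expunges two disks up front
+    // and the third after the first round of replacements, so three
+    // in-service pools must remain at the end.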
+ let sled_id = cptestctx.first_sled(); + + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(sled_id) + .with_zpool_count(6) + .build() + .await; + + // Any volumes sent to the Pantry for reconciliation should return active + // for this test + + cptestctx + .sled_agent + .pantry_server + .as_ref() + .unwrap() + .pantry + .set_auto_activate_volumes() + .await; + + // Create a disk and a snapshot and a disk from that snapshot + let client = &cptestctx.external_client; + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + let snapshot = create_snapshot(&client, PROJECT_NAME, "disk", "snap").await; + let _disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snap", + snapshot.identity.id, + ) + .await; + + // Before expunging any physical disk, save some DB models + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let internal_client = &cptestctx.internal_client; + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let snapshot_allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + assert_eq!(snapshot_allocated_regions.len(), 0); + + // Expunge two physical disks before any replacements occur + for i in [0, 1] { + let (dataset, _) = &disk_allocated_regions[i]; + + let zpool = disk_test + .zpools() + .find(|x| *x.id.as_untyped_uuid() == dataset.pool_id) + .expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id.into(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + } + + // Now, run all replacement tasks to completion + run_replacement_tasks_to_completion(&internal_client).await; + + // Expunge the last physical disk + { + let (dataset, _) = &disk_allocated_regions[2]; + + let zpool = disk_test + .zpools() + .find(|x| *x.id.as_untyped_uuid() == dataset.pool_id) + .expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id.into(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + } + + // Now, run all replacement tasks to completion + run_replacement_tasks_to_completion(&internal_client).await; + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let snapshot_allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + assert!(disk_allocated_regions.iter().all(|(_, r)| !r.read_only())); + + // Assert region snapshots replaced with three read-only regions + assert_eq!(snapshot_allocated_regions.len(), 3); + assert!(snapshot_allocated_regions.iter().all(|(_, r)| r.read_only())); +} diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 8fd648096a..2299ba9db9 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -117,6 +117,8 @@ impl 
CrucibleDataInner { bail!("region creation error!"); } + let read_only = params.source.is_some(); + let region = Region { id: params.id, block_size: params.block_size, @@ -129,8 +131,8 @@ impl CrucibleDataInner { cert_pem: None, key_pem: None, root_pem: None, - source: None, - read_only: params.source.is_some(), + source: params.source, + read_only, }; let old = self.regions.insert(id, region.clone()); @@ -1364,29 +1366,41 @@ pub struct PantryVolume { activate_job: Option, } +pub struct PantryInner { + /// Map Volume UUID to PantryVolume struct + volumes: HashMap, + + jobs: HashSet, + + /// Auto activate volumes attached in the background + auto_activate_volumes: bool, +} + /// Simulated crucible pantry pub struct Pantry { pub id: OmicronZoneUuid, - /// Map Volume UUID to PantryVolume struct - volumes: Mutex>, sled_agent: Arc, - jobs: Mutex>, + inner: Mutex, } impl Pantry { pub fn new(sled_agent: Arc) -> Self { Self { id: OmicronZoneUuid::new_v4(), - volumes: Mutex::new(HashMap::default()), sled_agent, - jobs: Mutex::new(HashSet::default()), + inner: Mutex::new(PantryInner { + volumes: HashMap::default(), + jobs: HashSet::default(), + auto_activate_volumes: false, + }), } } pub async fn status(&self) -> Result { + let inner = self.inner.lock().await; Ok(PantryStatus { - volumes: self.volumes.lock().await.keys().cloned().collect(), - num_job_handles: self.jobs.lock().await.len(), + volumes: inner.volumes.keys().cloned().collect(), + num_job_handles: inner.jobs.len(), }) } @@ -1394,8 +1408,9 @@ impl Pantry { &self, volume_id: String, ) -> Result { - let volumes = self.volumes.lock().await; - match volumes.get(&volume_id) { + let inner = self.inner.lock().await; + + match inner.volumes.get(&volume_id) { Some(entry) => Ok(entry.vcr.clone()), None => Err(HttpError::for_not_found(None, volume_id)), @@ -1407,9 +1422,9 @@ impl Pantry { volume_id: String, volume_construction_request: VolumeConstructionRequest, ) -> Result<()> { - let mut volumes = self.volumes.lock().await; + let mut inner = self.inner.lock().await; - volumes.insert( + inner.volumes.insert( volume_id, PantryVolume { vcr: volume_construction_request, @@ -1425,29 +1440,34 @@ impl Pantry { Ok(()) } + pub async fn set_auto_activate_volumes(&self) { + self.inner.lock().await.auto_activate_volumes = true; + } + pub async fn attach_activate_background( &self, volume_id: String, activate_job_id: String, volume_construction_request: VolumeConstructionRequest, ) -> Result<(), HttpError> { - let mut volumes = self.volumes.lock().await; - let mut jobs = self.jobs.lock().await; + let mut inner = self.inner.lock().await; + + let auto_activate_volumes = inner.auto_activate_volumes; - volumes.insert( + inner.volumes.insert( volume_id, PantryVolume { vcr: volume_construction_request, status: VolumeStatus { - active: false, - seen_active: false, + active: auto_activate_volumes, + seen_active: auto_activate_volumes, num_job_handles: 1, }, activate_job: Some(activate_job_id.clone()), }, ); - jobs.insert(activate_job_id); + inner.jobs.insert(activate_job_id); Ok(()) } @@ -1457,8 +1477,8 @@ impl Pantry { volume_id: String, ) -> Result { let activate_job = { - let volumes = self.volumes.lock().await; - volumes.get(&volume_id).unwrap().activate_job.clone().unwrap() + let inner = self.inner.lock().await; + inner.volumes.get(&volume_id).unwrap().activate_job.clone().unwrap() }; let mut status = self.volume_status(volume_id.clone()).await?; @@ -1475,9 +1495,9 @@ impl Pantry { &self, volume_id: String, ) -> Result { - let volumes = self.volumes.lock().await; 
+ let inner = self.inner.lock().await; - match volumes.get(&volume_id) { + match inner.volumes.get(&volume_id) { Some(pantry_volume) => Ok(pantry_volume.status.clone()), None => Err(HttpError::for_not_found(None, volume_id)), @@ -1489,9 +1509,9 @@ impl Pantry { volume_id: String, status: VolumeStatus, ) -> Result<(), HttpError> { - let mut volumes = self.volumes.lock().await; + let mut inner = self.inner.lock().await; - match volumes.get_mut(&volume_id) { + match inner.volumes.get_mut(&volume_id) { Some(pantry_volume) => { pantry_volume.status = status; Ok(()) @@ -1505,8 +1525,8 @@ impl Pantry { &self, job_id: String, ) -> Result { - let jobs = self.jobs.lock().await; - if !jobs.contains(&job_id) { + let inner = self.inner.lock().await; + if !inner.jobs.contains(&job_id) { return Err(HttpError::for_not_found(None, job_id)); } Ok(true) @@ -1516,11 +1536,11 @@ impl Pantry { &self, job_id: String, ) -> Result, HttpError> { - let mut jobs = self.jobs.lock().await; - if !jobs.contains(&job_id) { + let mut inner = self.inner.lock().await; + if !inner.jobs.contains(&job_id) { return Err(HttpError::for_not_found(None, job_id)); } - jobs.remove(&job_id); + inner.jobs.remove(&job_id); Ok(Ok(true)) } @@ -1533,9 +1553,9 @@ impl Pantry { self.entry(volume_id).await?; // Make up job - let mut jobs = self.jobs.lock().await; + let mut inner = self.inner.lock().await; let job_id = Uuid::new_v4().to_string(); - jobs.insert(job_id.clone()); + inner.jobs.insert(job_id.clone()); Ok(job_id) } @@ -1549,8 +1569,9 @@ impl Pantry { // the simulated instance ensure, then call // [`instance_issue_disk_snapshot_request`] as the snapshot logic is the // same. - let volumes = self.volumes.lock().await; - let volume_construction_request = &volumes.get(&volume_id).unwrap().vcr; + let inner = self.inner.lock().await; + let volume_construction_request = + &inner.volumes.get(&volume_id).unwrap().vcr; self.sled_agent .map_disk_ids_to_region_ids(volume_construction_request) @@ -1630,16 +1651,16 @@ impl Pantry { self.entry(volume_id).await?; // Make up job - let mut jobs = self.jobs.lock().await; + let mut inner = self.inner.lock().await; let job_id = Uuid::new_v4().to_string(); - jobs.insert(job_id.clone()); + inner.jobs.insert(job_id.clone()); Ok(job_id) } pub async fn detach(&self, volume_id: String) -> Result<()> { - let mut volumes = self.volumes.lock().await; - volumes.remove(&volume_id); + let mut inner = self.inner.lock().await; + inner.volumes.remove(&volume_id); Ok(()) } } From 17ee11009365f5896b0bc9d57328177bb4428eba Mon Sep 17 00:00:00 2001 From: Rain Date: Fri, 20 Dec 2024 13:12:31 -0800 Subject: [PATCH 03/11] [1/n] [omicron-package] move config into a separate file (#7285) We're going to add to this config, and I didn't want to cram `omicron-package.rs` with even more information. This is pure code movement with no functional changes. 
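Call sites now reach `Config` through accessor methods (`config.log()`,
`config.target()`, `config.package_config()`, and so on) rather than public
fields, but behavior is unchanged.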
--- package/src/bin/omicron-package.rs | 270 +++++------------------------ package/src/config.rs | 246 ++++++++++++++++++++++++++ package/src/lib.rs | 1 + 3 files changed, 295 insertions(+), 222 deletions(-) create mode 100644 package/src/config.rs diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index 6ea10ae27b..f4bda47e2c 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -11,25 +11,22 @@ use futures::stream::{self, StreamExt, TryStreamExt}; use illumos_utils::{zfs, zone}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use omicron_package::cargo_plan::build_cargo_plan; +use omicron_package::config::{Config, ConfigArgs}; use omicron_package::target::KnownTarget; use omicron_package::{parse, BuildCommand, DeployCommand, TargetCommand}; -use omicron_zone_package::config::{Config as PackageConfig, PackageMap}; +use omicron_zone_package::config::Config as PackageConfig; use omicron_zone_package::package::{Package, PackageOutput, PackageSource}; use omicron_zone_package::progress::Progress; use omicron_zone_package::target::Target; use rayon::prelude::*; use ring::digest::{Context as DigestContext, Digest, SHA256}; use sled_hardware::cleanup::cleanup_networking_resources; -use slog::debug; use slog::o; use slog::Drain; use slog::Logger; use slog::{info, warn}; -use std::collections::BTreeMap; use std::env; use std::fs::create_dir_all; -use std::io::Write; -use std::str::FromStr; use std::sync::{Arc, OnceLock}; use std::time::Duration; use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader}; @@ -44,11 +41,6 @@ enum SubCommand { Deploy(DeployCommand), } -fn parse_duration_ms(arg: &str) -> Result { - let ms = arg.parse()?; - Ok(std::time::Duration::from_millis(ms)) -} - #[derive(Debug, Parser)] #[clap(name = "packaging tool")] struct Args { @@ -64,43 +56,12 @@ struct Args { )] manifest: Utf8PathBuf, - #[clap( - short, - long, - help = "The name of the build target to use for this command", - default_value_t = ACTIVE.to_string(), - )] - target: String, - /// The output directory, where artifacts should be built and staged #[clap(long = "artifacts", default_value = "out/")] - artifact_dir: Utf8PathBuf, - - #[clap( - short, - long, - help = "Skip confirmation prompt for destructive operations", - action, - default_value_t = false - )] - force: bool, - - #[clap( - long, - help = "Number of retries to use when re-attempting failed package downloads", - action, - default_value_t = 10 - )] - retry_count: usize, + pub artifact_dir: Utf8PathBuf, - #[clap( - long, - help = "Duration, in ms, to wait before re-attempting failed package downloads", - action, - value_parser = parse_duration_ms, - default_value = "1000", - )] - retry_duration: std::time::Duration, + #[clap(flatten)] + config_args: ConfigArgs, #[clap(subcommand)] subcommand: SubCommand, @@ -158,7 +119,7 @@ async fn do_for_all_rust_packages( let cargo_plan = build_cargo_plan(&metadata, config.packages_to_build(), &features)?; - cargo_plan.run(command, &config.log).await + cargo_plan.run(command, config.log()).await } async fn do_check(config: &Config) -> Result<()> { @@ -172,7 +133,7 @@ async fn do_build(config: &Config) -> Result<()> { async fn do_dot(config: &Config) -> Result<()> { println!( "{}", - omicron_package::dot::do_dot(&config.target, &config.package_config)? + omicron_package::dot::do_dot(config.target(), config.package_config())? 
); Ok(()) } @@ -194,9 +155,6 @@ async fn do_list_outputs( Ok(()) } -// The name reserved for the currently-in-use build target. -const ACTIVE: &str = "active"; - async fn do_target( artifact_dir: &Utf8Path, name: &str, @@ -234,7 +192,8 @@ async fn do_target( println!("Created new build target '{name}' and set it as active"); } TargetCommand::List => { - let active = tokio::fs::read_link(target_dir.join(ACTIVE)).await?; + let active = + tokio::fs::read_link(target_dir.join(Config::ACTIVE)).await?; let active = Utf8PathBuf::try_from(active)?; for entry in walkdir::WalkDir::new(&target_dir) .max_depth(1) @@ -273,7 +232,7 @@ async fn get_single_target( target_dir: impl AsRef, name: &str, ) -> Result { - if name == ACTIVE { + if name == Config::ACTIVE { bail!( "The name '{name}' is reserved, please try another (e.g. 'default')\n\ Usage: '{} -t target ...'", @@ -290,7 +249,7 @@ async fn replace_active_link( let src = src.as_ref(); let target_dir = target_dir.as_ref(); - let dst = target_dir.join(ACTIVE); + let dst = target_dir.join(Config::ACTIVE); if !target_dir.join(src).exists() { bail!("Target file {} does not exist", src); } @@ -399,7 +358,7 @@ async fn ensure_package( output_directory: &Utf8Path, disable_cache: bool, ) -> Result<()> { - let target = &config.target; + let target = config.target(); let progress = ui.add_package(package_name.to_string()); match &package.source { PackageSource::Prebuilt { repo, commit, sha256 } => { @@ -416,7 +375,7 @@ async fn ensure_package( }; if should_download { - let mut attempts_left = config.retry_count + 1; + let mut attempts_left = config.retry_count() + 1; loop { match download_prebuilt( &progress, @@ -436,7 +395,7 @@ async fn ensure_package( if attempts_left == 0 { return Err(err); } - tokio::time::sleep(config.retry_duration).await; + tokio::time::sleep(config.retry_duration()).await; progress.reset(); } } @@ -487,7 +446,7 @@ async fn do_package( create_dir_all(&output_directory) .map_err(|err| anyhow!("Cannot create output directory: {}", err))?; - let ui = ProgressUI::new(&config.log); + let ui = ProgressUI::new(config.log()); do_build(&config).await?; @@ -528,8 +487,8 @@ async fn do_stamp( ) -> Result<()> { // Find the package which should be stamped let (_name, package) = config - .package_config - .packages_to_deploy(&config.target) + .package_config() + .packages_to_deploy(config.target()) .0 .into_iter() .find(|(name, _pkg)| name.as_str() == package_name) @@ -552,7 +511,8 @@ async fn do_unpack( })?; // Copy all packages to the install location in parallel. - let packages = config.package_config.packages_to_deploy(&config.target).0; + let packages = + config.package_config().packages_to_deploy(&config.target()).0; packages.par_iter().try_for_each( |(package_name, package)| -> Result<()> { @@ -561,7 +521,7 @@ async fn do_unpack( let dst = package.get_output_path(&package.service_name, install_dir); info!( - &config.log, + config.log(), "Installing service"; "src" => %src, "dst" => %dst, @@ -593,7 +553,7 @@ async fn do_unpack( let tar_path = install_dir.join(format!("{}.tar", service_name)); let service_path = install_dir.join(service_name); info!( - &config.log, + config.log(), "Unpacking service tarball"; "tar_path" => %tar_path, "service_path" => %service_path, @@ -613,14 +573,14 @@ fn do_activate(config: &Config, install_dir: &Utf8Path) -> Result<()> { // Install the bootstrap service, which itself extracts and // installs other services. 
if let Some(package) = - config.package_config.packages.get("omicron-sled-agent") + config.package_config().packages.get("omicron-sled-agent") { let manifest_path = install_dir .join(&package.service_name) .join("pkg") .join("manifest.xml"); info!( - config.log, + config.log(), "Installing bootstrap service from {}", manifest_path ); @@ -654,7 +614,7 @@ async fn uninstall_all_omicron_zones() -> Result<()> { fn uninstall_all_omicron_datasets(config: &Config) -> Result<()> { let datasets = match zfs::get_all_omicron_datasets_for_delete() { Err(e) => { - warn!(config.log, "Failed to get omicron datasets: {}", e); + warn!(config.log(), "Failed to get omicron datasets: {}", e); return Err(e); } Ok(datasets) => datasets, @@ -669,7 +629,7 @@ fn uninstall_all_omicron_datasets(config: &Config) -> Result<()> { datasets ))?; for dataset in &datasets { - info!(config.log, "Deleting dataset: {dataset}"); + info!(config.log(), "Deleting dataset: {dataset}"); zfs::Zfs::destroy_dataset(dataset)?; } @@ -679,8 +639,8 @@ fn uninstall_all_omicron_datasets(config: &Config) -> Result<()> { // Attempts to both disable and delete all requested packages. fn uninstall_all_packages(config: &Config) { for (_, package) in config - .package_config - .packages_to_deploy(&config.target) + .package_config() + .packages_to_deploy(config.target()) .0 .into_iter() .filter(|(_, package)| matches!(package.output, PackageOutput::Tarball)) @@ -744,18 +704,18 @@ fn remove_all_except>( } async fn do_deactivate(config: &Config) -> Result<()> { - info!(&config.log, "Removing all Omicron zones"); + info!(config.log(), "Removing all Omicron zones"); uninstall_all_omicron_zones().await?; - info!(config.log, "Uninstalling all packages"); + info!(config.log(), "Uninstalling all packages"); uninstall_all_packages(config); - info!(config.log, "Removing networking resources"); - cleanup_networking_resources(&config.log).await?; + info!(config.log(), "Removing networking resources"); + cleanup_networking_resources(config.log()).await?; Ok(()) } async fn do_uninstall(config: &Config) -> Result<()> { do_deactivate(config).await?; - info!(config.log, "Removing datasets"); + info!(config.log(), "Removing datasets"); uninstall_all_omicron_datasets(config)?; Ok(()) } @@ -766,7 +726,7 @@ async fn do_clean( install_dir: &Utf8Path, ) -> Result<()> { do_uninstall(&config).await?; - info!(config.log, "Removing artifacts from {}", artifact_dir); + info!(config.log(), "Removing artifacts from {}", artifact_dir); const ARTIFACTS_TO_KEEP: &[&str] = &[ "clickhouse", "cockroachdb", @@ -775,10 +735,10 @@ async fn do_clean( "downloads", "softnpu", ]; - remove_all_except(artifact_dir, ARTIFACTS_TO_KEEP, &config.log)?; - info!(config.log, "Removing installed objects in: {}", install_dir); + remove_all_except(artifact_dir, ARTIFACTS_TO_KEEP, config.log())?; + info!(config.log(), "Removing installed objects in: {}", install_dir); const INSTALLED_OBJECTS_TO_KEEP: &[&str] = &["opte"]; - remove_all_except(install_dir, INSTALLED_OBJECTS_TO_KEEP, &config.log)?; + remove_all_except(install_dir, INSTALLED_OBJECTS_TO_KEEP, config.log())?; Ok(()) } @@ -889,115 +849,6 @@ impl Progress for PackageProgress { } } -struct Config { - log: Logger, - // Description of all possible packages. - package_config: PackageConfig, - // Description of the target we're trying to operate on. - target: Target, - // The list of packages the user wants us to build (all, if empty) - only: Vec, - // True if we should skip confirmations for destructive operations. 
- force: bool, - // Number of times to retry failed downloads. - retry_count: usize, - // Duration to wait before retrying failed downloads. - retry_duration: std::time::Duration, -} - -impl Config { - /// Prompts the user for input before proceeding with an operation. - fn confirm(&self, prompt: &str) -> Result<()> { - if self.force { - return Ok(()); - } - - print!("{prompt}\n[yY to confirm] >> "); - let _ = std::io::stdout().flush(); - - let mut input = String::new(); - std::io::stdin().read_line(&mut input)?; - match input.as_str().trim() { - "y" | "Y" => Ok(()), - _ => bail!("Aborting"), - } - } - - /// Returns target packages to be assembled on the builder machine, limited - /// to those specified in `only` (if set). - fn packages_to_build(&self) -> PackageMap<'_> { - let packages = self.package_config.packages_to_build(&self.target); - if self.only.is_empty() { - return packages; - } - - let mut filtered_packages = PackageMap(BTreeMap::new()); - let mut to_walk = PackageMap(BTreeMap::new()); - // add the requested packages to `to_walk` - for package_name in &self.only { - to_walk.0.insert( - package_name, - packages.0.get(package_name).unwrap_or_else(|| { - panic!( - "Explicitly-requested package '{}' does not exist", - package_name - ) - }), - ); - } - // dependencies are listed by output name, so create a lookup table to - // get a package by its output name. - let lookup_by_output = packages - .0 - .iter() - .map(|(name, package)| { - (package.get_output_file(name), (*name, *package)) - }) - .collect::>(); - // packages yet to be walked are added to `to_walk`. pop each entry and - // add its dependencies to `to_walk`, then add the package we finished - // walking to `filtered_packages`. - while let Some((package_name, package)) = to_walk.0.pop_first() { - if let PackageSource::Composite { packages } = &package.source { - for output in packages { - // find the package by output name - let (dep_name, dep_package) = - lookup_by_output.get(output).unwrap_or_else(|| { - panic!( - "Could not find a package which creates '{}'", - output - ) - }); - if dep_name.as_str() == package_name { - panic!("'{}' depends on itself", package_name); - } - // if we've seen this package already, it will be in - // `filtered_packages`. otherwise, add it to `to_walk`. - if !filtered_packages.0.contains_key(dep_name) { - to_walk.0.insert(dep_name, dep_package); - } - } - } - // we're done looking at this package's deps - filtered_packages.0.insert(package_name, package); - } - filtered_packages - } - - /// Return a list of all possible Cargo features that could be requested for - /// the packages being built. - /// - /// Out of these, the features that actually get requested are determined by - /// which features are available for the list of packages being built. 
- fn cargo_features(&self) -> Vec { - self.target - .0 - .iter() - .map(|(name, value)| format!("{name}-{value}")) - .collect::>() - } -} - #[tokio::main] async fn main() -> Result<()> { let args = Args::try_parse()?; @@ -1014,43 +865,13 @@ async fn main() -> Result<()> { let drain = slog_async::Async::new(drain).build().fuse(); let log = Logger::root(drain, o!()); - let target_help_str = || -> String { - format!( - "Try calling: '{} -t default target create' to create a new build target", - env::current_exe().unwrap().display() - ) - }; - let get_config = || -> Result { - let target_path = args.artifact_dir.join("target").join(&args.target); - let raw_target = - std::fs::read_to_string(&target_path).inspect_err(|_| { - eprintln!( - "Failed to read build target: {}\n{}", - target_path, - target_help_str() - ); - })?; - let target: Target = KnownTarget::from_str(&raw_target) - .inspect_err(|_| { - eprintln!( - "Failed to parse {} as target\n{}", - target_path, - target_help_str() - ); - })? - .into(); - debug!(log, "target[{}]: {:?}", args.target, target); - - Ok(Config { - log: log.clone(), + Config::get_config( + &log, package_config, - target, - only: Vec::new(), - force: args.force, - retry_count: args.retry_count, - retry_duration: args.retry_duration, - }) + &args.config_args, + &args.artifact_dir, + ) }; // Use a CWD that is the root of the Omicron repository. @@ -1064,7 +885,12 @@ async fn main() -> Result<()> { match args.subcommand { SubCommand::Build(BuildCommand::Target { subcommand }) => { - do_target(&args.artifact_dir, &args.target, &subcommand).await?; + do_target( + &args.artifact_dir, + &args.config_args.target, + &subcommand, + ) + .await?; } SubCommand::Build(BuildCommand::Dot) => { do_dot(&get_config()?).await?; @@ -1075,7 +901,7 @@ async fn main() -> Result<()> { } SubCommand::Build(BuildCommand::Package { disable_cache, only }) => { let mut config = get_config()?; - config.only = only; + config.set_only(only); do_package(&config, &args.artifact_dir, disable_cache).await?; } SubCommand::Build(BuildCommand::Stamp { package_name, version }) => { diff --git a/package/src/config.rs b/package/src/config.rs new file mode 100644 index 0000000000..f80bd36057 --- /dev/null +++ b/package/src/config.rs @@ -0,0 +1,246 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use anyhow::{bail, Result}; +use camino::Utf8Path; +use clap::Args; +use omicron_zone_package::{ + config::{Config as PackageConfig, PackageMap}, + package::PackageSource, + target::Target, +}; +use slog::{debug, Logger}; +use std::{ + collections::BTreeMap, env, io::Write, str::FromStr, time::Duration, +}; + +use crate::target::KnownTarget; + +#[derive(Debug, Args)] +pub struct ConfigArgs { + /// The name of the build target to use for this command + #[clap( + short, + long, + default_value_t = Config::ACTIVE.to_string(), + )] + pub target: String, + + /// Skip confirmation prompt for destructive operations + #[clap(short, long, action, default_value_t = false)] + pub force: bool, + + /// Number of retries to use when re-attempting failed package downloads + #[clap(long, action, default_value_t = 10)] + pub retry_count: usize, + + /// Duration, in ms, to wait before re-attempting failed package downloads + #[clap( + long, + action, + value_parser = parse_duration_ms, + default_value = "1000", + )] + pub retry_duration: Duration, +} + +fn parse_duration_ms(arg: &str) -> Result { + let ms = arg.parse()?; + Ok(Duration::from_millis(ms)) +} + +#[derive(Debug)] +pub struct Config { + log: Logger, + // Description of all possible packages. + package_config: PackageConfig, + // Description of the target we're trying to operate on. + target: Target, + // The list of packages the user wants us to build (all, if empty) + only: Vec, + // True if we should skip confirmations for destructive operations. + force: bool, + // Number of times to retry failed downloads. + retry_count: usize, + // Duration to wait before retrying failed downloads. + retry_duration: Duration, +} + +impl Config { + /// The name reserved for the currently-in-use build target. + pub const ACTIVE: &str = "active"; + + /// Builds a new configuration. + pub fn get_config( + log: &Logger, + package_config: PackageConfig, + args: &ConfigArgs, + artifact_dir: &Utf8Path, + ) -> Result { + let target_help_str = || -> String { + format!( + "Try calling: '{} -t default target create' to create a new build target", + env::current_exe().unwrap().display() + ) + }; + + let target_path = artifact_dir.join("target").join(&args.target); + let raw_target = + std::fs::read_to_string(&target_path).inspect_err(|_| { + eprintln!( + "Failed to read build target: {}\n{}", + target_path, + target_help_str() + ); + })?; + let target: Target = KnownTarget::from_str(&raw_target) + .inspect_err(|_| { + eprintln!( + "Failed to parse {} as target\n{}", + target_path, + target_help_str() + ); + })? + .into(); + debug!(log, "target[{}]: {:?}", args.target, target); + + Ok(Config { + log: log.clone(), + package_config, + target, + only: Vec::new(), + force: args.force, + retry_count: args.retry_count, + retry_duration: args.retry_duration, + }) + } + + /// Sets the `only` field. + #[inline] + pub fn set_only(&mut self, only: Vec) -> &mut Self { + self.only = only; + self + } + + /// Returns the logger. + #[inline] + pub fn log(&self) -> &Logger { + &self.log + } + + /// Returns the target currently being operated on. + #[inline] + pub fn target(&self) -> &Target { + &self.target + } + + /// Returns the underlying package configuration. + #[inline] + pub fn package_config(&self) -> &PackageConfig { + &self.package_config + } + + /// Returns the retry count. + #[inline] + pub fn retry_count(&self) -> usize { + self.retry_count + } + + /// Returns the retry duration. 
+ #[inline] + pub fn retry_duration(&self) -> Duration { + self.retry_duration + } + + /// Prompts the user for input before proceeding with an operation. + pub fn confirm(&self, prompt: &str) -> Result<()> { + if self.force { + return Ok(()); + } + + print!("{prompt}\n[yY to confirm] >> "); + let _ = std::io::stdout().flush(); + + let mut input = String::new(); + std::io::stdin().read_line(&mut input)?; + match input.as_str().trim() { + "y" | "Y" => Ok(()), + _ => bail!("Aborting"), + } + } + + /// Returns target packages to be assembled on the builder machine, limited + /// to those specified in `only` (if set). + pub fn packages_to_build(&self) -> PackageMap<'_> { + let packages = self.package_config.packages_to_build(&self.target); + if self.only.is_empty() { + return packages; + } + + let mut filtered_packages = PackageMap(BTreeMap::new()); + let mut to_walk = PackageMap(BTreeMap::new()); + // add the requested packages to `to_walk` + for package_name in &self.only { + to_walk.0.insert( + package_name, + packages.0.get(package_name).unwrap_or_else(|| { + panic!( + "Explicitly-requested package '{}' does not exist", + package_name + ) + }), + ); + } + // dependencies are listed by output name, so create a lookup table to + // get a package by its output name. + let lookup_by_output = packages + .0 + .iter() + .map(|(name, package)| { + (package.get_output_file(name), (*name, *package)) + }) + .collect::>(); + // packages yet to be walked are added to `to_walk`. pop each entry and + // add its dependencies to `to_walk`, then add the package we finished + // walking to `filtered_packages`. + while let Some((package_name, package)) = to_walk.0.pop_first() { + if let PackageSource::Composite { packages } = &package.source { + for output in packages { + // find the package by output name + let (dep_name, dep_package) = + lookup_by_output.get(output).unwrap_or_else(|| { + panic!( + "Could not find a package which creates '{}'", + output + ) + }); + if dep_name.as_str() == package_name { + panic!("'{}' depends on itself", package_name); + } + // if we've seen this package already, it will be in + // `filtered_packages`. otherwise, add it to `to_walk`. + if !filtered_packages.0.contains_key(dep_name) { + to_walk.0.insert(dep_name, dep_package); + } + } + } + // we're done looking at this package's deps + filtered_packages.0.insert(package_name, package); + } + filtered_packages + } + + /// Return a list of all possible Cargo features that could be requested for + /// the packages being built. + /// + /// Out of these, the features that actually get requested are determined by + /// which features are available for the list of packages being built. 
+ pub fn cargo_features(&self) -> Vec { + self.target + .0 + .iter() + .map(|(name, value)| format!("{name}-{value}")) + .collect::>() + } +} diff --git a/package/src/lib.rs b/package/src/lib.rs index 9d58f476b2..8ef9a4c951 100644 --- a/package/src/lib.rs +++ b/package/src/lib.rs @@ -6,6 +6,7 @@ use serde::de::DeserializeOwned; use thiserror::Error; pub mod cargo_plan; +pub mod config; pub mod dot; pub mod target; From 9eca6bcc77a92b77275a5dbd264738d32b941972 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 20 Dec 2024 13:37:14 -0800 Subject: [PATCH 04/11] external_endpoints should support more than 200 silos and TLS certificates (#7291) --- nexus/src/app/external_endpoints.rs | 114 +++++++++++++--------------- 1 file changed, 54 insertions(+), 60 deletions(-) diff --git a/nexus/src/app/external_endpoints.rs b/nexus/src/app/external_endpoints.rs index f837edc4fb..b93b692465 100644 --- a/nexus/src/app/external_endpoints.rs +++ b/nexus/src/app/external_endpoints.rs @@ -33,15 +33,17 @@ use anyhow::Context; use nexus_db_model::AuthenticationMode; use nexus_db_model::Certificate; use nexus_db_model::DnsGroup; +use nexus_db_model::DnsZone; +use nexus_db_model::Silo; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::datastore::Discoverability; use nexus_db_queries::db::model::ServiceKind; +use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; use nexus_types::identity::Resource; use nexus_types::silo::silo_dns_name; use nexus_types::silo::DEFAULT_SILO_ID; use omicron_common::api::external::http_pagination::PaginatedBy; -use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::bail_unless; use openssl::pkey::PKey; @@ -488,69 +490,61 @@ pub(crate) async fn read_all_endpoints( datastore: &DataStore, opctx: &OpContext, ) -> Result { - // We will not look for more than this number of external DNS zones, Silos, - // or certificates. We do not expect very many of any of these objects. - const MAX: u32 = 200; - let pagparams_id = DataPageParams { - marker: None, - limit: NonZeroU32::new(MAX).unwrap(), - direction: dropshot::PaginationOrder::Ascending, - }; - let pagbyid = PaginatedBy::Id(pagparams_id); - let pagparams_name = DataPageParams { - marker: None, - limit: NonZeroU32::new(MAX).unwrap(), - direction: dropshot::PaginationOrder::Ascending, - }; - - let silos = - datastore.silos_list(opctx, &pagbyid, Discoverability::All).await?; - let external_dns_zones = datastore - .dns_zones_list(opctx, DnsGroup::External, &pagparams_name) - .await?; + // The batch size here is pretty arbitrary. On the vast majority of + // systems, there will only ever be a handful of any of these objects. Some + // systems are known to have a few dozen silos and a few hundred TLS + // certificates. This code path is not particularly latency-sensitive. Our + // purpose in limiting the batch size is just to avoid unbounded-size + // database transactions. + // + // unwrap(): safe because 200 is non-zero. + let batch_size = NonZeroU32::new(200).unwrap(); + + // Fetch all silos. + let mut silos = Vec::new(); + let mut paginator = Paginator::new(batch_size); + while let Some(p) = paginator.next() { + let batch = datastore + .silos_list( + opctx, + &PaginatedBy::Id(p.current_pagparams()), + Discoverability::All, + ) + .await?; + paginator = p.found_batch(&batch, &|s: &Silo| s.id()); + silos.extend(batch.into_iter()); + } + + // Fetch all external DNS zones. 
We should really only ever have one, but + // we may as well paginate this. + let mut external_dns_zones = Vec::new(); + let mut paginator = Paginator::new(batch_size); + while let Some(p) = paginator.next() { + let batch = datastore + .dns_zones_list(opctx, DnsGroup::External, &p.current_pagparams()) + .await?; + paginator = p.found_batch(&batch, &|z: &DnsZone| z.zone_name.clone()); + external_dns_zones.extend(batch.into_iter()); + } bail_unless!( !external_dns_zones.is_empty(), "expected at least one external DNS zone" ); - let certs = datastore - .certificate_list_for(opctx, Some(ServiceKind::Nexus), &pagbyid, false) - .await?; - - // If we found too many of any of these things, complain as loudly as we - // can. Our results will be wrong. But we still don't want to fail if we - // can avoid it because we want to be able to serve as many endpoints as we - // can. - // TODO-reliability we should prevent people from creating more than this - // maximum number of Silos and certificates. - let max = usize::try_from(MAX).unwrap(); - if silos.len() >= max { - error!( - &opctx.log, - "reading endpoints: expected at most {} silos, but found at \ - least {}. TLS may not work on some Silos' external endpoints.", - MAX, - silos.len(), - ); - } - if external_dns_zones.len() >= max { - error!( - &opctx.log, - "reading endpoints: expected at most {} external DNS zones, but \ - found at least {}. TLS may not work on some Silos' external \ - endpoints.", - MAX, - external_dns_zones.len(), - ); - } - if certs.len() >= max { - error!( - &opctx.log, - "reading endpoints: expected at most {} certificates, but \ - found at least {}. TLS may not work on some Silos' external \ - endpoints.", - MAX, - certs.len(), - ); + + // Fetch all TLS certificates. + let mut certs = Vec::new(); + let mut paginator = Paginator::new(batch_size); + while let Some(p) = paginator.next() { + let batch = datastore + .certificate_list_for( + opctx, + Some(ServiceKind::Nexus), + &PaginatedBy::Id(p.current_pagparams()), + false, + ) + .await?; + paginator = p.found_batch(&batch, &|s: &Certificate| s.id()); + certs.extend(batch); } Ok(ExternalEndpoints::new(silos, certs, external_dns_zones)) From b1978e2529f8d2ce6966e84ec8623b4b297f2877 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 20 Dec 2024 16:47:21 -0500 Subject: [PATCH 05/11] [Reconfigurator] Blippy (#7276) This PR introduces Blippy for linting blueprints (see #6987). It initially only reports `FATAL` errors associated with specific sleds, but hopefully provides enough structure to see how that can expand to include other severities and other components. (At a minimum, there will be some blueprint- or policy-level component for things like "there aren't enough Nexus zones" that aren't associated with any particular sled.) As of this PR, the only user of Blippy is the builder test's `verify_blueprint`, from which I imported most of the checks that it current performs. I made a few of these checks slightly more strict, and from that I had to patch up a handful of tests that were doing weird things (e.g., manually expunging a zone without expunging its datasets) and also found one legitimate planner bug (I'll note in a separate comment below). 
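For a sense of the API this adds: a caller constructs a `Blippy` from a blueprint (which runs all of the blueprint-only checks) and then turns the accumulated notes into a report. The new tests below use roughly this pattern, and the builder test's `verify_blueprint` now goes through the same entry point (condensed sketch; see the tests in nexus/reconfigurator/blippy/src/checks.rs for the real code):

    // Run the blueprint-only checks and collect the notes into a
    // report, sorted by kind.
    let report = Blippy::new(&blueprint)
        .into_report(BlippyReportSortKey::Kind);
    if !report.notes().is_empty() {
        // Every note currently carries Severity::Fatal and is keyed to
        // a specific sled via Kind::Sled { sled_id, kind }.
        eprintln!("{}", report.display());
        panic!("blueprint failed blippy checks");
    }
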
--- Cargo.lock | 14 + Cargo.toml | 3 + common/src/api/external/mod.rs | 2 + common/src/api/internal/shared.rs | 24 +- nexus-sled-agent-shared/src/inventory.rs | 11 +- nexus/reconfigurator/blippy/Cargo.toml | 19 + nexus/reconfigurator/blippy/src/blippy.rs | 428 +++++ nexus/reconfigurator/blippy/src/checks.rs | 1564 +++++++++++++++++ nexus/reconfigurator/blippy/src/lib.rs | 24 + nexus/reconfigurator/blippy/src/report.rs | 85 + nexus/reconfigurator/execution/src/lib.rs | 4 +- nexus/reconfigurator/planning/Cargo.toml | 1 + .../planning/src/blueprint_builder/builder.rs | 217 +-- .../src/blueprint_builder/internal_dns.rs | 19 + nexus/reconfigurator/planning/src/planner.rs | 15 + ...dataset_settings_modified_in_place_1_2.txt | 4 +- .../planner_decommissions_sleds_1_2.txt | 4 +- ...lanner_expunge_clickhouse_clusters_3_4.txt | 4 +- ...lanner_expunge_clickhouse_clusters_5_6.txt | 4 +- .../output/planner_nonprovisionable_1_2.txt | 12 +- .../output/planner_nonprovisionable_2_2a.txt | 12 +- nexus/types/src/deployment.rs | 32 +- .../types/src/deployment/network_resources.rs | 48 +- nexus/types/src/deployment/zone_type.rs | 132 +- 24 files changed, 2447 insertions(+), 235 deletions(-) create mode 100644 nexus/reconfigurator/blippy/Cargo.toml create mode 100644 nexus/reconfigurator/blippy/src/blippy.rs create mode 100644 nexus/reconfigurator/blippy/src/checks.rs create mode 100644 nexus/reconfigurator/blippy/src/lib.rs create mode 100644 nexus/reconfigurator/blippy/src/report.rs diff --git a/Cargo.lock b/Cargo.lock index 05a7082aaa..5048914c74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6035,6 +6035,19 @@ dependencies = [ "uuid", ] +[[package]] +name = "nexus-reconfigurator-blippy" +version = "0.1.0" +dependencies = [ + "nexus-reconfigurator-planning", + "nexus-sled-agent-shared", + "nexus-types", + "omicron-common", + "omicron-test-utils", + "omicron-uuid-kinds", + "omicron-workspace-hack", +] + [[package]] name = "nexus-reconfigurator-execution" version = "0.1.0" @@ -6101,6 +6114,7 @@ dependencies = [ "maplit", "nexus-config", "nexus-inventory", + "nexus-reconfigurator-blippy", "nexus-sled-agent-shared", "nexus-types", "omicron-common", diff --git a/Cargo.toml b/Cargo.toml index 3d8efe3acb..3d29f61cf9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,6 +79,7 @@ members = [ "nexus/macros-common", "nexus/metrics-producer-gc", "nexus/networking", + "nexus/reconfigurator/blippy", "nexus/reconfigurator/execution", "nexus/reconfigurator/planning", "nexus/reconfigurator/preparation", @@ -213,6 +214,7 @@ default-members = [ "nexus/macros-common", "nexus/metrics-producer-gc", "nexus/networking", + "nexus/reconfigurator/blippy", "nexus/reconfigurator/execution", "nexus/reconfigurator/planning", "nexus/reconfigurator/preparation", @@ -468,6 +470,7 @@ nexus-internal-api = { path = "nexus/internal-api" } nexus-macros-common = { path = "nexus/macros-common" } nexus-metrics-producer-gc = { path = "nexus/metrics-producer-gc" } nexus-networking = { path = "nexus/networking" } +nexus-reconfigurator-blippy = { path = "nexus/reconfigurator/blippy" } nexus-reconfigurator-execution = { path = "nexus/reconfigurator/execution" } nexus-reconfigurator-planning = { path = "nexus/reconfigurator/planning" } nexus-reconfigurator-preparation = { path = "nexus/reconfigurator/preparation" } diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 38a9de0564..ab46f9f7f6 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -1930,6 +1930,8 @@ impl JsonSchema for L4PortRange { 
DeserializeFromStr, PartialEq, Eq, + PartialOrd, + Ord, SerializeDisplay, Hash, )] diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs index 94440df2d5..a3f28a258a 100644 --- a/common/src/api/internal/shared.rs +++ b/common/src/api/internal/shared.rs @@ -48,7 +48,16 @@ pub enum NetworkInterfaceKind { /// Information required to construct a virtual network interface #[derive( - Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, + Clone, + Debug, + Deserialize, + Serialize, + JsonSchema, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, )] pub struct NetworkInterface { pub id: Uuid, @@ -68,7 +77,18 @@ pub struct NetworkInterface { /// outbound network connections from guests or services. // Note that `Deserialize` is manually implemented; if you make any changes to // the fields of this structure, you must make them to that implementation too. -#[derive(Debug, Clone, Copy, Serialize, JsonSchema, PartialEq, Eq, Hash)] +#[derive( + Debug, + Clone, + Copy, + Serialize, + JsonSchema, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, +)] pub struct SourceNatConfig { /// The external address provided to the instance or service. pub ip: IpAddr, diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 5fb2d55203..3b9daf583e 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -173,7 +173,16 @@ impl OmicronZoneConfig { /// Describes a persistent ZFS dataset associated with an Omicron zone #[derive( - Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, + Clone, + Debug, + Deserialize, + Serialize, + JsonSchema, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, )] pub struct OmicronZoneDataset { pub pool_name: ZpoolName, diff --git a/nexus/reconfigurator/blippy/Cargo.toml b/nexus/reconfigurator/blippy/Cargo.toml new file mode 100644 index 0000000000..e7f7208871 --- /dev/null +++ b/nexus/reconfigurator/blippy/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "nexus-reconfigurator-blippy" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +nexus-sled-agent-shared.workspace = true +nexus-types.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true + +omicron-workspace-hack.workspace = true + +[dev-dependencies] +nexus-reconfigurator-planning.workspace = true +omicron-test-utils.workspace = true diff --git a/nexus/reconfigurator/blippy/src/blippy.rs b/nexus/reconfigurator/blippy/src/blippy.rs new file mode 100644 index 0000000000..9e9cc84b32 --- /dev/null +++ b/nexus/reconfigurator/blippy/src/blippy.rs @@ -0,0 +1,428 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use crate::checks; +use crate::report::BlippyReport; +use crate::report::BlippyReportSortKey; +use core::fmt; +use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintDatasetConfig; +use nexus_types::deployment::BlueprintZoneConfig; +use nexus_types::inventory::ZpoolName; +use omicron_common::address::DnsSubnet; +use omicron_common::address::Ipv6Subnet; +use omicron_common::address::SLED_PREFIX; +use omicron_common::api::external::MacAddr; +use omicron_common::disk::DatasetKind; +use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::ZpoolUuid; +use std::collections::BTreeSet; +use std::net::IpAddr; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Note { + pub severity: Severity, + pub kind: Kind, +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub enum Severity { + /// Indicator of a serious problem that means the blueprint is invalid. + Fatal, +} + +impl fmt::Display for Severity { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Severity::Fatal => write!(f, "FATAL"), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub enum Kind { + Sled { sled_id: SledUuid, kind: SledKind }, +} + +impl Kind { + pub fn display_component(&self) -> impl fmt::Display + '_ { + enum Component<'a> { + Sled(&'a SledUuid), + } + + impl fmt::Display for Component<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Component::Sled(id) => write!(f, "sled {id}"), + } + } + } + + match self { + Kind::Sled { sled_id, .. } => Component::Sled(sled_id), + } + } + + pub fn display_subkind(&self) -> impl fmt::Display + '_ { + enum Subkind<'a> { + Sled(&'a SledKind), + } + + impl fmt::Display for Subkind<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Subkind::Sled(kind) => write!(f, "{kind}"), + } + } + } + + match self { + Kind::Sled { kind, .. } => Subkind::Sled(kind), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub enum SledKind { + /// Two running zones have the same underlay IP address. + DuplicateUnderlayIp { + zone1: BlueprintZoneConfig, + zone2: BlueprintZoneConfig, + }, + /// A sled has two zones that are not members of the same sled subnet. + SledWithMixedUnderlaySubnets { + zone1: BlueprintZoneConfig, + zone2: BlueprintZoneConfig, + }, + /// Two sleds are using the same sled subnet. + ConflictingSledSubnets { + other_sled: SledUuid, + subnet: Ipv6Subnet, + }, + /// An internal DNS zone has an IP that is not one of the expected rack DNS + /// subnets. + InternalDnsZoneBadSubnet { + zone: BlueprintZoneConfig, + rack_dns_subnets: BTreeSet, + }, + /// Two running zones have the same external IP address. + DuplicateExternalIp { + zone1: BlueprintZoneConfig, + zone2: BlueprintZoneConfig, + ip: IpAddr, + }, + /// Two running zones' NICs have the same IP address. + DuplicateNicIp { + zone1: BlueprintZoneConfig, + zone2: BlueprintZoneConfig, + ip: IpAddr, + }, + /// Two running zones' NICs have the same MAC address. + DuplicateNicMac { + zone1: BlueprintZoneConfig, + zone2: BlueprintZoneConfig, + mac: MacAddr, + }, + /// Two zones with the same durable dataset kind are on the same zpool. + ZoneDurableDatasetCollision { + zone1: BlueprintZoneConfig, + zone2: BlueprintZoneConfig, + zpool: ZpoolName, + }, + /// Two zones with the same filesystem dataset kind are on the same zpool. 
+ ZoneFilesystemDatasetCollision { + zone1: BlueprintZoneConfig, + zone2: BlueprintZoneConfig, + zpool: ZpoolName, + }, + /// One zpool has two datasets of the same kind. + ZpoolWithDuplicateDatasetKinds { + dataset1: BlueprintDatasetConfig, + dataset2: BlueprintDatasetConfig, + zpool: ZpoolUuid, + }, + /// A zpool is missing its Debug dataset. + ZpoolMissingDebugDataset { zpool: ZpoolUuid }, + /// A zpool is missing its Zone Root dataset. + ZpoolMissingZoneRootDataset { zpool: ZpoolUuid }, + /// A zone's filesystem dataset is missing from `blueprint_datasets`. + ZoneMissingFilesystemDataset { zone: BlueprintZoneConfig }, + /// A zone's durable dataset is missing from `blueprint_datasets`. + ZoneMissingDurableDataset { zone: BlueprintZoneConfig }, + /// A zone's durable dataset and transient root dataset are on different + /// zpools. + ZoneWithDatasetsOnDifferentZpools { + zone: BlueprintZoneConfig, + durable_zpool: ZpoolName, + transient_zpool: ZpoolName, + }, + /// A sled is missing entries in `Blueprint::blueprint_datasets`. + /// + /// `why` indicates why we expected this sled to have an entry. + SledMissingDatasets { why: &'static str }, + /// A sled is missing entries in `Blueprint::blueprint_disks`. + /// + /// `why` indicates why we expected this sled to have an entry. + SledMissingDisks { why: &'static str }, + /// A dataset is present but not referenced by any in-service zone or disk. + OrphanedDataset { dataset: BlueprintDatasetConfig }, + /// A dataset claims to be on a zpool that does not exist. + DatasetOnNonexistentZpool { dataset: BlueprintDatasetConfig }, +} + +impl fmt::Display for SledKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + SledKind::DuplicateUnderlayIp { zone1, zone2 } => { + write!( + f, + "duplicate underlay IP {} ({:?} {} and {:?} {})", + zone1.underlay_ip(), + zone1.zone_type.kind(), + zone1.id, + zone2.zone_type.kind(), + zone2.id, + ) + } + SledKind::SledWithMixedUnderlaySubnets { zone1, zone2 } => { + write!( + f, + "zones have underlay IPs on two different sled subnets: \ + {:?} {} ({}) and {:?} {} ({})", + zone1.zone_type.kind(), + zone1.id, + zone1.underlay_ip(), + zone2.zone_type.kind(), + zone2.id, + zone2.underlay_ip(), + ) + } + SledKind::ConflictingSledSubnets { other_sled, subnet } => { + write!( + f, + "duplicate sled subnet {} with sled {other_sled}", + subnet.net() + ) + } + SledKind::InternalDnsZoneBadSubnet { zone, rack_dns_subnets } => { + write!( + f, + "internal DNS zone {} underlay IP {} is not \ + one of the reserved rack DNS subnets ({:?})", + zone.id, + zone.underlay_ip(), + rack_dns_subnets + ) + } + SledKind::DuplicateExternalIp { zone1, zone2, ip } => { + write!( + f, + "duplicate external IP {ip} ({:?} {} and {:?} {})", + zone1.zone_type.kind(), + zone1.id, + zone2.zone_type.kind(), + zone2.id, + ) + } + SledKind::DuplicateNicIp { zone1, zone2, ip } => { + write!( + f, + "duplicate NIC IP {ip} ({:?} {} and {:?} {})", + zone1.zone_type.kind(), + zone1.id, + zone2.zone_type.kind(), + zone2.id, + ) + } + SledKind::DuplicateNicMac { zone1, zone2, mac } => { + write!( + f, + "duplicate NIC MAC {mac} ({:?} {} and {:?} {})", + zone1.zone_type.kind(), + zone1.id, + zone2.zone_type.kind(), + zone2.id, + ) + } + SledKind::ZoneDurableDatasetCollision { zone1, zone2, zpool } => { + write!( + f, + "zpool {zpool} has two zone datasets of the same kind \ + ({:?} {} and {:?} {})", + zone1.zone_type.kind(), + zone1.id, + zone2.zone_type.kind(), + zone2.id, + ) + } + SledKind::ZoneFilesystemDatasetCollision { + 
zone1, + zone2, + zpool, + } => { + write!( + f, + "zpool {zpool} has two zone filesystems of the same kind \ + ({:?} {} and {:?} {})", + zone1.zone_type.kind(), + zone1.id, + zone2.zone_type.kind(), + zone2.id, + ) + } + SledKind::ZpoolWithDuplicateDatasetKinds { + dataset1, + dataset2, + zpool, + } => { + write!( + f, + "two datasets of the same kind on zpool {zpool} \ + ({:?} {} and {:?} {})", + dataset1.kind, dataset1.id, dataset2.kind, dataset2.id, + ) + } + SledKind::ZpoolMissingDebugDataset { zpool } => { + write!(f, "zpool {zpool} is missing its Debug dataset") + } + SledKind::ZpoolMissingZoneRootDataset { zpool } => { + write!(f, "zpool {zpool} is missing its Zone Root dataset") + } + SledKind::ZoneMissingFilesystemDataset { zone } => { + write!( + f, + "in-service zone's filesytem dataset is missing: {:?} {}", + zone.zone_type.kind(), + zone.id, + ) + } + SledKind::ZoneMissingDurableDataset { zone } => { + write!( + f, + "in-service zone's durable dataset is missing: {:?} {}", + zone.zone_type.kind(), + zone.id, + ) + } + SledKind::ZoneWithDatasetsOnDifferentZpools { + zone, + durable_zpool, + transient_zpool, + } => { + write!( + f, + "zone {:?} {} has its durable dataset on \ + zpool {durable_zpool} but its root dataset on \ + zpool {transient_zpool}", + zone.zone_type.kind(), + zone.id, + ) + } + SledKind::SledMissingDatasets { why } => { + write!(f, "missing entry in blueprint_datasets ({why})") + } + SledKind::SledMissingDisks { why } => { + write!(f, "missing entry in blueprint_disks ({why})") + } + SledKind::OrphanedDataset { dataset } => { + let parent = match dataset.kind { + DatasetKind::Cockroach + | DatasetKind::Crucible + | DatasetKind::Clickhouse + | DatasetKind::ClickhouseKeeper + | DatasetKind::ClickhouseServer + | DatasetKind::ExternalDns + | DatasetKind::InternalDns + | DatasetKind::TransientZone { .. 
} => "zone", + DatasetKind::TransientZoneRoot + | DatasetKind::Debug + | DatasetKind::Update => "disk", + }; + write!( + f, + "in-service dataset ({:?} {}) with no associated {parent}", + dataset.kind, dataset.id + ) + } + SledKind::DatasetOnNonexistentZpool { dataset } => { + write!( + f, + "in-service dataset ({:?} {}) on non-existent zpool {}", + dataset.kind, dataset.id, dataset.pool + ) + } + } + } +} + +impl Note { + pub fn display(&self, sort_key: BlippyReportSortKey) -> NoteDisplay<'_> { + NoteDisplay { note: self, sort_key } + } +} + +#[derive(Debug)] +pub struct NoteDisplay<'a> { + note: &'a Note, + sort_key: BlippyReportSortKey, +} + +impl fmt::Display for NoteDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.sort_key { + BlippyReportSortKey::Kind => { + write!( + f, + "{}: {} note: {}", + self.note.kind.display_component(), + self.note.severity, + self.note.kind.display_subkind(), + ) + } + BlippyReportSortKey::Severity => { + write!( + f, + "{} note: {}: {}", + self.note.severity, + self.note.kind.display_component(), + self.note.kind.display_subkind(), + ) + } + } + } +} + +#[derive(Debug)] +pub struct Blippy<'a> { + blueprint: &'a Blueprint, + notes: Vec, +} + +impl<'a> Blippy<'a> { + pub fn new(blueprint: &'a Blueprint) -> Self { + let mut slf = Self { blueprint, notes: Vec::new() }; + checks::perform_all_blueprint_only_checks(&mut slf); + slf + } + + pub fn blueprint(&self) -> &'a Blueprint { + self.blueprint + } + + pub(crate) fn push_sled_note( + &mut self, + sled_id: SledUuid, + severity: Severity, + kind: SledKind, + ) { + self.notes.push(Note { severity, kind: Kind::Sled { sled_id, kind } }); + } + + pub fn into_report( + self, + sort_key: BlippyReportSortKey, + ) -> BlippyReport<'a> { + BlippyReport::new(self.blueprint, self.notes, sort_key) + } +} diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs new file mode 100644 index 0000000000..f5673cb77c --- /dev/null +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -0,0 +1,1564 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use crate::blippy::Blippy; +use crate::blippy::Severity; +use crate::blippy::SledKind; +use nexus_sled_agent_shared::inventory::ZoneKind; +use nexus_types::deployment::BlueprintDatasetConfig; +use nexus_types::deployment::BlueprintDatasetFilter; +use nexus_types::deployment::BlueprintZoneConfig; +use nexus_types::deployment::BlueprintZoneFilter; +use nexus_types::deployment::OmicronZoneExternalIp; +use omicron_common::address::DnsSubnet; +use omicron_common::address::Ipv6Subnet; +use omicron_common::address::SLED_PREFIX; +use omicron_common::disk::DatasetKind; +use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::ZpoolUuid; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::net::Ipv6Addr; + +pub(crate) fn perform_all_blueprint_only_checks(blippy: &mut Blippy<'_>) { + check_underlay_ips(blippy); + check_external_networking(blippy); + check_dataset_zpool_uniqueness(blippy); + check_datasets(blippy); +} + +fn check_underlay_ips(blippy: &mut Blippy<'_>) { + let mut underlay_ips: BTreeMap = + BTreeMap::new(); + let mut inferred_sled_subnets_by_sled: BTreeMap< + SledUuid, + (Ipv6Subnet, &BlueprintZoneConfig), + > = BTreeMap::new(); + let mut inferred_sled_subnets_by_subnet: BTreeMap< + Ipv6Subnet, + SledUuid, + > = BTreeMap::new(); + let mut rack_dns_subnets: BTreeSet = BTreeSet::new(); + + for (sled_id, zone) in blippy + .blueprint() + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + { + let ip = zone.underlay_ip(); + + // There should be no duplicate underlay IPs. + if let Some(previous) = underlay_ips.insert(ip, zone) { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::DuplicateUnderlayIp { + zone1: previous.clone(), + zone2: zone.clone(), + }, + ); + } + + if zone.zone_type.is_internal_dns() { + // Internal DNS zones should have IPs coming from the reserved rack + // DNS subnets. + let subnet = DnsSubnet::from_addr(ip); + if rack_dns_subnets.is_empty() { + // The blueprint doesn't store the rack subnet explicitly, so we + // infer it based on the first internal DNS zone we see. + rack_dns_subnets.extend(subnet.rack_subnet().get_dns_subnets()); + } + if !rack_dns_subnets.contains(&subnet) { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::InternalDnsZoneBadSubnet { + zone: zone.clone(), + rack_dns_subnets: rack_dns_subnets.clone(), + }, + ); + } + } else { + let subnet = Ipv6Subnet::new(ip); + + // Any given subnet should be used by at most one sled. + match inferred_sled_subnets_by_subnet.entry(subnet) { + Entry::Vacant(slot) => { + slot.insert(sled_id); + } + Entry::Occupied(prev) => { + if *prev.get() != sled_id { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::ConflictingSledSubnets { + other_sled: *prev.get(), + subnet, + }, + ); + } + } + } + + // Any given sled should have IPs within at most one subnet. + // + // The blueprint doesn't store sled subnets explicitly, so we can't + // check that each sled is using the subnet it's supposed to. The + // best we can do is check that the sleds are internally consistent. 
+ match inferred_sled_subnets_by_sled.entry(sled_id) { + Entry::Vacant(slot) => { + slot.insert((subnet, zone)); + } + Entry::Occupied(prev) => { + if prev.get().0 != subnet { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::SledWithMixedUnderlaySubnets { + zone1: prev.get().1.clone(), + zone2: zone.clone(), + }, + ); + } + } + } + } + } +} + +fn check_external_networking(blippy: &mut Blippy<'_>) { + let mut used_external_ips = BTreeMap::new(); + let mut used_external_floating_ips = BTreeMap::new(); + let mut used_external_snat_ips = BTreeMap::new(); + + let mut used_nic_ips = BTreeMap::new(); + let mut used_nic_macs = BTreeMap::new(); + + for (sled_id, zone, external_ip, nic) in blippy + .blueprint() + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + .filter_map(|(sled_id, zone)| { + zone.zone_type + .external_networking() + .map(|(external_ip, nic)| (sled_id, zone, external_ip, nic)) + }) + { + // There should be no duplicate external IPs. + if let Some(prev_zone) = used_external_ips.insert(external_ip, zone) { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::DuplicateExternalIp { + zone1: prev_zone.clone(), + zone2: zone.clone(), + ip: external_ip.ip(), + }, + ); + } + + // See the loop below; we build up separate maps to check for + // Floating/SNAT overlap that wouldn't be caught by the exact + // `used_external_ips` map above. + match external_ip { + OmicronZoneExternalIp::Floating(floating) => { + used_external_floating_ips.insert(floating.ip, zone); + } + OmicronZoneExternalIp::Snat(snat) => { + used_external_snat_ips + .insert(snat.snat_cfg.ip, (sled_id, zone)); + } + } + + // There should be no duplicate NIC IPs or MACs. + if let Some(prev_zone) = used_nic_ips.insert(nic.ip, zone) { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::DuplicateNicIp { + zone1: prev_zone.clone(), + zone2: zone.clone(), + ip: nic.ip, + }, + ); + } + if let Some(prev_zone) = used_nic_macs.insert(nic.mac, zone) { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::DuplicateNicMac { + zone1: prev_zone.clone(), + zone2: zone.clone(), + mac: nic.mac, + }, + ); + } + } + + // The loop above noted any exact duplicates; we should also check for any + // SNAT / Floating overlaps. For each SNAT IP, ensure we don't have a + // floating IP at the same address. + for (ip, (sled_id, zone2)) in used_external_snat_ips { + if let Some(&zone1) = used_external_floating_ips.get(&ip) { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::DuplicateExternalIp { + zone1: zone1.clone(), + zone2: zone2.clone(), + ip, + }, + ); + } + } +} + +fn check_dataset_zpool_uniqueness(blippy: &mut Blippy<'_>) { + let mut durable_kinds_by_zpool: BTreeMap> = + BTreeMap::new(); + let mut transient_kinds_by_zpool: BTreeMap< + ZpoolUuid, + BTreeMap, + > = BTreeMap::new(); + + // On any given zpool, we should have at most one zone of any given + // kind. + for (sled_id, zone) in blippy + .blueprint() + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + { + // Check "one kind per zpool" for durable datasets... 
+ if let Some(dataset) = zone.zone_type.durable_dataset() { + let kind = zone.zone_type.kind(); + if let Some(previous) = durable_kinds_by_zpool + .entry(dataset.dataset.pool_name.id()) + .or_default() + .insert(kind, zone) + { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::ZoneDurableDatasetCollision { + zone1: previous.clone(), + zone2: zone.clone(), + zpool: dataset.dataset.pool_name.clone(), + }, + ); + } + } + + // ... and transient datasets. + if let Some(dataset) = zone.filesystem_dataset() { + let kind = zone.zone_type.kind(); + if let Some(previous) = transient_kinds_by_zpool + .entry(dataset.pool().id()) + .or_default() + .insert(kind, zone) + { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::ZoneFilesystemDatasetCollision { + zone1: previous.clone(), + zone2: zone.clone(), + zpool: dataset.into_parts().0, + }, + ); + } + } + + // If a zone has both durable and transient datasets, they should be on + // the same pool. + match (zone.zone_type.durable_zpool(), zone.filesystem_pool.as_ref()) { + (Some(durable), Some(transient)) if durable != transient => { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::ZoneWithDatasetsOnDifferentZpools { + zone: zone.clone(), + durable_zpool: durable.clone(), + transient_zpool: transient.clone(), + }, + ); + } + _ => (), + } + } +} + +type DatasetByKind<'a> = BTreeMap; +type DatasetsByZpool<'a> = BTreeMap>; + +#[derive(Debug)] +struct DatasetsBySled<'a> { + by_sled: BTreeMap>, + noted_sleds_missing_datasets: BTreeSet, +} + +impl<'a> DatasetsBySled<'a> { + fn new(blippy: &mut Blippy<'a>) -> Self { + let mut by_sled = BTreeMap::new(); + + for (&sled_id, config) in &blippy.blueprint().blueprint_datasets { + let by_zpool: &mut BTreeMap<_, _> = + by_sled.entry(sled_id).or_default(); + + for dataset in config.datasets.values() { + let by_kind: &mut BTreeMap<_, _> = + by_zpool.entry(dataset.pool.id()).or_default(); + + match by_kind.entry(dataset.kind.clone()) { + Entry::Vacant(slot) => { + slot.insert(dataset); + } + Entry::Occupied(prev) => { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::ZpoolWithDuplicateDatasetKinds { + dataset1: (*prev.get()).clone(), + dataset2: dataset.clone(), + zpool: dataset.pool.id(), + }, + ); + } + } + } + } + + Self { by_sled, noted_sleds_missing_datasets: BTreeSet::new() } + } + + // Get the datasets for each zpool on a given sled, or add a fatal note to + // `blippy` that the sled is missing an entry in `blueprint_datasets` for + // the specified reason `why`. + fn get_sled_or_note_missing( + &mut self, + blippy: &mut Blippy<'_>, + sled_id: SledUuid, + why: &'static str, + ) -> Option<&DatasetsByZpool<'a>> { + let maybe_datasets = self.by_sled.get(&sled_id); + if maybe_datasets.is_none() + && self.noted_sleds_missing_datasets.insert(sled_id) + { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::SledMissingDatasets { why }, + ); + } + maybe_datasets + } +} + +fn check_datasets(blippy: &mut Blippy<'_>) { + let mut datasets = DatasetsBySled::new(blippy); + + // As we loop through all the datasets we expect to see, mark them down. + // Afterwards, we'll check for any datasets present that we _didn't_ expect + // to see. + let mut expected_datasets = BTreeSet::new(); + + // All disks should have debug and zone root datasets. + // + // TODO-correctness We currently only include in-service disks in the + // blueprint; once we include expunged or decommissioned disks too, we + // should filter here to only in-service. 
+ for (&sled_id, disk_config) in &blippy.blueprint().blueprint_disks { + let Some(sled_datasets) = datasets.get_sled_or_note_missing( + blippy, + sled_id, + "sled has an entry in blueprint_disks", + ) else { + continue; + }; + + for disk in &disk_config.disks { + let sled_datasets = sled_datasets.get(&disk.pool_id); + + match sled_datasets + .and_then(|by_zpool| by_zpool.get(&DatasetKind::Debug)) + { + Some(dataset) => { + expected_datasets.insert(dataset.id); + } + None => { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::ZpoolMissingDebugDataset { + zpool: disk.pool_id, + }, + ); + } + } + + match sled_datasets.and_then(|by_zpool| { + by_zpool.get(&DatasetKind::TransientZoneRoot) + }) { + Some(dataset) => { + expected_datasets.insert(dataset.id); + } + None => { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::ZpoolMissingZoneRootDataset { + zpool: disk.pool_id, + }, + ); + } + } + } + } + + // There should be a dataset for every dataset referenced by a running zone + // (filesystem or durable). + for (sled_id, zone_config) in blippy + .blueprint() + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + { + let Some(sled_datasets) = datasets.get_sled_or_note_missing( + blippy, + sled_id, + "sled has running zones", + ) else { + continue; + }; + + match &zone_config.filesystem_dataset() { + Some(dataset) => { + match sled_datasets + .get(&dataset.pool().id()) + .and_then(|by_zpool| by_zpool.get(dataset.dataset())) + { + Some(dataset) => { + expected_datasets.insert(dataset.id); + } + None => { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::ZoneMissingFilesystemDataset { + zone: zone_config.clone(), + }, + ); + } + } + } + None => { + // TODO-john Add a Severity::BackwardsCompatibility and note the + // missing filesystem pool + } + } + + if let Some(dataset) = zone_config.zone_type.durable_dataset() { + match sled_datasets + .get(&dataset.dataset.pool_name.id()) + .and_then(|by_zpool| by_zpool.get(&dataset.kind)) + { + Some(dataset) => { + expected_datasets.insert(dataset.id); + } + None => { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::ZoneMissingDurableDataset { + zone: zone_config.clone(), + }, + ); + } + } + } + } + + // TODO-correctness We currently only include in-service disks in the + // blueprint; once we include expunged or decommissioned disks too, we + // should filter here to only in-service. + let in_service_sled_zpools = blippy + .blueprint() + .blueprint_disks + .iter() + .map(|(sled_id, disk_config)| { + ( + sled_id, + disk_config + .disks + .iter() + .map(|disk| disk.pool_id) + .collect::>(), + ) + }) + .collect::>(); + let mut noted_sleds_without_disks = BTreeSet::new(); + + // All datasets should be on zpools that have disk records, and all datasets + // should have been referenced by either a zone or a disk above. 
+ for (sled_id, dataset) in blippy + .blueprint() + .all_omicron_datasets(BlueprintDatasetFilter::InService) + { + if !expected_datasets.contains(&dataset.id) { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::OrphanedDataset { dataset: dataset.clone() }, + ); + continue; + } + + let Some(sled_zpools) = in_service_sled_zpools.get(&sled_id) else { + if noted_sleds_without_disks.insert(sled_id) { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::SledMissingDisks { + why: "sled has in-service datasets", + }, + ); + } + continue; + }; + + if !sled_zpools.contains(&dataset.pool.id()) { + blippy.push_sled_note( + sled_id, + Severity::Fatal, + SledKind::DatasetOnNonexistentZpool { + dataset: dataset.clone(), + }, + ); + continue; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::blippy::Kind; + use crate::blippy::Note; + use crate::BlippyReportSortKey; + use nexus_reconfigurator_planning::example::example; + use nexus_reconfigurator_planning::example::ExampleSystemBuilder; + use nexus_types::deployment::blueprint_zone_type; + use nexus_types::deployment::BlueprintZoneType; + use omicron_test_utils::dev::test_setup_log; + + // The tests below all take the example blueprint, mutate in some invalid + // way, and confirm that blippy reports the invalidity. This test confirms + // the unmutated blueprint has no blippy notes. + #[test] + fn test_example_blueprint_is_blippy_clean() { + static TEST_NAME: &str = "test_example_blueprint_is_blippy_clean"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, blueprint) = example(&logctx.log, TEST_NAME); + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + if !report.notes().is_empty() { + eprintln!("{}", report.display()); + panic!("example blueprint should have no blippy notes"); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_duplicate_underlay_ips() { + static TEST_NAME: &str = "test_duplicate_underlay_ips"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + // Copy the underlay IP from one Nexus to another. + let mut nexus_iter = blueprint.blueprint_zones.iter_mut().flat_map( + |(sled_id, zones_config)| { + zones_config.zones.iter_mut().filter_map(move |zone| { + if zone.zone_type.is_nexus() { + Some((*sled_id, zone)) + } else { + None + } + }) + }, + ); + let (nexus0_sled_id, nexus0) = + nexus_iter.next().expect("at least one Nexus zone"); + let (nexus1_sled_id, nexus1) = + nexus_iter.next().expect("at least two Nexus zones"); + assert_ne!(nexus0_sled_id, nexus1_sled_id); + + let dup_ip = nexus0.underlay_ip(); + match &mut nexus1.zone_type { + BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + internal_address, + .. + }) => { + internal_address.set_ip(dup_ip); + } + _ => unreachable!("this is a Nexus zone"), + }; + + // This illegal modification should result in at least three notes: a + // duplicate underlay IP, duplicate sled subnets, and sled1 having mixed + // underlay subnets (the details of which depend on the ordering of + // zones, so we'll sort that out here). 
+ let nexus0 = nexus0.clone(); + let nexus1 = nexus1.clone(); + let (mixed_underlay_zone1, mixed_underlay_zone2) = { + let mut sled1_zones = blueprint + .blueprint_zones + .get(&nexus1_sled_id) + .unwrap() + .zones + .iter(); + let sled1_zone1 = sled1_zones.next().expect("at least one zone"); + let sled1_zone2 = sled1_zones.next().expect("at least two zones"); + if sled1_zone1.id == nexus1.id { + (nexus1.clone(), sled1_zone2.clone()) + } else { + (sled1_zone1.clone(), nexus1.clone()) + } + }; + let expected_notes = [ + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: nexus1_sled_id, + kind: SledKind::DuplicateUnderlayIp { + zone1: nexus0.clone(), + zone2: nexus1.clone(), + }, + }, + }, + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: nexus1_sled_id, + kind: SledKind::SledWithMixedUnderlaySubnets { + zone1: mixed_underlay_zone1, + zone2: mixed_underlay_zone2, + }, + }, + }, + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: nexus1_sled_id, + kind: SledKind::ConflictingSledSubnets { + other_sled: nexus0_sled_id, + subnet: Ipv6Subnet::new(dup_ip), + }, + }, + }, + ]; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_bad_internal_dns_subnet() { + static TEST_NAME: &str = "test_bad_internal_dns_subnet"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + // Change the second internal DNS zone to be from a different rack + // subnet. + let mut internal_dns_iter = blueprint + .blueprint_zones + .iter_mut() + .flat_map(|(sled_id, zones_config)| { + zones_config.zones.iter_mut().filter_map(move |zone| { + if zone.zone_type.is_internal_dns() { + Some((*sled_id, zone)) + } else { + None + } + }) + }); + let (dns0_sled_id, dns0) = + internal_dns_iter.next().expect("at least one internal DNS zone"); + let (dns1_sled_id, dns1) = + internal_dns_iter.next().expect("at least two internal DNS zones"); + assert_ne!(dns0_sled_id, dns1_sled_id); + + let dns0_ip = dns0.underlay_ip(); + let rack_subnet = DnsSubnet::from_addr(dns0_ip).rack_subnet(); + let different_rack_subnet = { + // Flip the high bit of the existing underlay IP to guarantee a + // different rack subnet + let hi_bit = 1_u128 << 127; + let lo_bits = !hi_bit; + let hi_bit_ip = Ipv6Addr::from(hi_bit); + let lo_bits_ip = Ipv6Addr::from(lo_bits); + // Build XOR out of the operations we have... + let flipped_ip = if hi_bit_ip & dns0_ip == hi_bit_ip { + dns0_ip & lo_bits_ip + } else { + dns0_ip | hi_bit_ip + }; + DnsSubnet::from_addr(flipped_ip).rack_subnet() + }; + let different_dns_subnet = different_rack_subnet.get_dns_subnet(0); + + match &mut dns1.zone_type { + BlueprintZoneType::InternalDns( + blueprint_zone_type::InternalDns { + http_address, + dns_address, + .. 
+ }, + ) => { + http_address.set_ip(different_dns_subnet.dns_address()); + dns_address.set_ip(different_dns_subnet.dns_address()); + } + _ => unreachable!("this is an internal DNS zone"), + }; + + let expected_note = Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: dns1_sled_id, + kind: SledKind::InternalDnsZoneBadSubnet { + zone: dns1.clone(), + rack_dns_subnets: rack_subnet + .get_dns_subnets() + .into_iter() + .collect(), + }, + }, + }; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + assert!( + report.notes().contains(&expected_note), + "did not find expected note {expected_note:?}" + ); + + logctx.cleanup_successful(); + } + + #[test] + fn test_duplicate_external_ip() { + static TEST_NAME: &str = "test_duplicate_external_ip"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + // Copy the external IP from one Nexus to another. + let mut nexus_iter = blueprint.blueprint_zones.iter_mut().flat_map( + |(sled_id, zones_config)| { + zones_config.zones.iter_mut().filter_map(move |zone| { + if zone.zone_type.is_nexus() { + Some((*sled_id, zone)) + } else { + None + } + }) + }, + ); + let (nexus0_sled_id, nexus0) = + nexus_iter.next().expect("at least one Nexus zone"); + let (nexus1_sled_id, nexus1) = + nexus_iter.next().expect("at least two Nexus zones"); + assert_ne!(nexus0_sled_id, nexus1_sled_id); + + let dup_ip = match nexus0 + .zone_type + .external_networking() + .expect("Nexus has external networking") + .0 + { + OmicronZoneExternalIp::Floating(ip) => ip, + OmicronZoneExternalIp::Snat(_) => { + unreachable!("Nexus has a floating IP") + } + }; + match &mut nexus1.zone_type { + BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + external_ip, + .. + }) => { + *external_ip = dup_ip; + } + _ => unreachable!("this is a Nexus zone"), + }; + + let expected_notes = [Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: nexus1_sled_id, + kind: SledKind::DuplicateExternalIp { + zone1: nexus0.clone(), + zone2: nexus1.clone(), + ip: dup_ip.ip, + }, + }, + }]; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_duplicate_nic_ip() { + static TEST_NAME: &str = "test_duplicate_nic_ip"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + // Copy the external IP from one Nexus to another. + let mut nexus_iter = blueprint.blueprint_zones.iter_mut().flat_map( + |(sled_id, zones_config)| { + zones_config.zones.iter_mut().filter_map(move |zone| { + if zone.zone_type.is_nexus() { + Some((*sled_id, zone)) + } else { + None + } + }) + }, + ); + let (nexus0_sled_id, nexus0) = + nexus_iter.next().expect("at least one Nexus zone"); + let (nexus1_sled_id, nexus1) = + nexus_iter.next().expect("at least two Nexus zones"); + assert_ne!(nexus0_sled_id, nexus1_sled_id); + + let dup_ip = nexus0 + .zone_type + .external_networking() + .expect("Nexus has external networking") + .1 + .ip; + match &mut nexus1.zone_type { + BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + nic, + .. 
+ }) => { + nic.ip = dup_ip; + } + _ => unreachable!("this is a Nexus zone"), + }; + + let expected_notes = [Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: nexus1_sled_id, + kind: SledKind::DuplicateNicIp { + zone1: nexus0.clone(), + zone2: nexus1.clone(), + ip: dup_ip, + }, + }, + }]; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_duplicate_nic_mac() { + static TEST_NAME: &str = "test_duplicate_nic_mac"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + // Copy the external IP from one Nexus to another. + let mut nexus_iter = blueprint.blueprint_zones.iter_mut().flat_map( + |(sled_id, zones_config)| { + zones_config.zones.iter_mut().filter_map(move |zone| { + if zone.zone_type.is_nexus() { + Some((*sled_id, zone)) + } else { + None + } + }) + }, + ); + let (nexus0_sled_id, nexus0) = + nexus_iter.next().expect("at least one Nexus zone"); + let (nexus1_sled_id, nexus1) = + nexus_iter.next().expect("at least two Nexus zones"); + assert_ne!(nexus0_sled_id, nexus1_sled_id); + + let dup_mac = nexus0 + .zone_type + .external_networking() + .expect("Nexus has external networking") + .1 + .mac; + match &mut nexus1.zone_type { + BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + nic, + .. + }) => { + nic.mac = dup_mac; + } + _ => unreachable!("this is a Nexus zone"), + }; + + let expected_notes = [Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: nexus1_sled_id, + kind: SledKind::DuplicateNicMac { + zone1: nexus0.clone(), + zone2: nexus1.clone(), + mac: dup_mac, + }, + }, + }]; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_durable_dataset_collision() { + static TEST_NAME: &str = "test_durable_dataset_collision"; + let logctx = test_setup_log(TEST_NAME); + let (_, mut blueprint) = + ExampleSystemBuilder::new(&logctx.log, TEST_NAME) + .external_dns_count(2) + .unwrap() + .build(); + + // Copy the durable zpool from one external DNS to another. + let mut dns_iter = blueprint.blueprint_zones.iter_mut().flat_map( + |(sled_id, zones_config)| { + zones_config.zones.iter_mut().filter_map(move |zone| { + if zone.zone_type.is_external_dns() { + Some((*sled_id, zone)) + } else { + None + } + }) + }, + ); + let (dns0_sled_id, dns0) = + dns_iter.next().expect("at least one external DNS zone"); + let (dns1_sled_id, dns1) = + dns_iter.next().expect("at least two external DNS zones"); + assert_ne!(dns0_sled_id, dns1_sled_id); + + let dup_zpool = dns0 + .zone_type + .durable_zpool() + .expect("external DNS has a durable zpool") + .clone(); + match &mut dns1.zone_type { + BlueprintZoneType::ExternalDns( + blueprint_zone_type::ExternalDns { dataset, .. 
}, + ) => { + dataset.pool_name = dup_zpool.clone(); + } + _ => unreachable!("this is an external DNS zone"), + }; + + let expected_notes = [ + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: dns1_sled_id, + kind: SledKind::ZoneDurableDatasetCollision { + zone1: dns0.clone(), + zone2: dns1.clone(), + zpool: dup_zpool.clone(), + }, + }, + }, + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: dns1_sled_id, + kind: SledKind::ZoneWithDatasetsOnDifferentZpools { + zone: dns1.clone(), + durable_zpool: dup_zpool.clone(), + transient_zpool: dns1.filesystem_pool.clone().unwrap(), + }, + }, + }, + ]; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_transient_root_dataset_collision() { + static TEST_NAME: &str = "test_transient_root_dataset_collision"; + let logctx = test_setup_log(TEST_NAME); + let (_, mut blueprint) = + ExampleSystemBuilder::new(&logctx.log, TEST_NAME) + .external_dns_count(2) + .unwrap() + .build(); + + // Copy the filesystem zpool from one external DNS to another. + let mut dns_iter = blueprint.blueprint_zones.iter_mut().flat_map( + |(sled_id, zones_config)| { + zones_config.zones.iter_mut().filter_map(move |zone| { + if zone.zone_type.is_external_dns() { + Some((*sled_id, zone)) + } else { + None + } + }) + }, + ); + let (dns0_sled_id, dns0) = + dns_iter.next().expect("at least one external DNS zone"); + let (dns1_sled_id, dns1) = + dns_iter.next().expect("at least two external DNS zones"); + assert_ne!(dns0_sled_id, dns1_sled_id); + + let dup_zpool = dns0 + .filesystem_pool + .as_ref() + .expect("external DNS has a filesystem zpool") + .clone(); + dns1.filesystem_pool = Some(dup_zpool.clone()); + + let expected_notes = [ + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: dns1_sled_id, + kind: SledKind::ZoneFilesystemDatasetCollision { + zone1: dns0.clone(), + zone2: dns1.clone(), + zpool: dup_zpool.clone(), + }, + }, + }, + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: dns1_sled_id, + kind: SledKind::ZoneWithDatasetsOnDifferentZpools { + zone: dns1.clone(), + durable_zpool: dns1 + .zone_type + .durable_zpool() + .unwrap() + .clone(), + transient_zpool: dup_zpool.clone(), + }, + }, + }, + ]; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_zpool_with_duplicate_dataset_kinds() { + static TEST_NAME: &str = "test_zpool_with_duplicate_dataset_kinds"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + let mut by_kind = BTreeMap::new(); + + // Loop over the datasets until we find a dataset kind that already + // exists on a different zpool, then copy it over. 
+ let mut found_sled_id = None; + let mut dataset1 = None; + let mut dataset2 = None; + let mut zpool = None; + 'outer: for (sled_id, datasets_config) in + blueprint.blueprint_datasets.iter_mut() + { + for dataset in datasets_config.datasets.values_mut() { + if let Some(prev) = + by_kind.insert(dataset.kind.clone(), dataset.clone()) + { + dataset.pool = prev.pool.clone(); + + found_sled_id = Some(*sled_id); + dataset1 = Some(prev); + dataset2 = Some(dataset.clone()); + zpool = Some(dataset.pool.clone()); + break 'outer; + } + } + } + let sled_id = found_sled_id.expect("found dataset to move"); + let dataset1 = dataset1.expect("found dataset to move"); + let dataset2 = dataset2.expect("found dataset to move"); + let zpool = zpool.expect("found dataset to move"); + + let expected_notes = [Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id, + kind: SledKind::ZpoolWithDuplicateDatasetKinds { + dataset1, + dataset2, + zpool: zpool.id(), + }, + }, + }]; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_zpool_missing_default_datasets() { + static TEST_NAME: &str = "test_zpool_missing_default_datasets"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + // Drop the Debug dataset from one zpool and the ZoneRoot dataset from + // another; we should catch both errors. + let (sled_id, datasets_config) = blueprint + .blueprint_datasets + .iter_mut() + .next() + .expect("at least one sled"); + + let mut debug_dataset = None; + let mut zoneroot_dataset = None; + for dataset in &mut datasets_config.datasets.values_mut() { + match &dataset.kind { + DatasetKind::Debug if debug_dataset.is_none() => { + debug_dataset = Some(dataset.clone()); + } + DatasetKind::TransientZoneRoot + if debug_dataset.is_some() + && zoneroot_dataset.is_none() => + { + if Some(&dataset.pool) + != debug_dataset.as_ref().map(|d| &d.pool) + { + zoneroot_dataset = Some(dataset.clone()); + break; + } + } + _ => (), + } + } + let debug_dataset = + debug_dataset.expect("found Debug dataset to prune"); + let zoneroot_dataset = + zoneroot_dataset.expect("found ZoneRoot dataset to prune"); + assert_ne!(debug_dataset.pool, zoneroot_dataset.pool); + + // Actually strip these from the blueprint. 
+ datasets_config.datasets.retain(|&dataset_id, _| { + dataset_id != debug_dataset.id && dataset_id != zoneroot_dataset.id + }); + + let expected_notes = [ + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: *sled_id, + kind: SledKind::ZpoolMissingDebugDataset { + zpool: debug_dataset.pool.id(), + }, + }, + }, + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: *sled_id, + kind: SledKind::ZpoolMissingZoneRootDataset { + zpool: zoneroot_dataset.pool.id(), + }, + }, + }, + ]; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_zone_missing_datasets() { + static TEST_NAME: &str = "test_zone_missing_datasets"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + let (sled_id, datasets_config) = blueprint + .blueprint_datasets + .iter_mut() + .next() + .expect("at least one sled"); + let zones_config = blueprint + .blueprint_zones + .get(sled_id) + .expect("got zones for sled with datasets"); + + // Pick a zone with a durable dataset to remove, and a different zone + // with a filesystem_pool dataset to remove. + let mut durable_zone = None; + let mut root_zone = None; + for z in &zones_config.zones { + if durable_zone.is_none() { + if z.zone_type.durable_zpool().is_some() { + durable_zone = Some(z.clone()); + } + } else if root_zone.is_none() { + root_zone = Some(z); + break; + } + } + let durable_zone = + durable_zone.expect("found zone with durable dataset to prune"); + let root_zone = + root_zone.expect("found zone with root dataset to prune"); + assert_ne!(durable_zone.filesystem_pool, root_zone.filesystem_pool); + + // Actually strip these from the blueprint. + datasets_config.datasets.retain(|_, dataset| { + let matches_durable = (dataset.pool + == *durable_zone.zone_type.durable_zpool().unwrap()) + && (dataset.kind + == durable_zone.zone_type.durable_dataset().unwrap().kind); + let root_dataset = root_zone.filesystem_dataset().unwrap(); + let matches_root = (dataset.pool == *root_dataset.pool()) + && (dataset.kind == *root_dataset.dataset()); + !matches_durable && !matches_root + }); + + let expected_notes = [ + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: *sled_id, + kind: SledKind::ZoneMissingFilesystemDataset { + zone: root_zone.clone(), + }, + }, + }, + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: *sled_id, + kind: SledKind::ZoneMissingDurableDataset { + zone: durable_zone, + }, + }, + }, + ]; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_sled_missing_datasets() { + static TEST_NAME: &str = "test_sled_missing_datasets"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + // Pick one sled and remove its blueprint_datasets entry entirely. 
+ let removed_sled_id = *blueprint + .blueprint_datasets + .keys() + .next() + .expect("at least one sled"); + blueprint + .blueprint_datasets + .retain(|&sled_id, _| sled_id != removed_sled_id); + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + let mut found_sled_missing_note = false; + for note in report.notes() { + if note.severity == Severity::Fatal { + match ¬e.kind { + Kind::Sled { + sled_id, + kind: SledKind::SledMissingDatasets { .. }, + } if *sled_id == removed_sled_id => { + found_sled_missing_note = true; + } + _ => (), + } + } + } + assert!( + found_sled_missing_note, + "did not find expected note for missing datasets entry for \ + sled {removed_sled_id}" + ); + + logctx.cleanup_successful(); + } + + #[test] + fn test_sled_missing_disks() { + static TEST_NAME: &str = "test_sled_missing_disks"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + // Pick one sled and remove its blueprint_disks entry entirely. + let removed_sled_id = *blueprint + .blueprint_disks + .keys() + .next() + .expect("at least one sled"); + blueprint + .blueprint_disks + .retain(|&sled_id, _| sled_id != removed_sled_id); + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + let mut found_sled_missing_note = false; + for note in report.notes() { + if note.severity == Severity::Fatal { + match ¬e.kind { + Kind::Sled { + sled_id, + kind: SledKind::SledMissingDisks { .. }, + } if *sled_id == removed_sled_id => { + found_sled_missing_note = true; + } + _ => (), + } + } + } + assert!( + found_sled_missing_note, + "did not find expected note for missing disks entry for \ + sled {removed_sled_id}" + ); + + logctx.cleanup_successful(); + } + + #[test] + fn test_orphaned_datasets() { + static TEST_NAME: &str = "test_orphaned_datasets"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + // Pick two zones (one with a durable dataset and one with a filesystem + // root dataset), and remove both those zones, which should orphan their + // datasets. + let (sled_id, datasets_config) = blueprint + .blueprint_datasets + .iter_mut() + .next() + .expect("at least one sled"); + let zones_config = blueprint + .blueprint_zones + .get_mut(sled_id) + .expect("got zones for sled with datasets"); + let mut durable_zone = None; + let mut root_zone = None; + for z in &zones_config.zones { + if durable_zone.is_none() { + if z.zone_type.durable_zpool().is_some() { + durable_zone = Some(z.clone()); + } + } else if root_zone.is_none() { + root_zone = Some(z.clone()); + break; + } + } + let durable_zone = + durable_zone.expect("found zone with durable dataset to prune"); + let root_zone = + root_zone.expect("found zone with root dataset to prune"); + zones_config + .zones + .retain(|z| z.id != durable_zone.id && z.id != root_zone.id); + + let durable_dataset = durable_zone.zone_type.durable_dataset().unwrap(); + let root_dataset = root_zone.filesystem_dataset().unwrap(); + + // Find the datasets we expect to have been orphaned. 
+ let expected_notes = datasets_config + .datasets + .values() + .filter_map(|dataset| { + if (dataset.pool == durable_dataset.dataset.pool_name + && dataset.kind == durable_dataset.kind) + || (dataset.pool == *root_dataset.pool() + && dataset.kind == *root_dataset.dataset()) + { + Some(Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: *sled_id, + kind: SledKind::OrphanedDataset { + dataset: dataset.clone(), + }, + }, + }) + } else { + None + } + }) + .collect::>(); + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_dataset_on_nonexistent_zpool() { + static TEST_NAME: &str = "test_dataset_on_nonexistent_zpool"; + let logctx = test_setup_log(TEST_NAME); + let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); + + // Remove one zpool from one sled, then check that all datasets on that + // zpool produce report notes. + let (sled_id, disks_config) = blueprint + .blueprint_disks + .iter_mut() + .next() + .expect("at least one sled"); + let removed_disk = disks_config.disks.remove(0); + eprintln!("removed disk {removed_disk:?}"); + + let expected_notes = blueprint + .blueprint_datasets + .get(sled_id) + .unwrap() + .datasets + .values() + .filter_map(|dataset| { + if dataset.pool.id() != removed_disk.pool_id { + return None; + } + + let note = match dataset.kind { + DatasetKind::Debug | DatasetKind::TransientZoneRoot => { + Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: *sled_id, + kind: SledKind::OrphanedDataset { + dataset: dataset.clone(), + }, + }, + } + } + _ => Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: *sled_id, + kind: SledKind::DatasetOnNonexistentZpool { + dataset: dataset.clone(), + }, + }, + }, + }; + Some(note) + }) + .collect::>(); + assert!(!expected_notes.is_empty()); + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + for note in expected_notes { + assert!( + report.notes().contains(¬e), + "did not find expected note {note:?}" + ); + } + + logctx.cleanup_successful(); + } +} diff --git a/nexus/reconfigurator/blippy/src/lib.rs b/nexus/reconfigurator/blippy/src/lib.rs new file mode 100644 index 0000000000..283bbfc0d0 --- /dev/null +++ b/nexus/reconfigurator/blippy/src/lib.rs @@ -0,0 +1,24 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Blippy: the blueprint checker +//! +//! [`Blippy`] performs a variety of checks on blueprints to ensure they are +//! internally-consistent (e.g., "every in-service zone that should have one or +//! more datasets do", or "any given external IP address is used by at most one +//! in-service zone"). It emits [`BlippyReport`]s in the form of a list of +//! [`BlippyNote`]s, each of which has an associated severity and parent +//! component (typically a sled). 
+ +mod blippy; +mod checks; +mod report; + +pub use blippy::Blippy; +pub use blippy::Kind as BlippyKind; +pub use blippy::Note as BlippyNote; +pub use blippy::Severity as BlippySeverity; +pub use blippy::SledKind as BlippySledKind; +pub use report::BlippyReport; +pub use report::BlippyReportSortKey; diff --git a/nexus/reconfigurator/blippy/src/report.rs b/nexus/reconfigurator/blippy/src/report.rs new file mode 100644 index 0000000000..136d3b7538 --- /dev/null +++ b/nexus/reconfigurator/blippy/src/report.rs @@ -0,0 +1,85 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::blippy::Note; +use core::fmt; +use nexus_types::deployment::Blueprint; + +#[derive(Debug, Clone, Copy)] +pub enum BlippyReportSortKey { + Kind, + Severity, +} + +#[derive(Debug)] +pub struct BlippyReport<'a> { + blueprint: &'a Blueprint, + notes: Vec, + sort_key: BlippyReportSortKey, +} + +impl<'a> BlippyReport<'a> { + pub(crate) fn new( + blueprint: &'a Blueprint, + notes: Vec, + sort_key: BlippyReportSortKey, + ) -> Self { + let mut slf = Self { blueprint, notes, sort_key }; + slf.sort_notes_by_key(sort_key); + slf + } + + pub fn sort_notes_by_key(&mut self, key: BlippyReportSortKey) { + match key { + BlippyReportSortKey::Kind => { + self.notes.sort_unstable_by(|a, b| { + let a = (&a.kind, &a.severity); + let b = (&b.kind, &b.severity); + a.cmp(&b) + }); + } + BlippyReportSortKey::Severity => { + self.notes.sort_unstable_by(|a, b| { + let a = (&a.severity, &a.kind); + let b = (&b.severity, &b.kind); + a.cmp(&b) + }); + } + } + self.sort_key = key; + } + + pub fn blueprint(&self) -> &'a Blueprint { + self.blueprint + } + + pub fn notes(&self) -> &[Note] { + &self.notes + } + + pub fn display(&self) -> BlippyReportDisplay<'_> { + BlippyReportDisplay { report: self } + } +} + +#[derive(Debug)] +pub struct BlippyReportDisplay<'a> { + report: &'a BlippyReport<'a>, +} + +impl fmt::Display for BlippyReportDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let pluralize = if self.report.notes.len() == 1 { "" } else { "s" }; + writeln!( + f, + "blippy report for blueprint {}: {} note{pluralize}", + self.report.blueprint.id, + self.report.notes.len(), + )?; + for note in self.report.notes() { + writeln!(f, " {}", note.display(self.report.sort_key))?; + } + Ok(()) + } +} diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 5ba1665483..543b9bd278 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -400,7 +400,9 @@ fn register_dataset_records_step<'a>( &opctx, datastore, bp_id, - blueprint.all_omicron_datasets(BlueprintDatasetFilter::All), + blueprint + .all_omicron_datasets(BlueprintDatasetFilter::All) + .map(|(_sled_id, dataset)| dataset), ) .await?; diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml index efed173a71..6656693a90 100644 --- a/nexus/reconfigurator/planning/Cargo.toml +++ b/nexus/reconfigurator/planning/Cargo.toml @@ -19,6 +19,7 @@ ipnet.workspace = true itertools.workspace = true nexus-config.workspace = true nexus-inventory.workspace = true +nexus-reconfigurator-blippy.workspace = true nexus-sled-agent-shared.workspace = true nexus-types.workspace = true omicron-common.workspace = true diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs 
b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index ad59fb3718..0aaadb624d 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1834,11 +1834,13 @@ impl<'a> BlueprintBuilder<'a> { let mut skip_zpools = BTreeSet::new(); for zone_config in self .current_sled_zones(sled_id, BlueprintZoneFilter::ShouldBeRunning) + .filter(|z| z.zone_type.kind() == zone_kind) { if let Some(zpool) = zone_config.zone_type.durable_zpool() { - if zone_kind == zone_config.zone_type.kind() { - skip_zpools.insert(zpool); - } + skip_zpools.insert(zpool); + } + if let Some(zpool) = &zone_config.filesystem_pool { + skip_zpools.insert(zpool); } } @@ -1901,211 +1903,34 @@ impl<'a> BlueprintBuilder<'a> { #[cfg(test)] pub mod test { use super::*; - use crate::blueprint_builder::external_networking::ExternalIpAllocator; use crate::example::example; use crate::example::ExampleSystemBuilder; use crate::example::SimRngState; use crate::system::SledBuilder; use nexus_inventory::CollectionBuilder; - use nexus_types::deployment::BlueprintDatasetConfig; + use nexus_reconfigurator_blippy::Blippy; + use nexus_reconfigurator_blippy::BlippyReportSortKey; use nexus_types::deployment::BlueprintDatasetDisposition; - use nexus_types::deployment::BlueprintDatasetFilter; use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::OmicronZoneNetworkResources; use nexus_types::external_api::views::SledPolicy; use omicron_common::address::IpRange; - use omicron_common::disk::DatasetKind; use omicron_test_utils::dev::test_setup_log; - use omicron_uuid_kinds::DatasetUuid; use std::collections::BTreeSet; use std::mem; pub const DEFAULT_N_SLEDS: usize = 3; - fn datasets_for_sled( - blueprint: &Blueprint, - sled_id: SledUuid, - ) -> &BTreeMap { - &blueprint - .blueprint_datasets - .get(&sled_id) - .unwrap_or_else(|| { - panic!("Cannot find datasets on missing sled: {sled_id}") - }) - .datasets - } - - fn find_dataset<'a>( - datasets: &'a BTreeMap, - zpool: &ZpoolName, - kind: DatasetKind, - ) -> &'a BlueprintDatasetConfig { - datasets.values().find(|dataset| { - &dataset.pool == zpool && - dataset.kind == kind - }).unwrap_or_else(|| { - let kinds = datasets.values().map(|d| (&d.id, &d.pool, &d.kind)).collect::>(); - panic!("Cannot find dataset of type {kind}\nFound the following: {kinds:#?}") - }) - } - /// Checks various conditions that should be true for all blueprints #[track_caller] pub fn verify_blueprint(blueprint: &Blueprint) { - // There should be no duplicate underlay IPs. - let mut underlay_ips: BTreeMap = - BTreeMap::new(); - for (_, zone) in - blueprint.all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) - { - if let Some(previous) = - underlay_ips.insert(zone.underlay_ip(), zone) - { - panic!( - "found duplicate underlay IP {} in zones {} and {}\ - \n\n\ - blueprint: {}", - zone.underlay_ip(), - zone.id, - previous.id, - blueprint.display(), - ); - } - } - - // There should be no duplicate external IPs. - // - // Checking this is slightly complicated due to SNAT IPs, so we'll - // delegate to an `ExternalIpAllocator`, which already contains the - // logic for dup checking. (`mark_ip_used` fails if the IP is _already_ - // marked as used.) - // - // We create this with an empty set of service IP pool ranges; those are - // used for allocation, which we don't do, and aren't needed for - // duplicate checking. 
- let mut ip_allocator = ExternalIpAllocator::new(&[]); - for (external_ip, _nic) in blueprint - .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) - .filter_map(|(_, zone)| zone.zone_type.external_networking()) - { - ip_allocator - .mark_ip_used(&external_ip) - .expect("no duplicate external IPs in running zones"); - } - - // On any given zpool, we should have at most one zone of any given - // kind. - // - // TODO: we may want a similar check for non-durable datasets? - let mut kinds_by_zpool: BTreeMap< - ZpoolUuid, - BTreeMap, - > = BTreeMap::new(); - for (_, zone) in blueprint.all_omicron_zones(BlueprintZoneFilter::All) { - if let Some(dataset) = zone.zone_type.durable_dataset() { - let kind = zone.zone_type.kind(); - if let Some(previous) = kinds_by_zpool - .entry(dataset.dataset.pool_name.id()) - .or_default() - .insert(kind, zone.id) - { - panic!( - "zpool {} has two zones of kind {kind:?}: {} and {}\ - \n\n\ - blueprint: {}", - dataset.dataset.pool_name, - zone.id, - previous, - blueprint.display(), - ); - } - } - } - - // All disks should have debug and zone root datasets. - for (sled_id, disk_config) in &blueprint.blueprint_disks { - for disk in &disk_config.disks { - eprintln!( - "checking datasets for sled {sled_id} disk {}", - disk.id - ); - let zpool = ZpoolName::new_external(disk.pool_id); - let datasets = datasets_for_sled(&blueprint, *sled_id); - - let dataset = - find_dataset(&datasets, &zpool, DatasetKind::Debug); - assert_eq!( - dataset.disposition, - BlueprintDatasetDisposition::InService - ); - let dataset = find_dataset( - &datasets, - &zpool, - DatasetKind::TransientZoneRoot, - ); - assert_eq!( - dataset.disposition, - BlueprintDatasetDisposition::InService - ); - } - } - - // All zones should have dataset records. - for (sled_id, zone_config) in - blueprint.all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) - { - match blueprint.sled_state.get(&sled_id) { - // Decommissioned sleds don't keep dataset state around. - // - // Normally we wouldn't observe zones from decommissioned sleds - // anyway, but that's the responsibility of the Planner, not the - // BlueprintBuilder. - None | Some(SledState::Decommissioned) => continue, - Some(SledState::Active) => (), - } - let datasets = datasets_for_sled(&blueprint, sled_id); - - let (zpool, kind) = - zone_config.filesystem_dataset().unwrap().into_parts(); - let dataset = find_dataset(&datasets, &zpool, kind); - assert_eq!( - dataset.disposition, - BlueprintDatasetDisposition::InService - ); - - if let Some(durable_dataset) = - zone_config.zone_type.durable_dataset() - { - let zpool = &durable_dataset.dataset.pool_name; - let dataset = - find_dataset(&datasets, &zpool, durable_dataset.kind); - assert_eq!( - dataset.disposition, - BlueprintDatasetDisposition::InService - ); - } - } - - // All datasets should be on zpools that have disk records. 
- for (sled_id, datasets) in &blueprint.blueprint_datasets { - let sled_disk_zpools = blueprint - .blueprint_disks - .get(&sled_id) - .expect("no disks for sled") - .disks - .iter() - .map(|disk| disk.pool_id) - .collect::>(); - - for dataset in datasets.datasets.values().filter(|dataset| { - dataset.disposition.matches(BlueprintDatasetFilter::InService) - }) { - assert!( - sled_disk_zpools.contains(&dataset.pool.id()), - "sled {sled_id} has dataset {dataset:?}, \ - which references a zpool without an associated disk", - ); - } + let blippy_report = + Blippy::new(blueprint).into_report(BlippyReportSortKey::Kind); + if !blippy_report.notes().is_empty() { + eprintln!("{}", blueprint.display()); + eprintln!("---"); + eprintln!("{}", blippy_report.display()); + panic!("expected blippy report for blueprint to have no notes"); } } @@ -2314,6 +2139,20 @@ pub mod test { *blueprint1.sled_state.get_mut(&decommision_sled_id).unwrap() = SledState::Decommissioned; + // We're going under the hood of the blueprint here; a sled can only get + // to the decommissioned state if all its disks/datasets/zones have been + // expunged, so do that too. + for zone in &mut blueprint1 + .blueprint_zones + .get_mut(&decommision_sled_id) + .expect("has zones") + .zones + { + zone.disposition = BlueprintZoneDisposition::Expunged; + } + blueprint1.blueprint_datasets.remove(&decommision_sled_id); + blueprint1.blueprint_disks.remove(&decommision_sled_id); + // Change the input to note that the sled is expunged, but still active. let mut builder = input.into_builder(); builder.sleds_mut().get_mut(&decommision_sled_id).unwrap().policy = diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/internal_dns.rs b/nexus/reconfigurator/planning/src/blueprint_builder/internal_dns.rs index 56fa39374d..4222481149 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/internal_dns.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/internal_dns.rs @@ -102,6 +102,7 @@ pub mod test { use crate::blueprint_builder::test::verify_blueprint; use crate::example::ExampleSystemBuilder; use nexus_types::deployment::BlueprintZoneFilter; + use omicron_common::disk::DatasetKind; use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; use omicron_test_utils::dev::test_setup_log; @@ -128,6 +129,24 @@ pub mod test { let npruned = blueprint1.blueprint_zones.len() - 1; assert!(npruned > 0); + // Also prune out the zones' datasets, or we're left with an invalid + // blueprint. + for (_, dataset_config) in + blueprint1.blueprint_datasets.iter_mut().skip(1) + { + dataset_config.datasets.retain(|_id, dataset| { + // This is gross; once zone configs know explicit dataset IDs, + // we should retain by ID instead. + match &dataset.kind { + DatasetKind::InternalDns => false, + DatasetKind::TransientZone { name } => { + !name.starts_with("oxz_internal_dns") + } + _ => true, + } + }); + } + verify_blueprint(&blueprint1); // Create an allocator. 
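Since the builder's verify_blueprint above (and the planner tests below) now delegate all structural checking to blippy, the pattern they rely on is just the handful of items exported from the new crate. The following is a minimal sketch of that flow, assuming a blueprint built elsewhere; assert_blippy_clean is a stand-in helper name, not something added by this patch:

    use nexus_reconfigurator_blippy::{Blippy, BlippyReportSortKey};
    use nexus_types::deployment::Blueprint;

    // Run every blippy check against a blueprint and fail loudly if any
    // note is produced, mirroring the rewritten verify_blueprint above.
    fn assert_blippy_clean(blueprint: &Blueprint) {
        let report =
            Blippy::new(blueprint).into_report(BlippyReportSortKey::Kind);
        if !report.notes().is_empty() {
            // Print the full report first so the offending notes show up
            // in the test output before the panic message.
            eprintln!("{}", report.display());
            panic!("expected no blippy notes for this blueprint");
        }
    }

Sorting by Kind groups related problems together in the rendered report; BlippyReportSortKey::Severity is the other exported option when triage order matters more.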
diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index a18bb6e3d3..ce1a1ae960 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -1253,6 +1253,21 @@ mod test { for (_sled_id, zones) in blueprint1.blueprint_zones.iter_mut().take(2) { zones.zones.retain(|z| !z.zone_type.is_internal_dns()); } + for (_, dataset_config) in + blueprint1.blueprint_datasets.iter_mut().take(2) + { + dataset_config.datasets.retain(|_id, dataset| { + // This is gross; once zone configs know explicit dataset IDs, + // we should retain by ID instead. + match &dataset.kind { + DatasetKind::InternalDns => false, + DatasetKind::TransientZone { name } => { + !name.starts_with("oxz_internal_dns") + } + _ => true, + } + }); + } let blueprint2 = Planner::new_based_on( logctx.log.clone(), diff --git a/nexus/reconfigurator/planning/tests/output/planner_dataset_settings_modified_in_place_1_2.txt b/nexus/reconfigurator/planning/tests/output/planner_dataset_settings_modified_in_place_1_2.txt index f2d8334027..45d7feb667 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_dataset_settings_modified_in_place_1_2.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_dataset_settings_modified_in_place_1_2.txt @@ -61,8 +61,8 @@ to: blueprint fe13be30-94c2-4fa6-aad5-ae3c5028f6bb oxp_f843fb62-0f04-4c7d-a56f-62531104dc77/crypt/zone/oxz_crucible_fc4f1769-9611-42d3-b8c1-f2be9b5359f6 35fa6ec8-6b58-4fcc-a5a2-36e66736e9c1 none none off oxp_96569b61-9e0c-4ee7-bd11-a5e0c541ca99/crypt/zone/oxz_crucible_fff71a84-09c2-4dab-bc18-8f4570f278bb 00abfe99-288d-4a63-abea-adfa62e74524 none none off oxp_3b6e2ade-57fc-4f9d-85c3-38fca27f1df6/crypt/zone/oxz_crucible_pantry_197067bc-9a21-444e-9794-6051d9f78a00 19736dbd-1d01-41e9-a800-ffc450464c2d none none off - oxp_3b6e2ade-57fc-4f9d-85c3-38fca27f1df6/crypt/zone/oxz_crucible_pantry_350fba7f-b754-429e-a21d-e91d139713f2 8be4aa2f-1612-4bdf-a0f6-7458b151308f none none off - oxp_3b6e2ade-57fc-4f9d-85c3-38fca27f1df6/crypt/zone/oxz_crucible_pantry_504963cb-3077-477c-b4e5-2d69bf9caa0c 7fd439f9-dcef-4cfb-b1a1-d298be9d2e3b none none off + oxp_5192ef62-5a12-4a0c-829d-a409da87909c/crypt/zone/oxz_crucible_pantry_350fba7f-b754-429e-a21d-e91d139713f2 8be4aa2f-1612-4bdf-a0f6-7458b151308f none none off + oxp_8778bcc5-dddf-4345-9fdf-5c46a36497b0/crypt/zone/oxz_crucible_pantry_504963cb-3077-477c-b4e5-2d69bf9caa0c 7fd439f9-dcef-4cfb-b1a1-d298be9d2e3b none none off oxp_3b6e2ade-57fc-4f9d-85c3-38fca27f1df6/crypt/zone/oxz_internal_dns_1e9422ca-a3d9-4435-bb17-39d5ad22b4ba 5651c4fb-d146-4270-8794-6ed7ceb6f130 none none off oxp_8778bcc5-dddf-4345-9fdf-5c46a36497b0/crypt/zone/oxz_internal_dns_4a0ec9f6-6ce6-4456-831e-5f8df7b57332 d2b9f103-8bf1-4603-873d-cec130430ba7 none none off oxp_5192ef62-5a12-4a0c-829d-a409da87909c/crypt/zone/oxz_internal_dns_efecb8a2-ce0b-416f-958b-de1fad1bef02 158e226c-e44e-427f-93af-ee96d2cfb9be none none off diff --git a/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_1_2.txt b/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_1_2.txt index ef93527fa3..6414749fce 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_1_2.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_1_2.txt @@ -177,8 +177,8 @@ to: blueprint 1ac2d88f-27dd-4506-8585-6b2be832528e oxp_9d833141-18a1-4f24-8a34-6076c026aa87/crypt/debug f631d6a1-9db5-4fc7-978b-9ace485dfe16 100 GiB none gzip-9 
oxp_a279461f-a7b9-413f-a79f-cb4dab4c3fce/crypt/debug 1b9c97d6-c90d-4109-b99c-9ab799b3c3b9 100 GiB none gzip-9 oxp_ff7e002b-3ad8-4d45-b03a-c46ef0ac8e59/crypt/debug 9427caff-29ec-4cd1-981b-26d4a7900052 100 GiB none gzip-9 -+ oxp_1e2ec79e-9c11-4133-ac77-e0b994a507d5/crypt/zone/oxz_crucible_pantry_ff9ce09c-afbf-425b-bbfa-3d8fb254f98e 7d47e5d6-a1a5-451a-b4b4-3a9747f8154a none none off -+ oxp_1e2ec79e-9c11-4133-ac77-e0b994a507d5/crypt/zone/oxz_nexus_845869e9-ecb2-4ec3-b6b8-2a836e459243 a759d2f3-003c-4fb8-b06b-f985e213b273 none none off ++ oxp_440ae69d-5e2e-4539-91d0-e2930bdd7203/crypt/zone/oxz_crucible_pantry_ff9ce09c-afbf-425b-bbfa-3d8fb254f98e 7d47e5d6-a1a5-451a-b4b4-3a9747f8154a none none off ++ oxp_440ae69d-5e2e-4539-91d0-e2930bdd7203/crypt/zone/oxz_nexus_845869e9-ecb2-4ec3-b6b8-2a836e459243 a759d2f3-003c-4fb8-b06b-f985e213b273 none none off omicron zones generation 2 -> 3: diff --git a/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_3_4.txt b/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_3_4.txt index 803d4ea2a1..39b3398b2e 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_3_4.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_3_4.txt @@ -288,8 +288,8 @@ to: blueprint 74f2e7fd-687e-4c9e-b5d8-e474a5bb8e7c oxp_fe379ac6-1938-4cc2-93a9-43b1447229ae/crypt/debug 3a49dd24-8ead-4196-b453-8aa3273b77d1 100 GiB none gzip-9 + oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/clickhouse 410eca9c-8eee-4a98-aea2-a363697974f7 none none off + oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/zone/oxz_clickhouse_fa97835a-aabc-4fe9-9e85-3e50f207129c 08f15d4b-91dc-445d-88f4-cb9fa585444b none none off -+ oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/zone/oxz_crucible_pantry_7741bb11-0d99-4856-95ae-725b6b9ff4fa 4eb52e76-39fa-414d-ae9b-2dcb1c7737f9 none none off -+ oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/zone/oxz_nexus_69789010-8689-43ab-9a68-a944afcba05a e67b797b-a059-4c7e-a98b-fea18964bad6 none none off ++ oxp_2acfbb84-5ce0-424e-8d73-44c5071d4430/crypt/zone/oxz_crucible_pantry_7741bb11-0d99-4856-95ae-725b6b9ff4fa 4eb52e76-39fa-414d-ae9b-2dcb1c7737f9 none none off ++ oxp_2acfbb84-5ce0-424e-8d73-44c5071d4430/crypt/zone/oxz_nexus_69789010-8689-43ab-9a68-a944afcba05a e67b797b-a059-4c7e-a98b-fea18964bad6 none none off omicron zones generation 3 -> 4: diff --git a/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_5_6.txt b/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_5_6.txt index 93dc17c180..2c90c981fb 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_5_6.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_5_6.txt @@ -188,9 +188,9 @@ to: blueprint df68d4d4-5af4-4b56-95bb-1654a6957d4f oxp_427b2ccd-998f-4085-af21-e600604cf21e/crypt/zone/oxz_crucible_befe73dd-5970-49a4-9adf-7b4f453c45cf 95d72ef9-e070-49e4-a57b-2c392def6025 none none off oxp_2fa34d8e-13d9-42d3-b8ba-ca9d74ac496a/crypt/zone/oxz_crucible_d9106a19-f267-48db-a82b-004e643feb49 9b9fb14e-cd17-4a7a-a74a-bfd9c7682831 none none off oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/zone/oxz_crucible_pantry_6c7f6a84-78b3-4dd9-878e-51bedfda471f aa190e01-9a4e-4131-9fcf-240532108c7f none none off - oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/zone/oxz_crucible_pantry_7741bb11-0d99-4856-95ae-725b6b9ff4fa 4eb52e76-39fa-414d-ae9b-2dcb1c7737f9 none none off + 
oxp_2acfbb84-5ce0-424e-8d73-44c5071d4430/crypt/zone/oxz_crucible_pantry_7741bb11-0d99-4856-95ae-725b6b9ff4fa 4eb52e76-39fa-414d-ae9b-2dcb1c7737f9 none none off oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/zone/oxz_internal_dns_0c42ad01-b854-4e7d-bd6c-25fdc3eddef4 1de9cde7-6c1e-4865-bd3d-378e22f62fb8 none none off - oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/zone/oxz_nexus_69789010-8689-43ab-9a68-a944afcba05a e67b797b-a059-4c7e-a98b-fea18964bad6 none none off + oxp_2acfbb84-5ce0-424e-8d73-44c5071d4430/crypt/zone/oxz_nexus_69789010-8689-43ab-9a68-a944afcba05a e67b797b-a059-4c7e-a98b-fea18964bad6 none none off oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/zone/oxz_nexus_7e763480-0f4f-43cb-ab9a-52b667d8fda5 5773e3b1-dde0-4b54-bc13-3c3bf816015e none none off oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/zone/oxz_ntp_f34f8d36-7137-48d3-9d13-6a46c4edcef4 c8c03dec-65d4-4c97-87c3-a43a8363c97c none none off oxp_21d60319-5fe1-4a3b-a4c0-6aa7465e7bde/crypt/debug f015e445-2e52-45c9-9f0a-49cb5ceae245 100 GiB none gzip-9 diff --git a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt index f4f63ad96a..8bb7635d75 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt @@ -353,9 +353,9 @@ to: blueprint 9f71f5d3-a272-4382-9154-6ea2e171a6c6 oxp_cf32a1ce-2c9e-49f5-b1cf-4af7f2a28901/crypt/debug 2dfc5c53-6618-4352-b754-86ef6463c20a 100 GiB none gzip-9 oxp_e405da11-cb6b-4ebc-bac1-9bc997352e10/crypt/debug 61a653cf-44a6-43c0-90e1-bec539511703 100 GiB none gzip-9 oxp_f4d7f914-ec73-4b65-8696-5068591d9065/crypt/debug b803d901-7e43-42fa-8372-43c3c5b3c1a9 100 GiB none gzip-9 -+ oxp_4069c804-c51a-4adc-8822-3cbbab56ed3f/crypt/zone/oxz_nexus_508abd03-cbfe-4654-9a6d-7f15a1ad32e5 b781d032-3149-4c44-a7d3-5f8d80e4a607 none none off -+ oxp_4069c804-c51a-4adc-8822-3cbbab56ed3f/crypt/zone/oxz_nexus_99f6d544-8599-4e2b-a55a-82d9e0034662 8a39677a-fbcf-4884-b000-63be3247fb63 none none off -+ oxp_4069c804-c51a-4adc-8822-3cbbab56ed3f/crypt/zone/oxz_nexus_c26b3bda-5561-44a1-a69f-22103fe209a1 c9c1a582-1fe0-4001-9301-97230387563a none none off ++ oxp_5248a306-4a03-449e-a8a3-6f86d26da755/crypt/zone/oxz_nexus_508abd03-cbfe-4654-9a6d-7f15a1ad32e5 b781d032-3149-4c44-a7d3-5f8d80e4a607 none none off ++ oxp_55196665-ed61-4b23-9a74-0711bf2eaf90/crypt/zone/oxz_nexus_99f6d544-8599-4e2b-a55a-82d9e0034662 8a39677a-fbcf-4884-b000-63be3247fb63 none none off ++ oxp_6b2a719a-35eb-469f-aa54-114a1f21f37d/crypt/zone/oxz_nexus_c26b3bda-5561-44a1-a69f-22103fe209a1 c9c1a582-1fe0-4001-9301-97230387563a none none off omicron zones generation 2 -> 3: @@ -443,9 +443,9 @@ to: blueprint 9f71f5d3-a272-4382-9154-6ea2e171a6c6 oxp_cd62306a-aedf-47e8-93d5-92a358d64c7b/crypt/debug 73674f4b-1d93-404a-bc9c-8395efac97fd 100 GiB none gzip-9 oxp_f1693454-aac1-4265-b8a0-4e9f3f41c7b3/crypt/debug 938737fb-b72f-4727-8833-9697c518ca37 100 GiB none gzip-9 oxp_fe4fdfba-3b6d-47d3-8612-1fb2390b650a/crypt/debug 8e58b91f-9ce2-4256-8dec-5f90f31a73fa 100 GiB none gzip-9 -+ oxp_33d48d85-751e-4982-b738-eae4d9a05f01/crypt/zone/oxz_nexus_2ec75441-3d7d-4b4b-9614-af03de5a3666 cd15e9c9-0238-493a-8b32-926d1cd1bce6 none none off -+ oxp_33d48d85-751e-4982-b738-eae4d9a05f01/crypt/zone/oxz_nexus_3ca5292f-8a59-4475-bb72-0f43714d0fff 871b35e6-d234-4a96-bab4-d07314bc6ba2 none none off -+ 
oxp_33d48d85-751e-4982-b738-eae4d9a05f01/crypt/zone/oxz_nexus_59950bc8-1497-44dd-8cbf-b6502ba921b2 63ec1a21-2c77-41b5-ad3e-e7bf39207107 none none off ++ oxp_39ca2e23-4c38-4743-afe0-26b0380b27db/crypt/zone/oxz_nexus_2ec75441-3d7d-4b4b-9614-af03de5a3666 cd15e9c9-0238-493a-8b32-926d1cd1bce6 none none off ++ oxp_60131a33-1f12-4dbb-9435-bdd368db1f51/crypt/zone/oxz_nexus_3ca5292f-8a59-4475-bb72-0f43714d0fff 871b35e6-d234-4a96-bab4-d07314bc6ba2 none none off ++ oxp_4fbd2fe0-2eac-41b8-8e8d-4fa46c3e8b6c/crypt/zone/oxz_nexus_59950bc8-1497-44dd-8cbf-b6502ba921b2 63ec1a21-2c77-41b5-ad3e-e7bf39207107 none none off omicron zones generation 2 -> 3: diff --git a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt index 72750bc8f0..9fe4b5218b 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt @@ -55,10 +55,10 @@ to: blueprint 9f71f5d3-a272-4382-9154-6ea2e171a6c6 oxp_cf32a1ce-2c9e-49f5-b1cf-4af7f2a28901/crypt/zone/oxz_crucible_c60379ba-4e30-4628-a79a-0ae509aef4c5 a6fcf496-70a1-49bf-a951-62fcec8dd5e2 none none off oxp_5248a306-4a03-449e-a8a3-6f86d26da755/crypt/zone/oxz_crucible_f0ff59e8-4105-4980-a4bb-a1f4c58de1e3 c596346d-4040-4103-b036-8fafdbaada00 none none off oxp_6b2a719a-35eb-469f-aa54-114a1f21f37d/crypt/zone/oxz_crucible_f1a7b9a7-fc6a-4b23-b829-045ff33117ff c864de0d-9859-4ad1-a30b-f5ac45ba03ed none none off - oxp_4069c804-c51a-4adc-8822-3cbbab56ed3f/crypt/zone/oxz_nexus_508abd03-cbfe-4654-9a6d-7f15a1ad32e5 b781d032-3149-4c44-a7d3-5f8d80e4a607 none none off - oxp_4069c804-c51a-4adc-8822-3cbbab56ed3f/crypt/zone/oxz_nexus_99f6d544-8599-4e2b-a55a-82d9e0034662 8a39677a-fbcf-4884-b000-63be3247fb63 none none off + oxp_5248a306-4a03-449e-a8a3-6f86d26da755/crypt/zone/oxz_nexus_508abd03-cbfe-4654-9a6d-7f15a1ad32e5 b781d032-3149-4c44-a7d3-5f8d80e4a607 none none off + oxp_55196665-ed61-4b23-9a74-0711bf2eaf90/crypt/zone/oxz_nexus_99f6d544-8599-4e2b-a55a-82d9e0034662 8a39677a-fbcf-4884-b000-63be3247fb63 none none off oxp_4069c804-c51a-4adc-8822-3cbbab56ed3f/crypt/zone/oxz_nexus_a732c489-d29a-4f75-b900-5966385943af db6c139b-9028-4d8e-92c7-6cc1e9aa0131 none none off - oxp_4069c804-c51a-4adc-8822-3cbbab56ed3f/crypt/zone/oxz_nexus_c26b3bda-5561-44a1-a69f-22103fe209a1 c9c1a582-1fe0-4001-9301-97230387563a none none off + oxp_6b2a719a-35eb-469f-aa54-114a1f21f37d/crypt/zone/oxz_nexus_c26b3bda-5561-44a1-a69f-22103fe209a1 c9c1a582-1fe0-4001-9301-97230387563a none none off oxp_4069c804-c51a-4adc-8822-3cbbab56ed3f/crypt/zone/oxz_ntp_621509d6-3772-4009-aca1-35eefd1098fb 3b5822d2-9918-4bd6-8b75-2f52bdd73189 none none off oxp_4069c804-c51a-4adc-8822-3cbbab56ed3f/crypt/debug bf9b39db-5a6a-4b45-b2da-c37425271014 100 GiB none gzip-9 oxp_5248a306-4a03-449e-a8a3-6f86d26da755/crypt/debug 1b4e8d9e-e447-4df1-8e0b-57edc318e8ad 100 GiB none gzip-9 @@ -145,10 +145,10 @@ to: blueprint 9f71f5d3-a272-4382-9154-6ea2e171a6c6 oxp_cd62306a-aedf-47e8-93d5-92a358d64c7b/crypt/zone/oxz_crucible_be920398-024a-4655-8c49-69b5ac48dfff 87f757d6-fa4c-4423-995c-1eab5e7d09a2 none none off oxp_39ca2e23-4c38-4743-afe0-26b0380b27db/crypt/zone/oxz_crucible_d47f4996-fac0-4657-bcea-01b1fee6404d c1af262a-2595-4236-98c8-21c5b63c80c3 none none off oxp_789d607d-d196-428e-a988-f7886a327859/crypt/zone/oxz_crucible_e001fea0-6594-4ece-97e3-6198c293e931 5e27b9bc-e69f-4258-83f2-5f9a1109a625 none none off - 
oxp_33d48d85-751e-4982-b738-eae4d9a05f01/crypt/zone/oxz_nexus_2ec75441-3d7d-4b4b-9614-af03de5a3666 cd15e9c9-0238-493a-8b32-926d1cd1bce6 none none off - oxp_33d48d85-751e-4982-b738-eae4d9a05f01/crypt/zone/oxz_nexus_3ca5292f-8a59-4475-bb72-0f43714d0fff 871b35e6-d234-4a96-bab4-d07314bc6ba2 none none off + oxp_39ca2e23-4c38-4743-afe0-26b0380b27db/crypt/zone/oxz_nexus_2ec75441-3d7d-4b4b-9614-af03de5a3666 cd15e9c9-0238-493a-8b32-926d1cd1bce6 none none off + oxp_60131a33-1f12-4dbb-9435-bdd368db1f51/crypt/zone/oxz_nexus_3ca5292f-8a59-4475-bb72-0f43714d0fff 871b35e6-d234-4a96-bab4-d07314bc6ba2 none none off oxp_33d48d85-751e-4982-b738-eae4d9a05f01/crypt/zone/oxz_nexus_4ad0e9da-08f8-4d40-b4d3-d17e711b5bbf 45d32c13-cbbb-4382-a0ed-dc6574b827b7 none none off - oxp_33d48d85-751e-4982-b738-eae4d9a05f01/crypt/zone/oxz_nexus_59950bc8-1497-44dd-8cbf-b6502ba921b2 63ec1a21-2c77-41b5-ad3e-e7bf39207107 none none off + oxp_4fbd2fe0-2eac-41b8-8e8d-4fa46c3e8b6c/crypt/zone/oxz_nexus_59950bc8-1497-44dd-8cbf-b6502ba921b2 63ec1a21-2c77-41b5-ad3e-e7bf39207107 none none off oxp_33d48d85-751e-4982-b738-eae4d9a05f01/crypt/zone/oxz_ntp_bf79a56a-97af-4cc4-94a5-8b20d64c2cda a410308c-e2cb-4e4d-9da6-1879336f93f2 none none off oxp_33d48d85-751e-4982-b738-eae4d9a05f01/crypt/debug 755e24a8-67cc-44b1-8c25-2dcb3acd988f 100 GiB none gzip-9 oxp_39ca2e23-4c38-4743-afe0-26b0380b27db/crypt/debug c834f8cd-25ee-4c62-af03-49cef53fc4c1 100 GiB none gzip-9 diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 806df52711..afe906e781 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -244,11 +244,13 @@ impl Blueprint { pub fn all_omicron_datasets( &self, filter: BlueprintDatasetFilter, - ) -> impl Iterator { + ) -> impl Iterator { self.blueprint_datasets .iter() - .flat_map(move |(_, datasets)| datasets.datasets.values()) - .filter(move |d| d.disposition.matches(filter)) + .flat_map(move |(sled_id, datasets)| { + datasets.datasets.values().map(|dataset| (*sled_id, dataset)) + }) + .filter(move |(_, d)| d.disposition.matches(filter)) } /// Iterate over the [`BlueprintZoneConfig`] instances in the blueprint @@ -630,7 +632,17 @@ fn zone_sort_key(z: &T) -> impl Ord { /// Describes one Omicron-managed zone in a blueprint. /// /// Part of [`BlueprintZonesConfig`]. -#[derive(Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +#[derive( + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, +)] pub struct BlueprintZoneConfig { /// The disposition (desired state) of this zone recorded in the blueprint. pub disposition: BlueprintZoneDisposition, @@ -980,7 +992,17 @@ impl BlueprintDatasetDisposition { } /// Information about a dataset as recorded in a blueprint -#[derive(Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +#[derive( + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, +)] pub struct BlueprintDatasetConfig { // TODO: Display this in diffs - leave for now, for backwards compat pub disposition: BlueprintDatasetDisposition, diff --git a/nexus/types/src/deployment/network_resources.rs b/nexus/types/src/deployment/network_resources.rs index cdabbc7fdc..f11d739d03 100644 --- a/nexus/types/src/deployment/network_resources.rs +++ b/nexus/types/src/deployment/network_resources.rs @@ -147,7 +147,18 @@ impl OmicronZoneNetworkResources { } /// External IP variants possible for Omicron-managed zones. 
-#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize, Deserialize)] +#[derive( + Debug, + Clone, + Copy, + Hash, + PartialOrd, + Ord, + PartialEq, + Eq, + Serialize, + Deserialize, +)] pub enum OmicronZoneExternalIp { Floating(OmicronZoneExternalFloatingIp), Snat(OmicronZoneExternalSnatIp), @@ -199,7 +210,17 @@ pub enum OmicronZoneExternalIpKey { /// necessary for blueprint planning, and requires that the zone have a single /// IP. #[derive( - Debug, Clone, Copy, Hash, PartialEq, Eq, JsonSchema, Serialize, Deserialize, + Debug, + Clone, + Copy, + Hash, + PartialOrd, + Ord, + PartialEq, + Eq, + JsonSchema, + Serialize, + Deserialize, )] pub struct OmicronZoneExternalFloatingIp { pub id: ExternalIpUuid, @@ -208,7 +229,16 @@ pub struct OmicronZoneExternalFloatingIp { /// Floating external address with port allocated to an Omicron-managed zone. #[derive( - Debug, Clone, Copy, PartialEq, Eq, JsonSchema, Serialize, Deserialize, + Debug, + Clone, + Copy, + PartialEq, + Eq, + PartialOrd, + Ord, + JsonSchema, + Serialize, + Deserialize, )] pub struct OmicronZoneExternalFloatingAddr { pub id: ExternalIpUuid, @@ -227,7 +257,17 @@ impl OmicronZoneExternalFloatingAddr { /// necessary for blueprint planning, and requires that the zone have a single /// IP. #[derive( - Debug, Clone, Copy, Hash, PartialEq, Eq, JsonSchema, Serialize, Deserialize, + Debug, + Clone, + Copy, + Hash, + PartialOrd, + Ord, + PartialEq, + Eq, + JsonSchema, + Serialize, + Deserialize, )] pub struct OmicronZoneExternalSnatIp { pub id: ExternalIpUuid, diff --git a/nexus/types/src/deployment/zone_type.rs b/nexus/types/src/deployment/zone_type.rs index 86310cfc8f..ffb4bd5a17 100644 --- a/nexus/types/src/deployment/zone_type.rs +++ b/nexus/types/src/deployment/zone_type.rs @@ -21,7 +21,17 @@ use serde::Serialize; use std::net::Ipv6Addr; use std::net::SocketAddrV6; -#[derive(Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +#[derive( + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, +)] #[serde(tag = "type", rename_all = "snake_case")] pub enum BlueprintZoneType { BoundaryNtp(blueprint_zone_type::BoundaryNtp), @@ -335,7 +345,15 @@ pub mod blueprint_zone_type { use std::net::SocketAddrV6; #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct BoundaryNtp { pub address: SocketAddrV6, @@ -349,7 +367,15 @@ pub mod blueprint_zone_type { /// Used in single-node clickhouse setups #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct Clickhouse { pub address: SocketAddrV6, @@ -357,7 +383,15 @@ pub mod blueprint_zone_type { } #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct ClickhouseKeeper { pub address: SocketAddrV6, @@ -366,7 +400,15 @@ pub mod blueprint_zone_type { /// Used in replicated clickhouse setups #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct ClickhouseServer { pub address: SocketAddrV6, @@ -374,7 +416,15 @@ pub mod blueprint_zone_type { } #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, 
Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct CockroachDb { pub address: SocketAddrV6, @@ -382,7 +432,15 @@ pub mod blueprint_zone_type { } #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct Crucible { pub address: SocketAddrV6, @@ -390,14 +448,30 @@ pub mod blueprint_zone_type { } #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct CruciblePantry { pub address: SocketAddrV6, } #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct ExternalDns { pub dataset: OmicronZoneDataset, @@ -410,7 +484,15 @@ pub mod blueprint_zone_type { } #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct InternalDns { pub dataset: OmicronZoneDataset, @@ -430,14 +512,30 @@ pub mod blueprint_zone_type { } #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct InternalNtp { pub address: SocketAddrV6, } #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct Nexus { /// The address at which the internal nexus server is reachable. @@ -453,7 +551,15 @@ pub mod blueprint_zone_type { } #[derive( - Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + Debug, + Clone, + Eq, + PartialEq, + Ord, + PartialOrd, + JsonSchema, + Deserialize, + Serialize, )] pub struct Oximeter { pub address: SocketAddrV6, From 6672f1fb3a42ef4c67a4b6f49e96691e63273aa1 Mon Sep 17 00:00:00 2001 From: Rain Date: Fri, 20 Dec 2024 15:10:17 -0800 Subject: [PATCH 06/11] [2/n] [omicron-package] better error messages around target (#7287) * If no target is specified, don't print a confusing `the name '{name}' is reserved` message. * For `target delete`, if removing the file failed, print the corresponding error message. Depends on #7285. 
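For illustration, the new validation looks roughly like this (a simplified sketch only; the real logic is `get_required_named_target` in the diff below, which additionally resolves the target file path and appends a usage hint built by `target_command_help`):

```rust
// Sketch: the shape of the new target-name validation. `active` is the
// reserved name for the currently selected build target.
fn check_target_name(name: Option<&str>) -> Result<&str, String> {
    match name {
        // The reserved name gets a specific error, as before.
        Some(name) if name == "active" => Err(format!(
            "the name '{name}' is reserved, please try another (e.g. 'default')"
        )),
        Some(name) => Ok(name),
        // A missing `-t`/`--target` is now reported as such instead of
        // falling into the confusing "reserved name" message.
        None => Err(
            "a target name is required for this operation (e.g. 'default')".to_string(),
        ),
    }
}
```

`target delete` also now attaches the offending path to the `remove_file` error via `with_context`, so the printed error says which target file could not be removed.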
--- package/src/bin/omicron-package.rs | 49 +++++++++++++++++++----------- package/src/config.rs | 25 +++++++-------- package/src/target.rs | 8 +++++ 3 files changed, 51 insertions(+), 31 deletions(-) diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index f4bda47e2c..2cb0512169 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -12,7 +12,7 @@ use illumos_utils::{zfs, zone}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use omicron_package::cargo_plan::build_cargo_plan; use omicron_package::config::{Config, ConfigArgs}; -use omicron_package::target::KnownTarget; +use omicron_package::target::{target_command_help, KnownTarget}; use omicron_package::{parse, BuildCommand, DeployCommand, TargetCommand}; use omicron_zone_package::config::Config as PackageConfig; use omicron_zone_package::package::{Package, PackageOutput, PackageSource}; @@ -157,7 +157,7 @@ async fn do_list_outputs( async fn do_target( artifact_dir: &Utf8Path, - name: &str, + name: Option<&str>, subcommand: &TargetCommand, ) -> Result<()> { let target_dir = artifact_dir.join("target"); @@ -180,7 +180,7 @@ async fn do_target( clickhouse_topology.clone(), )?; - let path = get_single_target(&target_dir, name).await?; + let (name, path) = get_required_named_target(&target_dir, name)?; tokio::fs::write(&path, Target::from(target).to_string()) .await .with_context(|| { @@ -215,31 +215,46 @@ async fn do_target( } } TargetCommand::Set => { - let _ = get_single_target(&target_dir, name).await?; + let (name, _) = get_required_named_target(&target_dir, name)?; replace_active_link(&name, &target_dir).await?; println!("Set build target '{name}' as active"); } TargetCommand::Delete => { - let path = get_single_target(&target_dir, name).await?; - tokio::fs::remove_file(&path).await?; + let (name, path) = get_required_named_target(&target_dir, name)?; + tokio::fs::remove_file(&path).await.with_context(|| { + format!("failed to remove target file {}", path) + })?; println!("Removed build target '{name}'"); } }; Ok(()) } -async fn get_single_target( +/// Get the path to a named target as required by the `target` subcommand. +/// +/// This function bans `active` as a target name, as it is reserved for the +/// active target. +fn get_required_named_target( target_dir: impl AsRef, - name: &str, -) -> Result { - if name == Config::ACTIVE { - bail!( - "The name '{name}' is reserved, please try another (e.g. 'default')\n\ - Usage: '{} -t target ...'", - env::current_exe().unwrap().display(), - ); + name: Option<&str>, +) -> Result<(&str, Utf8PathBuf)> { + match name { + Some(name) if name == Config::ACTIVE => { + bail!( + "the name '{name}' is reserved, please try another (e.g. 'default')\n\ + Usage: {} ...", + target_command_help(""), + ); + } + Some(name) => Ok((name, target_dir.as_ref().join(name))), + None => { + bail!( + "a target name is required for this operation (e.g. 
'default')\n\ + Usage: {} ...", + target_command_help(""), + ); + } } - Ok(target_dir.as_ref().join(name)) } async fn replace_active_link( @@ -887,7 +902,7 @@ async fn main() -> Result<()> { SubCommand::Build(BuildCommand::Target { subcommand }) => { do_target( &args.artifact_dir, - &args.config_args.target, + args.config_args.target.as_deref(), &subcommand, ) .await?; diff --git a/package/src/config.rs b/package/src/config.rs index f80bd36057..800af7a0de 100644 --- a/package/src/config.rs +++ b/package/src/config.rs @@ -11,21 +11,15 @@ use omicron_zone_package::{ target::Target, }; use slog::{debug, Logger}; -use std::{ - collections::BTreeMap, env, io::Write, str::FromStr, time::Duration, -}; +use std::{collections::BTreeMap, io::Write, str::FromStr, time::Duration}; -use crate::target::KnownTarget; +use crate::target::{target_command_help, KnownTarget}; #[derive(Debug, Args)] pub struct ConfigArgs { /// The name of the build target to use for this command - #[clap( - short, - long, - default_value_t = Config::ACTIVE.to_string(), - )] - pub target: String, + #[clap(short, long)] + pub target: Option, /// Skip confirmation prompt for destructive operations #[clap(short, long, action, default_value_t = false)] @@ -78,14 +72,17 @@ impl Config { args: &ConfigArgs, artifact_dir: &Utf8Path, ) -> Result { + // Within this path, the target is expected to be set. + let target = args.target.as_deref().unwrap_or(Self::ACTIVE); + let target_help_str = || -> String { format!( - "Try calling: '{} -t default target create' to create a new build target", - env::current_exe().unwrap().display() + "Try calling: '{} target create' to create a new build target", + target_command_help("default"), ) }; - let target_path = artifact_dir.join("target").join(&args.target); + let target_path = artifact_dir.join("target").join(target); let raw_target = std::fs::read_to_string(&target_path).inspect_err(|_| { eprintln!( @@ -103,7 +100,7 @@ impl Config { ); })? .into(); - debug!(log, "target[{}]: {:?}", args.target, target); + debug!(log, "target[{}]: {:?}", target, target); Ok(Config { log: log.clone(), diff --git a/package/src/target.rs b/package/src/target.rs index 6a6cbd32d8..d56f7e87c5 100644 --- a/package/src/target.rs +++ b/package/src/target.rs @@ -200,3 +200,11 @@ impl std::str::FromStr for KnownTarget { ) } } + +/// Generate a command to build a target, for use in usage strings. +pub fn target_command_help(target_name: &str) -> String { + format!( + "{} -t {target_name} target", + std::env::current_exe().unwrap().display(), + ) +} From c1281a9b98a576395d886630167c0dac22ef4024 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 20 Dec 2024 15:57:23 -0800 Subject: [PATCH 07/11] ingest new Propolis VM creation API (#7211) Update Omicron to use the new Propolis VM creation API defined in oxidecomputer/propolis#813 and oxidecomputer/propolis#816. This API requires clients to pass instance specs to create new VMs and component replacement lists to migrate existing VMs. Construct these in sled agent for now; in the future this logic can move to Nexus and become part of a robust virtual platform abstraction. For now the goal is just to keep everything working for existing VMs while adapting to the new Propolis API. Slightly adjust the sled agent instance APIs so that Nexus specifies disks and boot orderings using sled-agent-defined types and not re-exported Propolis types. 
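As a rough sketch of the sled-agent-defined shapes referenced above (field names follow the `nexus/src/app/instance.rs` diff below; concrete types are approximated here and this is illustrative, not the generated client definitions):

```rust
use uuid::Uuid;

// Approximate shape of the per-disk request Nexus now assembles. The volume
// construction request travels as opaque JSON; sled agent forwards it to
// Propolis when it builds the instance spec.
struct InstanceDisk {
    disk_id: Uuid,
    name: String,
    slot: u8,
    read_only: bool,
    vcr_json: String,
}

// Boot order is now expressed as disk IDs rather than disk names.
struct InstanceBootSettings {
    order: Vec<Uuid>,
}

// If the instance record has a boot disk, it becomes a one-element boot
// order; otherwise no boot settings are sent.
fn boot_settings_for(boot_disk_id: Option<Uuid>) -> Option<InstanceBootSettings> {
    boot_disk_id.map(|id| InstanceBootSettings { order: vec![id] })
}
```

With boot order keyed by ID, Nexus no longer re-checks that the boot disk is attached; as the new comment in the diff notes, Propolis catches a bad boot-order entry when it processes the received VM configuration.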
Finally, adapt Nexus to the fact that Crucible's `VolumeConstructionRequest` and `CrucibleOpts` types no longer appear in Propolis's generated client (and so don't appear in sled agent's client either). Instead, the `propolis-client` crate re-exports these types directly from its `crucible-client-types` dependency. For the most part, this involves updating `use` directives and storing `SocketAddr`s in their natively typed form instead of converting them to and from strings. Tests: cargo nextest, plus ad hoc testing in a dev cluster as described in the PR comments. --- Cargo.lock | 22 +- Cargo.toml | 8 +- clients/sled-agent-client/Cargo.toml | 1 + clients/sled-agent-client/src/lib.rs | 2 + dev-tools/ls-apis/tests/api_dependencies.out | 4 +- dev-tools/omdb/src/bin/omdb/db.rs | 2 +- .../src/db/datastore/region_replacement.rs | 2 +- .../datastore/region_snapshot_replacement.rs | 2 +- nexus/db-queries/src/db/datastore/volume.rs | 278 ++++++------ .../src/db/datastore/volume_repair.rs | 2 +- .../background/tasks/region_replacement.rs | 4 +- .../tasks/region_replacement_driver.rs | 2 +- .../region_snapshot_replacement_finish.rs | 2 +- ...on_snapshot_replacement_garbage_collect.rs | 2 +- .../region_snapshot_replacement_start.rs | 9 +- .../tasks/region_snapshot_replacement_step.rs | 34 +- nexus/src/app/image.rs | 2 +- nexus/src/app/instance.rs | 63 +-- nexus/src/app/sagas/disk_create.rs | 6 +- nexus/src/app/sagas/instance_migrate.rs | 1 - .../src/app/sagas/region_replacement_drive.rs | 2 - .../app/sagas/region_replacement_finish.rs | 4 +- .../src/app/sagas/region_replacement_start.rs | 8 +- ...on_snapshot_replacement_garbage_collect.rs | 4 +- .../region_snapshot_replacement_start.rs | 13 +- .../sagas/region_snapshot_replacement_step.rs | 9 +- ...apshot_replacement_step_garbage_collect.rs | 4 +- nexus/src/app/sagas/snapshot_create.rs | 93 ++-- nexus/src/app/sagas/volume_delete.rs | 2 +- nexus/src/app/sagas/volume_remove_rop.rs | 2 +- .../integration_tests/volume_management.rs | 84 ++-- openapi/sled-agent.json | 327 +++----------- package-manifest.toml | 4 +- sled-agent/src/instance.rs | 401 +++++++++++++++--- sled-agent/src/sim/http_entrypoints_pantry.rs | 2 +- sled-agent/src/sim/sled_agent.rs | 44 +- sled-agent/src/sim/storage.rs | 2 +- sled-agent/types/src/instance.rs | 41 +- 38 files changed, 799 insertions(+), 695 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5048914c74..1aec1ccc89 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -693,7 +693,7 @@ dependencies = [ [[package]] name = "bhyve_api" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" +source = "git+https://github.com/oxidecomputer/propolis?rev=d4529fd8247386b422b78e1203315d5baea5ea8b#d4529fd8247386b422b78e1203315d5baea5ea8b" dependencies = [ "bhyve_api_sys", "libc", @@ -703,7 +703,7 @@ dependencies = [ [[package]] name = "bhyve_api_sys" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" +source = "git+https://github.com/oxidecomputer/propolis?rev=d4529fd8247386b422b78e1203315d5baea5ea8b#d4529fd8247386b422b78e1203315d5baea5ea8b" dependencies = [ "libc", "strum", @@ -7023,7 +7023,7 @@ dependencies = [ "pq-sys", "pretty_assertions", "progenitor-client", - "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739)", + "propolis-client 0.1.0 
(git+https://github.com/oxidecomputer/propolis?rev=d4529fd8247386b422b78e1203315d5baea5ea8b)", "qorb", "rand", "rcgen", @@ -7289,7 +7289,7 @@ dependencies = [ "oximeter-producer", "oxnet", "pretty_assertions", - "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739)", + "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=d4529fd8247386b422b78e1203315d5baea5ea8b)", "propolis-mock-server", "propolis_api_types", "rand", @@ -9004,12 +9004,14 @@ dependencies = [ [[package]] name = "propolis-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" +source = "git+https://github.com/oxidecomputer/propolis?rev=d4529fd8247386b422b78e1203315d5baea5ea8b#d4529fd8247386b422b78e1203315d5baea5ea8b" dependencies = [ "async-trait", "base64 0.21.7", + "crucible-client-types", "futures", "progenitor", + "propolis_api_types", "rand", "reqwest", "schemars", @@ -9046,7 +9048,7 @@ dependencies = [ [[package]] name = "propolis-mock-server" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" +source = "git+https://github.com/oxidecomputer/propolis?rev=d4529fd8247386b422b78e1203315d5baea5ea8b#d4529fd8247386b422b78e1203315d5baea5ea8b" dependencies = [ "anyhow", "atty", @@ -9088,12 +9090,13 @@ dependencies = [ [[package]] name = "propolis_api_types" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" +source = "git+https://github.com/oxidecomputer/propolis?rev=d4529fd8247386b422b78e1203315d5baea5ea8b#d4529fd8247386b422b78e1203315d5baea5ea8b" dependencies = [ "crucible-client-types", "propolis_types", "schemars", "serde", + "serde_with", "thiserror 1.0.69", "uuid", ] @@ -9101,7 +9104,7 @@ dependencies = [ [[package]] name = "propolis_types" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" +source = "git+https://github.com/oxidecomputer/propolis?rev=d4529fd8247386b422b78e1203315d5baea5ea8b#d4529fd8247386b422b78e1203315d5baea5ea8b" dependencies = [ "schemars", "serde", @@ -10739,6 +10742,7 @@ dependencies = [ "omicron-workspace-hack", "oxnet", "progenitor", + "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=d4529fd8247386b422b78e1203315d5baea5ea8b)", "regress 0.9.1", "reqwest", "schemars", @@ -10764,7 +10768,7 @@ dependencies = [ "omicron-uuid-kinds", "omicron-workspace-hack", "oxnet", - "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739)", + "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=d4529fd8247386b422b78e1203315d5baea5ea8b)", "rcgen", "schemars", "serde", diff --git a/Cargo.toml b/Cargo.toml index 3d29f61cf9..8906ab4d70 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -547,10 +547,10 @@ prettyplease = { version = "0.2.25", features = ["verbatim"] } proc-macro2 = "1.0" progenitor = "0.8.0" progenitor-client = "0.8.0" -bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "220a6f367c18f2452dbc4fa9086f3fe73b961739" } -propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "220a6f367c18f2452dbc4fa9086f3fe73b961739" } -propolis-client = { git = 
"https://github.com/oxidecomputer/propolis", rev = "220a6f367c18f2452dbc4fa9086f3fe73b961739" } -propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "220a6f367c18f2452dbc4fa9086f3fe73b961739" } +bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "d4529fd8247386b422b78e1203315d5baea5ea8b" } +propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "d4529fd8247386b422b78e1203315d5baea5ea8b" } +propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "d4529fd8247386b422b78e1203315d5baea5ea8b" } +propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "d4529fd8247386b422b78e1203315d5baea5ea8b" } proptest = "1.5.0" qorb = "0.2.1" quote = "1.0" diff --git a/clients/sled-agent-client/Cargo.toml b/clients/sled-agent-client/Cargo.toml index 770588d6b4..e6c77fe24a 100644 --- a/clients/sled-agent-client/Cargo.toml +++ b/clients/sled-agent-client/Cargo.toml @@ -17,6 +17,7 @@ omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true oxnet.workspace = true progenitor.workspace = true +propolis-client.workspace = true regress.workspace = true reqwest = { workspace = true, features = [ "json", "rustls-tls", "stream" ] } schemars.workspace = true diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index f2eb957650..6a9c721587 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -12,6 +12,8 @@ use serde::Serialize; use std::convert::TryFrom; use uuid::Uuid; +pub use propolis_client::{CrucibleOpts, VolumeConstructionRequest}; + progenitor::generate_api!( spec = "../../openapi/sled-agent.json", derives = [schemars::JsonSchema, PartialEq], diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out index 8e061f2906..d2b672e62c 100644 --- a/dev-tools/ls-apis/tests/api_dependencies.out +++ b/dev-tools/ls-apis/tests/api_dependencies.out @@ -67,8 +67,8 @@ Oximeter (client: oximeter-client) consumed by: omicron-nexus (omicron/nexus) via 2 paths Propolis (client: propolis-client) - consumed by: omicron-nexus (omicron/nexus) via 2 paths - consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path + consumed by: omicron-nexus (omicron/nexus) via 3 paths + consumed by: omicron-sled-agent (omicron/sled-agent) via 2 paths Crucible Repair (client: repair-client) consumed by: crucible-downstairs (crucible/downstairs) via 1 path diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index e501e650b1..629e465212 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -135,7 +135,7 @@ use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::SledUuid; -use sled_agent_client::types::VolumeConstructionRequest; +use sled_agent_client::VolumeConstructionRequest; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::BTreeMap; diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs index 8aad7f2cfd..e4a7d78224 100644 --- a/nexus/db-queries/src/db/datastore/region_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_replacement.rs @@ -924,7 +924,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; - use sled_agent_client::types::VolumeConstructionRequest; + use 
sled_agent_client::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index 90b014c582..5568822e07 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -1371,7 +1371,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; use omicron_uuid_kinds::DatasetUuid; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 505533da50..e55c374073 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -58,7 +58,7 @@ use omicron_uuid_kinds::UpstairsRepairKind; use serde::Deserialize; use serde::Deserializer; use serde::Serialize; -use sled_agent_client::types::VolumeConstructionRequest; +use sled_agent_client::VolumeConstructionRequest; use std::collections::HashSet; use std::collections::VecDeque; use std::net::AddrParseError; @@ -1118,7 +1118,7 @@ impl DataStore { ) -> CreateResult { let volume = self.volume_checkout(volume_id, reason).await?; - let vcr: sled_agent_client::types::VolumeConstructionRequest = + let vcr: sled_agent_client::VolumeConstructionRequest = serde_json::from_str(volume.data())?; let randomized_vcr = serde_json::to_string( @@ -2578,10 +2578,15 @@ fn region_in_vcr( VolumeConstructionRequest::Region { opts, .. } => { for target in &opts.target { - let parsed_target: SocketAddrV6 = target.parse()?; - if parsed_target == *region { - region_found = true; - break; + match target { + SocketAddr::V6(t) if *t == *region => { + region_found = true; + break; + } + SocketAddr::V6(_) => {} + SocketAddr::V4(_) => { + bail!("region target contains an IPv4 address"); + } } } } @@ -2643,9 +2648,16 @@ fn read_only_target_in_vcr( } for target in &opts.target { - let parsed_target: SocketAddrV6 = target.parse()?; - if parsed_target == *read_only_target && opts.read_only { - return Ok(true); + match target { + SocketAddr::V6(t) + if *t == *read_only_target && opts.read_only => + { + return Ok(true); + } + SocketAddr::V6(_) => {} + SocketAddr::V4(_) => { + bail!("region target contains an IPv4 address"); + } } } } @@ -3177,9 +3189,9 @@ impl DataStore { blocks_per_extent: 1, extent_count: 1, gen: 1, - opts: sled_agent_client::types::CrucibleOpts { + opts: sled_agent_client::CrucibleOpts { id: volume_to_delete_id.0, - target: vec![existing.0.to_string()], + target: vec![existing.0.into()], lossy: false, flush_timeout: None, key: None, @@ -3442,7 +3454,9 @@ pub fn read_only_resources_associated_with_volume( VolumeConstructionRequest::Region { opts, .. } => { for target in &opts.target { if opts.read_only { - crucible_targets.read_only_targets.push(target.clone()); + crucible_targets + .read_only_targets + .push(target.to_string()); } } } @@ -3481,7 +3495,7 @@ pub fn read_write_resources_associated_with_volume( VolumeConstructionRequest::Region { opts, .. } => { if !opts.read_only { for target in &opts.target { - targets.push(target.clone()); + targets.push(target.to_string()); } } } @@ -3597,10 +3611,11 @@ fn replace_region_in_vcr( VolumeConstructionRequest::Region { opts, gen, .. 
} => { for target in &mut opts.target { - let parsed_target: SocketAddrV6 = target.parse()?; - if parsed_target == old_region { - *target = new_region.to_string(); - old_region_found = true; + if let SocketAddr::V6(target) = target { + if *target == old_region { + *target = new_region; + old_region_found = true; + } } } @@ -3681,10 +3696,11 @@ fn replace_read_only_target_in_vcr( } for target in &mut opts.target { - let parsed_target: SocketAddrV6 = target.parse()?; - if parsed_target == old_target.0 && opts.read_only { - *target = new_target.0.to_string(); - replacements += 1; + if let SocketAddr::V6(target) = target { + if *target == old_target.0 && opts.read_only { + *target = new_target.0; + replacements += 1; + } } } } @@ -3727,9 +3743,10 @@ fn find_matching_rw_regions_in_volume( VolumeConstructionRequest::Region { opts, .. } => { if !opts.read_only { for target in &opts.target { - let parsed_target: SocketAddrV6 = target.parse()?; - if parsed_target.ip() == ip { - matched_targets.push(parsed_target); + if let SocketAddr::V6(target) = target { + if target.ip() == ip { + matched_targets.push(*target); + } } } } @@ -4006,7 +4023,7 @@ mod tests { use nexus_types::external_api::params::DiskSource; use omicron_common::api::external::ByteCount; use omicron_test_utils::dev; - use sled_agent_client::types::CrucibleOpts; + use sled_agent_client::CrucibleOpts; // Assert that Nexus will not fail to deserialize an old version of // CrucibleResources that was serialized before schema update 6.0.0. @@ -4145,7 +4162,7 @@ mod tests { .await .unwrap(); - let mut region_addresses: Vec = + let mut region_addresses: Vec = Vec::with_capacity(datasets_and_regions.len()); for (i, (_, region)) in datasets_and_regions.iter().enumerate() { @@ -4162,7 +4179,7 @@ mod tests { let address: SocketAddrV6 = datastore.region_addr(region.id()).await.unwrap().unwrap(); - region_addresses.push(address.to_string()); + region_addresses.push(address); } // Manually create a replacement region at the first dataset @@ -4209,9 +4226,9 @@ mod tests { id: volume_id, target: vec![ // target to replace - region_addresses[0].clone(), - region_addresses[1].clone(), - region_addresses[2].clone(), + region_addresses[0].into(), + region_addresses[1].into(), + region_addresses[2].into(), ], lossy: false, flush_timeout: None, @@ -4238,7 +4255,7 @@ mod tests { db::datastore::VolumeReplacementParams { volume_id, region_id: datasets_and_regions[0].1.id(), - region_addr: region_addresses[0].parse().unwrap(), + region_addr: region_addresses[0], }, /* replacement */ db::datastore::VolumeReplacementParams { @@ -4271,9 +4288,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - replacement_region_addr.to_string(), // replaced - region_addresses[1].clone(), - region_addresses[2].clone(), + replacement_region_addr.into(), // replaced + region_addresses[1].into(), + region_addresses[2].into(), ], lossy: false, flush_timeout: None, @@ -4302,7 +4319,7 @@ mod tests { db::datastore::VolumeReplacementParams { volume_id: volume_to_delete_id, region_id: datasets_and_regions[0].1.id(), - region_addr: region_addresses[0].parse().unwrap(), + region_addr: region_addresses[0], }, ) .await @@ -4329,9 +4346,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - region_addresses[0].clone(), // back to what it was - region_addresses[1].clone(), - region_addresses[2].clone(), + region_addresses[0].into(), // back to what it was + region_addresses[1].into(), + region_addresses[2].into(), ], lossy: false, flush_timeout: None, @@ -4383,7 
+4400,7 @@ mod tests { .await .unwrap(); - let mut region_addresses: Vec = + let mut region_addresses: Vec = Vec::with_capacity(datasets_and_regions.len()); for (i, (_, region)) in datasets_and_regions.iter().enumerate() { @@ -4400,7 +4417,7 @@ mod tests { let address: SocketAddrV6 = datastore.region_addr(region.id()).await.unwrap().unwrap(); - region_addresses.push(address.to_string()); + region_addresses.push(address); } // Manually create a replacement region at the first dataset @@ -4435,28 +4452,31 @@ mod tests { // need to add region snapshot objects to satisfy volume create // transaction's search for resources - let address_1 = String::from("[fd00:1122:3344:104::1]:400"); - let address_2 = String::from("[fd00:1122:3344:105::1]:401"); - let address_3 = String::from("[fd00:1122:3344:106::1]:402"); + let address_1: SocketAddrV6 = + "[fd00:1122:3344:104::1]:400".parse().unwrap(); + let address_2: SocketAddrV6 = + "[fd00:1122:3344:105::1]:401".parse().unwrap(); + let address_3: SocketAddrV6 = + "[fd00:1122:3344:106::1]:402".parse().unwrap(); let region_snapshots = [ RegionSnapshot::new( DatasetUuid::new_v4(), Uuid::new_v4(), Uuid::new_v4(), - address_1.clone(), + address_1.to_string(), ), RegionSnapshot::new( DatasetUuid::new_v4(), Uuid::new_v4(), Uuid::new_v4(), - address_2.clone(), + address_2.to_string(), ), RegionSnapshot::new( DatasetUuid::new_v4(), Uuid::new_v4(), Uuid::new_v4(), - address_3.clone(), + address_3.to_string(), ), ]; @@ -4493,9 +4513,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - region_addresses[0].clone(), - region_addresses[1].clone(), - region_addresses[2].clone(), + region_addresses[0].into(), + region_addresses[1].into(), + region_addresses[2].into(), ], lossy: false, flush_timeout: None, @@ -4517,9 +4537,9 @@ mod tests { id: rop_id, target: vec![ // target to replace - address_1.clone(), - address_2.clone(), - address_3.clone(), + address_1.into(), + address_2.into(), + address_3.into(), ], lossy: false, flush_timeout: None, @@ -4587,7 +4607,7 @@ mod tests { let volume_replace_snapshot_result = datastore .volume_replace_snapshot( VolumeWithTarget(volume_id), - ExistingTarget(address_1.parse().unwrap()), + ExistingTarget(address_1), ReplacementTarget(replacement_region_addr), VolumeToDelete(volume_to_delete_id), ) @@ -4616,9 +4636,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - region_addresses[0].clone(), - region_addresses[1].clone(), - region_addresses[2].clone(), + region_addresses[0].into(), + region_addresses[1].into(), + region_addresses[2].into(), ], lossy: false, flush_timeout: None, @@ -4640,9 +4660,9 @@ mod tests { id: rop_id, target: vec![ // target replaced - replacement_region_addr.to_string(), - address_2.clone(), - address_3.clone(), + replacement_region_addr.into(), + address_2.into(), + address_3.into(), ], lossy: false, flush_timeout: None, @@ -4682,7 +4702,7 @@ mod tests { id: volume_to_delete_id, target: vec![ // replaced target stashed here - address_1.clone(), + address_1.into(), ], lossy: false, flush_timeout: None, @@ -4745,7 +4765,7 @@ mod tests { .volume_replace_snapshot( VolumeWithTarget(volume_id), ExistingTarget(replacement_region_addr), - ReplacementTarget(address_1.parse().unwrap()), + ReplacementTarget(address_1), VolumeToDelete(volume_to_delete_id), ) .await @@ -4772,9 +4792,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - region_addresses[0].clone(), - region_addresses[1].clone(), - region_addresses[2].clone(), + region_addresses[0].into(), + 
region_addresses[1].into(), + region_addresses[2].into(), ], lossy: false, flush_timeout: None, @@ -4796,7 +4816,9 @@ mod tests { id: rop_id, target: vec![ // back to what it was - address_1, address_2, address_3, + address_1.into(), + address_2.into(), + address_3.into(), ], lossy: false, flush_timeout: None, @@ -4836,7 +4858,7 @@ mod tests { id: volume_to_delete_id, target: vec![ // replacement stashed here - replacement_region_addr.to_string(), + replacement_region_addr.into(), ], lossy: false, flush_timeout: None, @@ -4899,16 +4921,19 @@ mod tests { // need to add region snapshot objects to satisfy volume create // transaction's search for resources - let address_1 = String::from("[fd00:1122:3344:104::1]:400"); - let address_2 = String::from("[fd00:1122:3344:105::1]:401"); - let address_3 = String::from("[fd00:1122:3344:106::1]:402"); + let address_1: SocketAddrV6 = + "[fd00:1122:3344:104::1]:400".parse().unwrap(); + let address_2: SocketAddrV6 = + "[fd00:1122:3344:105::1]:401".parse().unwrap(); + let address_3: SocketAddrV6 = + "[fd00:1122:3344:106::1]:402".parse().unwrap(); datastore .region_snapshot_create(RegionSnapshot::new( DatasetUuid::new_v4(), Uuid::new_v4(), Uuid::new_v4(), - address_1.clone(), + address_1.to_string(), )) .await .unwrap(); @@ -4917,7 +4942,7 @@ mod tests { DatasetUuid::new_v4(), Uuid::new_v4(), Uuid::new_v4(), - address_2.clone(), + address_2.to_string(), )) .await .unwrap(); @@ -4926,7 +4951,7 @@ mod tests { DatasetUuid::new_v4(), Uuid::new_v4(), Uuid::new_v4(), - address_3.clone(), + address_3.to_string(), )) .await .unwrap(); @@ -4949,9 +4974,9 @@ mod tests { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - address_1.clone(), - address_2, - address_3, + address_1.into(), + address_2.into(), + address_3.into(), ], lossy: false, flush_timeout: None, @@ -4971,10 +4996,7 @@ mod tests { .unwrap(); let volumes = datastore - .find_volumes_referencing_socket_addr( - &opctx, - address_1.parse().unwrap(), - ) + .find_volumes_referencing_socket_addr(&opctx, address_1.into()) .await .unwrap(); @@ -5014,9 +5036,9 @@ mod tests { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + "[fd00:1122:3344:104::1]:400".parse().unwrap(), + "[fd00:1122:3344:105::1]:401".parse().unwrap(), + "[fd00:1122:3344:106::1]:402".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5050,9 +5072,9 @@ mod tests { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + "[fd00:1122:3344:104::1]:400".parse().unwrap(), + "[fd00:1122:3344:105::1]:401".parse().unwrap(), + "[fd00:1122:3344:106::1]:402".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5089,9 +5111,9 @@ mod tests { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + "[fd00:1122:3344:104::1]:400".parse().unwrap(), + "[fd00:1122:3344:105::1]:401".parse().unwrap(), + "[fd00:1122:3344:106::1]:402".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5133,9 +5155,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + 
"[fd00:1122:3344:104::1]:400".parse().unwrap(), + "[fd00:1122:3344:105::1]:401".parse().unwrap(), + "[fd00:1122:3344:106::1]:402".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5175,9 +5197,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - new_target.0.to_string(), - String::from("[fd00:1122:3344:106::1]:402"), + "[fd00:1122:3344:104::1]:400".parse().unwrap(), + new_target.0.into(), + "[fd00:1122:3344:106::1]:402".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5210,9 +5232,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd55:1122:3344:204::1]:1000"), - String::from("[fd55:1122:3344:205::1]:1001"), - String::from("[fd55:1122:3344:206::1]:1002"), + "[fd55:1122:3344:204::1]:1000".parse().unwrap(), + "[fd55:1122:3344:205::1]:1001".parse().unwrap(), + "[fd55:1122:3344:206::1]:1002".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5233,9 +5255,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd33:1122:3344:304::1]:2000"), - String::from("[fd33:1122:3344:305::1]:2001"), - String::from("[fd33:1122:3344:306::1]:2002"), + "[fd33:1122:3344:304::1]:2000".parse().unwrap(), + "[fd33:1122:3344:305::1]:2001".parse().unwrap(), + "[fd33:1122:3344:306::1]:2002".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5258,9 +5280,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + "[fd00:1122:3344:104::1]:400".parse().unwrap(), + "[fd00:1122:3344:105::1]:401".parse().unwrap(), + "[fd00:1122:3344:106::1]:402".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5301,9 +5323,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd55:1122:3344:204::1]:1000"), - String::from("[fd55:1122:3344:205::1]:1001"), - String::from("[fd55:1122:3344:206::1]:1002"), + "[fd55:1122:3344:204::1]:1000".parse().unwrap(), + "[fd55:1122:3344:205::1]:1001".parse().unwrap(), + "[fd55:1122:3344:206::1]:1002".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5324,13 +5346,13 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from( - "[fd33:1122:3344:304::1]:2000" - ), - String::from( - "[fd33:1122:3344:305::1]:2001" - ), - new_target.0.to_string(), + "[fd33:1122:3344:304::1]:2000" + .parse() + .unwrap(), + "[fd33:1122:3344:305::1]:2001" + .parse() + .unwrap(), + new_target.0.into(), ], lossy: false, flush_timeout: None, @@ -5353,9 +5375,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + "[fd00:1122:3344:104::1]:400".parse().unwrap(), + "[fd00:1122:3344:105::1]:401".parse().unwrap(), + "[fd00:1122:3344:106::1]:402".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5383,9 +5405,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd33:1122:3344:304::1]:2000"), - String::from("[fd33:1122:3344:305::1]:2001"), - String::from("[fd33:1122:3344:306::1]:2002"), + "[fd33:1122:3344:304::1]:2000".parse().unwrap(), + "[fd33:1122:3344:305::1]:2001".parse().unwrap(), + "[fd33:1122:3344:306::1]:2002".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5412,9 +5434,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - 
String::from("[fd55:1122:3344:204::1]:1000"), - String::from("[fd55:1122:3344:205::1]:1001"), - String::from("[fd55:1122:3344:206::1]:1002"), + "[fd55:1122:3344:204::1]:1000".parse().unwrap(), + "[fd55:1122:3344:205::1]:1001".parse().unwrap(), + "[fd55:1122:3344:206::1]:1002".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5450,9 +5472,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - new_target.0.to_string(), - String::from("[fd33:1122:3344:305::1]:2001"), - String::from("[fd33:1122:3344:306::1]:2002"), + new_target.0.into(), + "[fd33:1122:3344:305::1]:2001".parse().unwrap(), + "[fd33:1122:3344:306::1]:2002".parse().unwrap(), ], lossy: false, flush_timeout: None, @@ -5481,9 +5503,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd55:1122:3344:204::1]:1000"), - String::from("[fd55:1122:3344:205::1]:1001"), - String::from("[fd55:1122:3344:206::1]:1002"), + "[fd55:1122:3344:204::1]:1000".parse().unwrap(), + "[fd55:1122:3344:205::1]:1001".parse().unwrap(), + "[fd55:1122:3344:206::1]:1002".parse().unwrap(), ], lossy: false, flush_timeout: None, diff --git a/nexus/db-queries/src/db/datastore/volume_repair.rs b/nexus/db-queries/src/db/datastore/volume_repair.rs index 598d9d77a2..976c7f756a 100644 --- a/nexus/db-queries/src/db/datastore/volume_repair.rs +++ b/nexus/db-queries/src/db/datastore/volume_repair.rs @@ -169,7 +169,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::VolumeConstructionRequest; #[tokio::test] async fn volume_lock_conflict_error_returned() { diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index caa262bc8c..acadf3dee3 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -306,8 +306,8 @@ mod test { use nexus_db_model::RegionReplacement; use nexus_db_model::Volume; use nexus_test_utils_macros::nexus_test; - use sled_agent_client::types::CrucibleOpts; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::CrucibleOpts; + use sled_agent_client::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/src/app/background/tasks/region_replacement_driver.rs b/nexus/src/app/background/tasks/region_replacement_driver.rs index 6cc28f9dfd..c08cd2edfc 100644 --- a/nexus/src/app/background/tasks/region_replacement_driver.rs +++ b/nexus/src/app/background/tasks/region_replacement_driver.rs @@ -258,7 +258,7 @@ mod test { use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::UpstairsSessionKind; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index 61a84c579d..a4351e0573 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs @@ -198,7 +198,7 @@ mod test { use nexus_db_queries::db::datastore::region_snapshot_replacement; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; - use sled_agent_client::types::VolumeConstructionRequest; + use 
sled_agent_client::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index 57bbf3741c..88999efeb9 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -155,7 +155,7 @@ mod test { use nexus_db_model::RegionSnapshotReplacementState; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index f2b82a3943..412a437250 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -355,8 +355,8 @@ mod test { use omicron_common::api::external; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; - use sled_agent_client::types::CrucibleOpts; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::CrucibleOpts; + use sled_agent_client::VolumeConstructionRequest; use std::collections::BTreeMap; use uuid::Uuid; @@ -701,10 +701,7 @@ mod test { gen: 1, opts: CrucibleOpts { id: Uuid::new_v4(), - target: vec![ - // the region snapshot - String::from("[::1]:12345"), - ], + target: vec!["[::1]:12345".parse().unwrap()], lossy: false, flush_timeout: None, key: None, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index f481126312..c8010dd90d 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -566,8 +566,9 @@ mod test { use nexus_db_model::Volume; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; - use sled_agent_client::types::CrucibleOpts; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::CrucibleOpts; + use sled_agent_client::VolumeConstructionRequest; + use std::net::SocketAddrV6; use uuid::Uuid; type ControlPlaneTestContext = @@ -575,7 +576,7 @@ mod test { async fn add_fake_volume_for_snapshot_addr( datastore: &DataStore, - snapshot_addr: String, + snapshot_addr: SocketAddrV6, ) -> Uuid { let new_volume_id = Uuid::new_v4(); @@ -587,7 +588,7 @@ mod test { DatasetUuid::new_v4(), Uuid::new_v4(), Uuid::new_v4(), - snapshot_addr.clone(), + snapshot_addr.to_string(), )) .await .unwrap(); @@ -604,7 +605,7 @@ mod test { gen: 0, opts: CrucibleOpts { id: Uuid::new_v4(), - target: vec![snapshot_addr], + target: vec![snapshot_addr.into()], lossy: false, flush_timeout: None, key: None, @@ -656,13 +657,14 @@ mod test { let dataset_id = DatasetUuid::new_v4(); let region_id = Uuid::new_v4(); let snapshot_id = Uuid::new_v4(); - let snapshot_addr = String::from("[fd00:1122:3344::101]:9876"); + let snapshot_addr: SocketAddrV6 = + "[fd00:1122:3344::101]:9876".parse().unwrap(); let fake_region_snapshot = RegionSnapshot::new( dataset_id, region_id, snapshot_id, - snapshot_addr.clone(), + snapshot_addr.to_string(), ); 
datastore.region_snapshot_create(fake_region_snapshot).await.unwrap(); @@ -746,28 +748,22 @@ mod test { // Add some fake volumes that reference the region snapshot being // replaced - let new_volume_1_id = add_fake_volume_for_snapshot_addr( - &datastore, - snapshot_addr.clone(), - ) - .await; - let new_volume_2_id = add_fake_volume_for_snapshot_addr( - &datastore, - snapshot_addr.clone(), - ) - .await; + let new_volume_1_id = + add_fake_volume_for_snapshot_addr(&datastore, snapshot_addr).await; + let new_volume_2_id = + add_fake_volume_for_snapshot_addr(&datastore, snapshot_addr).await; // Add some fake volumes that do not let other_volume_1_id = add_fake_volume_for_snapshot_addr( &datastore, - String::from("[fd00:1122:3344::101]:1000"), + "[fd00:1122:3344::101]:1000".parse().unwrap(), ) .await; let other_volume_2_id = add_fake_volume_for_snapshot_addr( &datastore, - String::from("[fd12:5544:3344::912]:3901"), + "[fd12:5544:3344::912]:3901".parse().unwrap(), ) .await; diff --git a/nexus/src/app/image.rs b/nexus/src/app/image.rs index a3fa722d36..d46da647d5 100644 --- a/nexus/src/app/image.rs +++ b/nexus/src/app/image.rs @@ -155,7 +155,7 @@ impl super::Nexus { let image_id = Uuid::new_v4(); let volume_construction_request = - sled_agent_client::types::VolumeConstructionRequest::File { + sled_agent_client::VolumeConstructionRequest::File { id: image_id, block_size, path: "/opt/oxide/propolis-server/blob/alpine.iso" diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index e916974332..109c0d8521 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -63,8 +63,7 @@ use propolis_client::support::WebSocketStream; use sagas::instance_common::ExternalIpAttach; use sagas::instance_start; use sagas::instance_update; -use sled_agent_client::types::BootOrderEntry; -use sled_agent_client::types::BootSettings; +use sled_agent_client::types::InstanceBootSettings; use sled_agent_client::types::InstanceMigrationTargetParams; use sled_agent_client::types::InstanceProperties; use sled_agent_client::types::VmmPutStateBody; @@ -1053,8 +1052,6 @@ impl super::Nexus { ) .await?; - let mut boot_disk_name = None; - let mut disk_reqs = vec![]; for disk in &disks { // Disks that are attached to an instance should always have a slot @@ -1089,58 +1086,22 @@ impl super::Nexus { ) .await?; - // Propolis wants the name of the boot disk rather than ID, because we send names - // rather than IDs in the disk requsts as assembled below. - if db_instance.boot_disk_id == Some(disk.id()) { - boot_disk_name = Some(disk.name().to_string()); - } - - disk_reqs.push(sled_agent_client::types::DiskRequest { + disk_reqs.push(sled_agent_client::types::InstanceDisk { + disk_id: disk.id(), name: disk.name().to_string(), - slot: sled_agent_client::types::Slot(slot.0), + slot: slot.0, read_only: false, - device: "nvme".to_string(), - volume_construction_request: serde_json::from_str( - &volume.data(), - ) - .map_err(Error::from)?, + vcr_json: volume.data().to_owned(), }); } - let boot_settings = if let Some(boot_disk_name) = boot_disk_name { - Some(BootSettings { - order: vec![BootOrderEntry { name: boot_disk_name }], - }) - } else { - if let Some(instance_boot_disk_id) = - db_instance.boot_disk_id.as_ref() - { - // This should never occur: when setting the boot disk we ensure it is - // attached, and when detaching a disk we ensure it is not the boot - // disk. If this error is seen, the instance somehow had a boot disk - // that was not attached anyway. 
- // - // When Propolis accepts an ID rather than name, and we don't need to - // look up a name when assembling the Propolis request, we might as well - // remove this check; we can just pass the ID and rely on Propolis' own - // check that the boot disk is attached. - if boot_disk_name.is_none() { - error!(self.log, "instance boot disk is not attached"; - "boot_disk_id" => ?instance_boot_disk_id, - "instance id" => %db_instance.id()); - - return Err(InstanceStateChangeError::Other( - Error::internal_error(&format!( - "instance {} has boot disk {:?} but it is not attached", - db_instance.id(), - db_instance.boot_disk_id.as_ref(), - )), - )); - } - } - - None - }; + // The routines that maintain an instance's boot options are supposed to + // guarantee that the boot disk ID, if present, is the ID of an attached + // disk. If this invariant isn't upheld, Propolis will catch the failure + // when it processes its received VM configuration. + let boot_settings = db_instance + .boot_disk_id + .map(|id| InstanceBootSettings { order: vec![id] }); let nics = self .db_datastore diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index 51f6d29de1..5dcb3a0616 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -20,10 +20,10 @@ use omicron_common::api::external::Error; use rand::{rngs::StdRng, RngCore, SeedableRng}; use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; -use std::collections::VecDeque; +use sled_agent_client::{CrucibleOpts, VolumeConstructionRequest}; use std::convert::TryFrom; use std::net::SocketAddrV6; +use std::{collections::VecDeque, net::SocketAddr}; use steno::ActionError; use steno::Node; use uuid::Uuid; @@ -506,7 +506,7 @@ async fn sdc_regions_ensure( )), ) }) - .map(|addr| addr.to_string()) + .map(SocketAddr::V6) }) .collect::, ActionError>>()?, diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 2ddbe5b05c..644f0c42e3 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -539,7 +539,6 @@ async fn sim_instance_migrate( InstanceStateChangeRequest::Migrate( InstanceMigrationTargetParams { src_propolis_addr: src_vmm_addr.to_string(), - src_propolis_id, }, ), ) diff --git a/nexus/src/app/sagas/region_replacement_drive.rs b/nexus/src/app/sagas/region_replacement_drive.rs index 81f9a9a7eb..30cb76f7fb 100644 --- a/nexus/src/app/sagas/region_replacement_drive.rs +++ b/nexus/src/app/sagas/region_replacement_drive.rs @@ -148,7 +148,6 @@ use crate::app::{authn, authz, db}; use chrono::DateTime; use chrono::Utc; use nexus_db_model::VmmState; -use nexus_types::identity::Resource; use omicron_common::api::external::Error; use propolis_client::types::ReplaceResult; use serde::Deserialize; @@ -1502,7 +1501,6 @@ async fn execute_propolis_drive_action( .instance_issue_crucible_vcr_request() .id(disk.id()) .body(propolis_client::types::InstanceVcrReplace { - name: disk.name().to_string(), vcr_json: disk_new_volume_vcr, }) .send() diff --git a/nexus/src/app/sagas/region_replacement_finish.rs b/nexus/src/app/sagas/region_replacement_finish.rs index 2212e6fdf3..fcbf40db84 100644 --- a/nexus/src/app/sagas/region_replacement_finish.rs +++ b/nexus/src/app/sagas/region_replacement_finish.rs @@ -219,8 +219,8 @@ pub(crate) mod test { use nexus_db_queries::context::OpContext; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; - use 
sled_agent_client::types::CrucibleOpts; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::CrucibleOpts; + use sled_agent_client::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index aa9e83c037..bbd68d3aac 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -58,8 +58,8 @@ use nexus_db_queries::db::datastore::REGION_REDUNDANCY_THRESHOLD; use omicron_common::api::external::Error; use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::CrucibleOpts; -use sled_agent_client::types::VolumeConstructionRequest; +use sled_agent_client::CrucibleOpts; +use sled_agent_client::VolumeConstructionRequest; use std::net::SocketAddrV6; use steno::ActionError; use steno::Node; @@ -688,7 +688,7 @@ async fn srrs_create_fake_volume( gen: 0, opts: CrucibleOpts { id: new_volume_id, - target: vec![old_region_address.to_string()], + target: vec![old_region_address.into()], lossy: false, flush_timeout: None, key: None, @@ -793,7 +793,7 @@ pub(crate) mod test { use nexus_types::identity::Asset; use omicron_common::api::internal::shared::DatasetKind; use omicron_uuid_kinds::DatasetUuid; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs index 675b2b0cb3..89c3ec2364 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs @@ -219,8 +219,8 @@ pub(crate) mod test { use nexus_db_queries::context::OpContext; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; - use sled_agent_client::types::CrucibleOpts; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::CrucibleOpts; + use sled_agent_client::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index b9ed75c288..4919919c99 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -68,8 +68,9 @@ use omicron_common::api::external::Error; use omicron_uuid_kinds::DatasetUuid; use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::CrucibleOpts; -use sled_agent_client::types::VolumeConstructionRequest; +use sled_agent_client::CrucibleOpts; +use sled_agent_client::VolumeConstructionRequest; +use std::net::SocketAddr; use std::net::SocketAddrV6; use steno::ActionError; use steno::Node; @@ -731,12 +732,12 @@ async fn rsrss_new_region_volume_create( ))); }; - let new_region_address = SocketAddrV6::new( + let new_region_address = SocketAddr::V6(SocketAddrV6::new( *new_dataset_address.ip(), ensured_region.port_number, 0, 0, - ); + )); // Create a volume to inflate the reference count of the newly created // read-only region. 
If this is not done it's possible that a user could @@ -753,7 +754,7 @@ async fn rsrss_new_region_volume_create( gen: 0, opts: CrucibleOpts { id: new_region_volume_id, - target: vec![new_region_address.to_string()], + target: vec![new_region_address], lossy: false, flush_timeout: None, key: None, @@ -1149,7 +1150,7 @@ pub(crate) mod test { use nexus_types::external_api::views; use nexus_types::identity::Asset; use omicron_uuid_kinds::GenericUuid; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::VolumeConstructionRequest; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step.rs b/nexus/src/app/sagas/region_snapshot_replacement_step.rs index a236fcf62c..fd34e80712 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step.rs @@ -56,13 +56,12 @@ use crate::app::db::lookup::LookupPath; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use nexus_db_model::VmmState; -use nexus_types::identity::Resource; use omicron_common::api::external::Error; use propolis_client::types::ReplaceResult; use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::CrucibleOpts; -use sled_agent_client::types::VolumeConstructionRequest; +use sled_agent_client::CrucibleOpts; +use sled_agent_client::VolumeConstructionRequest; use std::net::SocketAddrV6; use steno::ActionError; use steno::Node; @@ -555,11 +554,13 @@ async fn rsrss_notify_upstairs( "vmm id" => ?vmm.id, ); + // N.B. The ID passed to this request must match the disk backend ID that + // sled agent supplies to Propolis when it creates the instance. Currently, + // sled agent uses the disk ID as the backend ID. 
let result = client .instance_issue_crucible_vcr_request() .id(disk.id()) .body(propolis_client::types::InstanceVcrReplace { - name: disk.name().to_string(), vcr_json: new_volume_vcr, }) .send() diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs index 15c6a39651..f67a1ee31f 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs @@ -131,8 +131,8 @@ pub(crate) mod test { use nexus_db_queries::context::OpContext; use nexus_db_queries::db::datastore::region_snapshot_replacement; use nexus_test_utils_macros::nexus_test; - use sled_agent_client::types::CrucibleOpts; - use sled_agent_client::types::VolumeConstructionRequest; + use sled_agent_client::CrucibleOpts; + use sled_agent_client::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 4dd958c50b..cceec7d70e 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -112,18 +112,21 @@ use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid}; use rand::{rngs::StdRng, RngCore, SeedableRng}; use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::CrucibleOpts; use sled_agent_client::types::VmmIssueDiskSnapshotRequestBody; -use sled_agent_client::types::VolumeConstructionRequest; +use sled_agent_client::CrucibleOpts; +use sled_agent_client::VolumeConstructionRequest; use slog::info; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::collections::VecDeque; +use std::net::SocketAddr; use std::net::SocketAddrV6; use steno::ActionError; use steno::Node; use uuid::Uuid; +type ReplaceSocketsMap = BTreeMap; + // snapshot create saga: input parameters #[derive(Debug, Deserialize, Serialize)] @@ -539,7 +542,7 @@ async fn ssc_regions_ensure( )), ) }) - .map(|addr| addr.to_string()) + .map(SocketAddr::V6) }) .collect::, ActionError>>()?, @@ -1383,7 +1386,7 @@ async fn ssc_detach_disk_from_pantry( async fn ssc_start_running_snapshot( sagactx: NexusActionContext, -) -> Result, ActionError> { +) -> Result { let log = sagactx.user_data().log(); let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; @@ -1409,7 +1412,7 @@ async fn ssc_start_running_snapshot( .await .map_err(ActionError::action_failed)?; - let mut map: BTreeMap = BTreeMap::new(); + let mut map: ReplaceSocketsMap = BTreeMap::new(); for (dataset, region) in datasets_and_regions { let Some(dataset_addr) = dataset.address() else { @@ -1438,28 +1441,22 @@ async fn ssc_start_running_snapshot( ); // Map from the region to the snapshot - let region_addr = format!( - "{}", - SocketAddrV6::new( - *dataset_addr.ip(), - crucible_region.port_number, - 0, - 0 - ) + let region_addr = SocketAddrV6::new( + *dataset_addr.ip(), + crucible_region.port_number, + 0, + 0, ); - let snapshot_addr = format!( - "{}", - SocketAddrV6::new( - *dataset_addr.ip(), - crucible_running_snapshot.port_number, - 0, - 0 - ) + let snapshot_addr = SocketAddrV6::new( + *dataset_addr.ip(), + crucible_running_snapshot.port_number, + 0, + 0, ); info!(log, "map {} to {}", region_addr, snapshot_addr); - map.insert(region_addr, snapshot_addr.clone()); + map.insert(region_addr, snapshot_addr); // Once snapshot has been validated, and running snapshot has been // started, add an entry in the 
region_snapshot table to correspond to @@ -1470,7 +1467,7 @@ async fn ssc_start_running_snapshot( dataset_id: dataset.id().into(), region_id: region.id(), snapshot_id, - snapshot_addr, + snapshot_addr: snapshot_addr.to_string(), volume_references: 0, // to be filled later deleting: false, }) @@ -1570,7 +1567,7 @@ async fn ssc_create_volume_record( // read-only crucible agent downstairs (corresponding to this snapshot) // launched through this saga. let replace_sockets_map = - sagactx.lookup::>("replace_sockets_map")?; + sagactx.lookup::("replace_sockets_map")?; let snapshot_volume_construction_request: VolumeConstructionRequest = create_snapshot_from_disk( &disk_volume_construction_request, @@ -1694,7 +1691,7 @@ async fn ssc_release_volume_lock( /// VolumeConstructionRequest and modifying it accordingly. fn create_snapshot_from_disk( disk: &VolumeConstructionRequest, - socket_map: &BTreeMap, + socket_map: &ReplaceSocketsMap, ) -> anyhow::Result { // When copying a disk's VolumeConstructionRequest to turn it into a // snapshot: @@ -1756,9 +1753,19 @@ fn create_snapshot_from_disk( if work.socket_modification_required { for target in &mut opts.target { - target.clone_from(socket_map.get(target).ok_or_else( - || anyhow!("target {} not found in map!", target), - )?); + let target = match target { + SocketAddr::V6(v6) => v6, + SocketAddr::V4(_) => { + anyhow::bail!( + "unexpected IPv4 address in VCR: {:?}", + work.vcr_part + ) + } + }; + + *target = *socket_map.get(target).ok_or_else(|| { + anyhow!("target {} not found in map!", target) + })?; } } } @@ -1799,7 +1806,7 @@ mod test { use omicron_common::api::external::InstanceCpuCount; use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; - use sled_agent_client::types::CrucibleOpts; + use sled_agent_client::CrucibleOpts; use sled_agent_client::TestInterfaces as SledAgentTestInterfaces; use std::str::FromStr; @@ -1834,9 +1841,9 @@ mod test { lossy: false, read_only: true, target: vec![ - "[fd00:1122:3344:101::8]:19001".into(), - "[fd00:1122:3344:101::7]:19001".into(), - "[fd00:1122:3344:101::6]:19001".into(), + "[fd00:1122:3344:101::8]:19001".parse().unwrap(), + "[fd00:1122:3344:101::7]:19001".parse().unwrap(), + "[fd00:1122:3344:101::6]:19001".parse().unwrap(), ], cert_pem: None, key_pem: None, @@ -1860,34 +1867,34 @@ mod test { lossy: false, read_only: false, target: vec![ - "[fd00:1122:3344:101::8]:19002".into(), - "[fd00:1122:3344:101::7]:19002".into(), - "[fd00:1122:3344:101::6]:19002".into(), + "[fd00:1122:3344:101::8]:19002".parse().unwrap(), + "[fd00:1122:3344:101::7]:19002".parse().unwrap(), + "[fd00:1122:3344:101::6]:19002".parse().unwrap(), ], cert_pem: None, key_pem: None, root_cert_pem: None, flush_timeout: None, - control: Some("127.0.0.1:12345".into()), + control: Some("127.0.0.1:12345".parse().unwrap()), } }, ], }; - let mut replace_sockets: BTreeMap = BTreeMap::new(); + let mut replace_sockets = ReplaceSocketsMap::new(); // Replacements for top level Region only replace_sockets.insert( - "[fd00:1122:3344:101::6]:19002".into(), - "[XXXX:1122:3344:101::6]:9000".into(), + "[fd00:1122:3344:101::6]:19002".parse().unwrap(), + "[fd01:1122:3344:101::6]:9000".parse().unwrap(), ); replace_sockets.insert( - "[fd00:1122:3344:101::7]:19002".into(), - "[XXXX:1122:3344:101::7]:9000".into(), + "[fd00:1122:3344:101::7]:19002".parse().unwrap(), + "[fd01:1122:3344:101::7]:9000".parse().unwrap(), ); replace_sockets.insert( - "[fd00:1122:3344:101::8]:19002".into(), - "[XXXX:1122:3344:101::8]:9000".into(), + 
"[fd00:1122:3344:101::8]:19002".parse().unwrap(), + "[fd01:1122:3344:101::8]:9000".parse().unwrap(), ); let snapshot = diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index a8ded4e33c..89d8306265 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -336,7 +336,7 @@ async fn svd_delete_crucible_snapshot_records( /// It's insufficient to rely on the struct of CrucibleResources to clean up /// that is returned as part of svd_decrease_crucible_resource_count. Imagine a /// disk that is composed of three regions (a subset of -/// [`sled_agent_client::types::VolumeConstructionRequest`] is shown here): +/// [`sled_agent_client::VolumeConstructionRequest`] is shown here): /// /// ```json /// { diff --git a/nexus/src/app/sagas/volume_remove_rop.rs b/nexus/src/app/sagas/volume_remove_rop.rs index b614495615..0f81356365 100644 --- a/nexus/src/app/sagas/volume_remove_rop.rs +++ b/nexus/src/app/sagas/volume_remove_rop.rs @@ -10,7 +10,7 @@ use nexus_db_queries::db; use omicron_common::api::external::Error; use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::VolumeConstructionRequest; +use sled_agent_client::VolumeConstructionRequest; use steno::{ActionError, Node}; use uuid::Uuid; diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index f0eb294e58..b059aae12b 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -55,6 +55,7 @@ use omicron_common::api::external::Name; use omicron_common::api::internal; use omicron_test_utils::dev::poll::wait_for_condition; use omicron_test_utils::dev::poll::CondCheckError; +use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::DownstairsRegionKind; use omicron_uuid_kinds::GenericUuid; @@ -64,9 +65,9 @@ use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::UpstairsSessionKind; use rand::prelude::SliceRandom; use rand::{rngs::StdRng, SeedableRng}; -use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; +use sled_agent_client::{CrucibleOpts, VolumeConstructionRequest}; use std::collections::HashSet; -use std::net::SocketAddrV6; +use std::net::{SocketAddr, SocketAddrV6}; use std::sync::Arc; use uuid::Uuid; @@ -2206,44 +2207,44 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { // insert those here manually. // (dataset_id, region_id, snapshot_id, snapshot_addr) - let region_snapshots = vec![ + let region_snapshots: Vec<(DatasetUuid, Uuid, Uuid, SocketAddr)> = vec![ // first snapshot-create ( zpool0.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:101::7]:19016"), + "[fd00:1122:3344:101::7]:19016".parse().unwrap(), ), ( zpool1.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:102::7]:19016"), + "[fd00:1122:3344:102::7]:19016".parse().unwrap(), ), ( zpool2.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:103::7]:19016"), + "[fd00:1122:3344:103::7]:19016".parse().unwrap(), ), // second snapshot-create ( zpool0.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:101::7]:19016"), // duplicate! 
+ "[fd00:1122:3344:101::7]:19016".parse().unwrap(), ), ( zpool3.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:104::7]:19016"), + "[fd00:1122:3344:104::7]:19016".parse().unwrap(), ), ( zpool2.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:103::7]:19017"), + "[fd00:1122:3344:103::7]:19017".parse().unwrap(), ), ]; @@ -2258,7 +2259,7 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { dataset_id: (*dataset_id).into(), region_id: *region_id, snapshot_id: *snapshot_id, - snapshot_addr: snapshot_addr.clone(), + snapshot_addr: snapshot_addr.to_string(), volume_references: 0, deleting: false, }) @@ -2283,9 +2284,9 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - region_snapshots[0].3.clone(), - region_snapshots[1].3.clone(), - region_snapshots[2].3.clone(), + region_snapshots[0].3, + region_snapshots[1].3, + region_snapshots[2].3, ], lossy: false, flush_timeout: None, @@ -2379,7 +2380,7 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { dataset_id: (*dataset_id).into(), region_id: *region_id, snapshot_id: *snapshot_id, - snapshot_addr: snapshot_addr.clone(), + snapshot_addr: snapshot_addr.to_string(), volume_references: 0, deleting: false, }) @@ -2404,9 +2405,9 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - region_snapshots[3].3.clone(), - region_snapshots[4].3.clone(), - region_snapshots[5].3.clone(), + region_snapshots[3].3, + region_snapshots[4].3, + region_snapshots[5].3, ], lossy: false, flush_timeout: None, @@ -3659,7 +3660,7 @@ impl TestReadOnlyRegionReferenceUsage { gen: 1, opts: CrucibleOpts { id: Uuid::new_v4(), - target: vec![self.region_address.to_string()], + target: vec![self.region_address.into()], lossy: false, flush_timeout: None, key: None, @@ -3789,7 +3790,7 @@ impl TestReadOnlyRegionReferenceUsage { gen: 1, opts: CrucibleOpts { id: Uuid::new_v4(), - target: vec![self.region_address.to_string()], + target: vec![self.region_address.into()], lossy: false, flush_timeout: None, key: None, @@ -3822,7 +3823,7 @@ impl TestReadOnlyRegionReferenceUsage { gen: 1, opts: CrucibleOpts { id: Uuid::new_v4(), - target: vec![self.region_address.to_string()], + target: vec![self.region_address.into()], lossy: false, flush_timeout: None, key: None, @@ -3857,7 +3858,7 @@ impl TestReadOnlyRegionReferenceUsage { gen: 1, opts: CrucibleOpts { id: Uuid::new_v4(), - target: vec![self.region_address.to_string()], + target: vec![self.region_address.into()], lossy: false, flush_timeout: None, key: None, @@ -5377,30 +5378,30 @@ async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( let zpool3 = iter.next().expect("Expected four zpools"); // (dataset_id, region_id, snapshot_id, snapshot_addr) - let region_snapshots = vec![ + let region_snapshots: Vec<(DatasetUuid, Uuid, Uuid, SocketAddr)> = vec![ ( zpool0.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:101::7]:19016"), + "[fd00:1122:3344:101::7]:19016".parse().unwrap(), ), ( zpool1.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:102::7]:19016"), + "[fd00:1122:3344:102::7]:19016".parse().unwrap(), ), ( zpool2.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:103::7]:19016"), + "[fd00:1122:3344:103::7]:19016".parse().unwrap(), ), ( 
zpool3.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:104::7]:19016"), + "[fd00:1122:3344:104::7]:19016".parse().unwrap(), ), ]; @@ -5413,7 +5414,7 @@ async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( dataset_id: to_db_typed_uuid(*dataset_id), region_id: *region_id, snapshot_id: *snapshot_id, - snapshot_addr: snapshot_addr.clone(), + snapshot_addr: snapshot_addr.to_string(), volume_references: 0, deleting: false, }) @@ -5441,9 +5442,9 @@ async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - region_snapshots[0].3.clone(), - region_snapshots[1].3.clone(), - region_snapshots[2].3.clone(), + region_snapshots[0].3, + region_snapshots[1].3, + region_snapshots[2].3, ], lossy: false, flush_timeout: None, @@ -5479,9 +5480,9 @@ async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - region_snapshots[1].3.clone(), - region_snapshots[2].3.clone(), - region_snapshots[3].3.clone(), + region_snapshots[1].3, + region_snapshots[2].3, + region_snapshots[3].3, ], lossy: false, flush_timeout: None, @@ -5519,7 +5520,10 @@ async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( ); assert_eq!(region_snapshot_to_delete.region_id, region_snapshots[0].1); assert_eq!(region_snapshot_to_delete.snapshot_id, region_snapshots[0].2); - assert_eq!(region_snapshot_to_delete.snapshot_addr, region_snapshots[0].3); + assert_eq!( + region_snapshot_to_delete.snapshot_addr.parse::().unwrap(), + region_snapshots[0].3 + ); assert_eq!(region_snapshot_to_delete.volume_references, 0); assert_eq!(region_snapshot_to_delete.deleting, true); @@ -6050,9 +6054,9 @@ async fn test_no_zombie_read_only_regions(cptestctx: &ControlPlaneTestContext) { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - region_addrs[0].to_string(), - region_addrs[1].to_string(), - region_addrs[2].to_string(), + region_addrs[0].into(), + region_addrs[1].into(), + region_addrs[2].into(), ], lossy: false, flush_timeout: None, @@ -6236,9 +6240,9 @@ async fn test_no_zombie_read_write_regions( opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - region_addrs[0].to_string(), - region_addrs[1].to_string(), - region_addrs[2].to_string(), + region_addrs[0].into(), + region_addrs[1].into(), + region_addrs[2].into(), ], lossy: false, flush_timeout: None, diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index c7d2b36de4..81d9211104 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -2570,36 +2570,6 @@ } ] }, - "BootOrderEntry": { - "description": "An entry in the boot order stored in a [`BootSettings`] component.\n\n
JSON schema\n\n```json { \"description\": \"An entry in the boot order stored in a [`BootSettings`] component.\", \"type\": \"object\", \"required\": [ \"name\" ], \"properties\": { \"name\": { \"description\": \"The name of another component in the spec that Propolis should try to boot from.\\n\\nCurrently, only disk device components are supported.\", \"type\": \"string\" } } } ```
", - "type": "object", - "properties": { - "name": { - "description": "The name of another component in the spec that Propolis should try to boot from.\n\nCurrently, only disk device components are supported.", - "type": "string" - } - }, - "required": [ - "name" - ] - }, - "BootSettings": { - "description": "Settings supplied to the guest's firmware image that specify the order in which it should consider its options when selecting a device to try to boot from.\n\n
JSON schema\n\n```json { \"description\": \"Settings supplied to the guest's firmware image that specify the order in which it should consider its options when selecting a device to try to boot from.\", \"type\": \"object\", \"required\": [ \"order\" ], \"properties\": { \"order\": { \"description\": \"An ordered list of components to attempt to boot from.\", \"type\": \"array\", \"items\": { \"$ref\": \"#/components/schemas/BootOrderEntry\" } } }, \"additionalProperties\": false } ```
", - "type": "object", - "properties": { - "order": { - "description": "An ordered list of components to attempt to boot from.", - "type": "array", - "items": { - "$ref": "#/components/schemas/BootOrderEntry" - } - } - }, - "required": [ - "order" - ], - "additionalProperties": false - }, "BootstoreStatus": { "type": "object", "properties": { @@ -2891,59 +2861,6 @@ } ] }, - "CrucibleOpts": { - "description": "CrucibleOpts\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"id\", \"lossy\", \"read_only\", \"target\" ], \"properties\": { \"cert_pem\": { \"type\": [ \"string\", \"null\" ] }, \"control\": { \"type\": [ \"string\", \"null\" ] }, \"flush_timeout\": { \"type\": [ \"number\", \"null\" ], \"format\": \"float\" }, \"id\": { \"type\": \"string\", \"format\": \"uuid\" }, \"key\": { \"type\": [ \"string\", \"null\" ] }, \"key_pem\": { \"type\": [ \"string\", \"null\" ] }, \"lossy\": { \"type\": \"boolean\" }, \"read_only\": { \"type\": \"boolean\" }, \"root_cert_pem\": { \"type\": [ \"string\", \"null\" ] }, \"target\": { \"type\": \"array\", \"items\": { \"type\": \"string\" } } } } ```
", - "type": "object", - "properties": { - "cert_pem": { - "nullable": true, - "type": "string" - }, - "control": { - "nullable": true, - "type": "string" - }, - "flush_timeout": { - "nullable": true, - "type": "number", - "format": "float" - }, - "id": { - "type": "string", - "format": "uuid" - }, - "key": { - "nullable": true, - "type": "string" - }, - "key_pem": { - "nullable": true, - "type": "string" - }, - "lossy": { - "type": "boolean" - }, - "read_only": { - "type": "boolean" - }, - "root_cert_pem": { - "nullable": true, - "type": "string" - }, - "target": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": [ - "id", - "lossy", - "read_only", - "target" - ] - }, "DatasetConfig": { "description": "Configuration information necessary to request a single dataset.\n\nThese datasets are tracked directly by Nexus.", "type": "object", @@ -3250,34 +3167,6 @@ "identity" ] }, - "DiskRequest": { - "description": "DiskRequest\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"device\", \"name\", \"read_only\", \"slot\", \"volume_construction_request\" ], \"properties\": { \"device\": { \"type\": \"string\" }, \"name\": { \"type\": \"string\" }, \"read_only\": { \"type\": \"boolean\" }, \"slot\": { \"$ref\": \"#/components/schemas/Slot\" }, \"volume_construction_request\": { \"$ref\": \"#/components/schemas/VolumeConstructionRequest\" } } } ```
", - "type": "object", - "properties": { - "device": { - "type": "string" - }, - "name": { - "type": "string" - }, - "read_only": { - "type": "boolean" - }, - "slot": { - "$ref": "#/components/schemas/Slot" - }, - "volume_construction_request": { - "$ref": "#/components/schemas/VolumeConstructionRequest" - } - }, - "required": [ - "device", - "name", - "read_only", - "slot", - "volume_construction_request" - ] - }, "DiskRuntimeState": { "description": "Runtime state of the Disk, which includes its attach state and some minimal metadata", "type": "object", @@ -3866,12 +3755,65 @@ } ] }, + "InstanceBootSettings": { + "description": "Configures how an instance is told to try to boot.", + "type": "object", + "properties": { + "order": { + "description": "Propolis should tell guest firmware to try to boot from devices in this order.", + "type": "array", + "items": { + "type": "string", + "format": "uuid" + } + } + }, + "required": [ + "order" + ] + }, "InstanceCpuCount": { "description": "The number of CPUs in an Instance", "type": "integer", "format": "uint16", "minimum": 0 }, + "InstanceDisk": { + "description": "A request to attach a disk to an instance.", + "type": "object", + "properties": { + "disk_id": { + "description": "The disk's UUID.", + "type": "string", + "format": "uuid" + }, + "name": { + "description": "The disk's name, used to generate the serial number for the virtual disk exposed to the guest.", + "type": "string" + }, + "read_only": { + "description": "True if the disk is read-only.", + "type": "boolean" + }, + "slot": { + "description": "The logical slot number assigned to the disk in its database record.", + "type": "integer", + "format": "uint8", + "minimum": 0 + }, + "vcr_json": { + "description": "A JSON representation of the Crucible volume construction request for this attachment.", + "type": "string" + } + }, + "required": [ + "disk_id", + "name", + "read_only", + "slot", + "vcr_json" + ] + }, "InstanceEnsureBody": { "description": "The body of a request to ensure that a instance and VMM are known to a sled agent.", "type": "object", @@ -3978,7 +3920,7 @@ "nullable": true, "allOf": [ { - "$ref": "#/components/schemas/BootSettings" + "$ref": "#/components/schemas/InstanceBootSettings" } ] }, @@ -3992,7 +3934,7 @@ "disks": { "type": "array", "items": { - "$ref": "#/components/schemas/DiskRequest" + "$ref": "#/components/schemas/InstanceDisk" } }, "ephemeral_ip": { @@ -4062,16 +4004,10 @@ "src_propolis_addr": { "description": "The address of the Propolis server that will serve as the migration source.", "type": "string" - }, - "src_propolis_id": { - "description": "The Propolis ID of the migration source.", - "type": "string", - "format": "uuid" } }, "required": [ - "src_propolis_addr", - "src_propolis_id" + "src_propolis_addr" ] }, "InstanceProperties": { @@ -5682,12 +5618,6 @@ "vmm_state" ] }, - "Slot": { - "description": "A stable index which is translated by Propolis into a PCI BDF, visible to the guest.\n\n
JSON schema\n\n```json { \"description\": \"A stable index which is translated by Propolis into a PCI BDF, visible to the guest.\", \"type\": \"integer\", \"format\": \"uint8\", \"minimum\": 0.0 } ```
", - "type": "integer", - "format": "uint8", - "minimum": 0 - }, "SourceNatConfig": { "description": "An IP address and port range used for source NAT, i.e., making outbound network connections from guests or services.", "type": "object", @@ -6269,151 +6199,6 @@ "format": "uint32", "minimum": 0 }, - "VolumeConstructionRequest": { - "description": "VolumeConstructionRequest\n\n
JSON schema\n\n```json { \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"block_size\", \"id\", \"sub_volumes\", \"type\" ], \"properties\": { \"block_size\": { \"type\": \"integer\", \"format\": \"uint64\", \"minimum\": 0.0 }, \"id\": { \"type\": \"string\", \"format\": \"uuid\" }, \"read_only_parent\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/VolumeConstructionRequest\" } ] }, \"sub_volumes\": { \"type\": \"array\", \"items\": { \"$ref\": \"#/components/schemas/VolumeConstructionRequest\" } }, \"type\": { \"type\": \"string\", \"enum\": [ \"volume\" ] } } }, { \"type\": \"object\", \"required\": [ \"block_size\", \"id\", \"type\", \"url\" ], \"properties\": { \"block_size\": { \"type\": \"integer\", \"format\": \"uint64\", \"minimum\": 0.0 }, \"id\": { \"type\": \"string\", \"format\": \"uuid\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"url\" ] }, \"url\": { \"type\": \"string\" } } }, { \"type\": \"object\", \"required\": [ \"block_size\", \"blocks_per_extent\", \"extent_count\", \"gen\", \"opts\", \"type\" ], \"properties\": { \"block_size\": { \"type\": \"integer\", \"format\": \"uint64\", \"minimum\": 0.0 }, \"blocks_per_extent\": { \"type\": \"integer\", \"format\": \"uint64\", \"minimum\": 0.0 }, \"extent_count\": { \"type\": \"integer\", \"format\": \"uint32\", \"minimum\": 0.0 }, \"gen\": { \"type\": \"integer\", \"format\": \"uint64\", \"minimum\": 0.0 }, \"opts\": { \"$ref\": \"#/components/schemas/CrucibleOpts\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"region\" ] } } }, { \"type\": \"object\", \"required\": [ \"block_size\", \"id\", \"path\", \"type\" ], \"properties\": { \"block_size\": { \"type\": \"integer\", \"format\": \"uint64\", \"minimum\": 0.0 }, \"id\": { \"type\": \"string\", \"format\": \"uuid\" }, \"path\": { \"type\": \"string\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"file\" ] } } } ] } ```
", - "oneOf": [ - { - "type": "object", - "properties": { - "block_size": { - "type": "integer", - "format": "uint64", - "minimum": 0 - }, - "id": { - "type": "string", - "format": "uuid" - }, - "read_only_parent": { - "nullable": true, - "allOf": [ - { - "$ref": "#/components/schemas/VolumeConstructionRequest" - } - ] - }, - "sub_volumes": { - "type": "array", - "items": { - "$ref": "#/components/schemas/VolumeConstructionRequest" - } - }, - "type": { - "type": "string", - "enum": [ - "volume" - ] - } - }, - "required": [ - "block_size", - "id", - "sub_volumes", - "type" - ] - }, - { - "type": "object", - "properties": { - "block_size": { - "type": "integer", - "format": "uint64", - "minimum": 0 - }, - "id": { - "type": "string", - "format": "uuid" - }, - "type": { - "type": "string", - "enum": [ - "url" - ] - }, - "url": { - "type": "string" - } - }, - "required": [ - "block_size", - "id", - "type", - "url" - ] - }, - { - "type": "object", - "properties": { - "block_size": { - "type": "integer", - "format": "uint64", - "minimum": 0 - }, - "blocks_per_extent": { - "type": "integer", - "format": "uint64", - "minimum": 0 - }, - "extent_count": { - "type": "integer", - "format": "uint32", - "minimum": 0 - }, - "gen": { - "type": "integer", - "format": "uint64", - "minimum": 0 - }, - "opts": { - "$ref": "#/components/schemas/CrucibleOpts" - }, - "type": { - "type": "string", - "enum": [ - "region" - ] - } - }, - "required": [ - "block_size", - "blocks_per_extent", - "extent_count", - "gen", - "opts", - "type" - ] - }, - { - "type": "object", - "properties": { - "block_size": { - "type": "integer", - "format": "uint64", - "minimum": 0 - }, - "id": { - "type": "string", - "format": "uuid" - }, - "path": { - "type": "string" - }, - "type": { - "type": "string", - "enum": [ - "file" - ] - } - }, - "required": [ - "block_size", - "id", - "path", - "type" - ] - } - ] - }, "VpcFirewallRuleAction": { "type": "string", "enum": [ diff --git a/package-manifest.toml b/package-manifest.toml index b28ac7d59f..8140ce43af 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -624,10 +624,10 @@ service_name = "propolis-server" only_for_targets.image = "standard" source.type = "prebuilt" source.repo = "propolis" -source.commit = "220a6f367c18f2452dbc4fa9086f3fe73b961739" +source.commit = "d4529fd8247386b422b78e1203315d5baea5ea8b" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/propolis/image//propolis-server.sha256.txt -source.sha256 = "964bf262677496118f8cea95c257d0a57c76ddca70733217b0666657b53bd6e6" +source.sha256 = "3e5995281e2b222fbfa3537fcc846e0706361db5ab57de6656811871bcc04cc3" output.type = "zone" [package.mg-ddm-gz] diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 21832fde23..8b327ddd9a 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -443,8 +443,8 @@ struct InstanceRunner { dhcp_config: DhcpCfg, // Disk related properties - requested_disks: Vec, - boot_settings: Option, + requested_disks: Vec, + boot_settings: Option, cloud_init_bytes: Option>, // Internal State management @@ -847,84 +847,365 @@ impl InstanceRunner { res.map(|_| ()) } - /// Sends an instance ensure request to this instance's Propolis. + /// Sends an instance ensure request to this instance's Propolis, + /// constructing its configuration from the fields in `self` that describe + /// the instance's virtual hardware configuration. 
async fn propolis_ensure_inner( &self, client: &PropolisClient, running_zone: &RunningZone, migrate: Option, ) -> Result<(), Error> { - let nics = running_zone + // A bit of history helps to explain the workings of the rest of this + // function. + // + // In the past, the Propolis API accepted an InstanceEnsureRequest + // struct that described a VM's hardware configuration at more or less + // the level of specificity used in Nexus's database. Callers passed + // this struct irrespective of whether they were starting a brand new VM + // migrating from an existing VM in some other Propolis. It was + // Propolis's job to convert any insufficiently-specific parameters + // passed in its API into the concrete details needed to set up VM + // components (e.g., converting device slot numbers into concrete PCI + // bus/device/function numbers). + // + // The Propolis VM creation API used below differs from this scheme in + // two important ways: + // + // 1. Callers are responsible for filling in all the details of a VM's + // configuration (e.g., choosing PCI BDFs for PCI devices). + // 2. During a live migration, the migration target inherits most of its + // configuration directly from the migration source. Propolis only + // allows clients to specify new configurations for the specific set + // of components that it expects to be reconfigured over a migration. + // These are described further below. + // + // This scheme aims to + // + // 1. prevent bugs where an instance can't migrate because the control + // plane has specified conflicting configurations to the source and + // target, and + // 2. maximize the updateability of VM configurations by allowing the + // control plane, which (at least in Nexus) is relatively easy to + // update, to make the rules about how Nexus instance configurations + // are realized in Propolis VMs. + // + // See propolis#804 for even more context on this API's design. + // + // A "virtual platform" is a set of rules that describe how to realize a + // Propolis VM configuration from a control plane instance description. + // The logic below encodes the "Oxide MVP" virtual platform that + // Propolis implicitly implemented in its legacy instance-ensure API. In + // the future there will be additional virtual platforms that may encode + // different rules and configuration options. + // + // TODO(#615): Eventually this logic should move to a robust virtual + // platform library in Nexus. + + use propolis_client::{ + types::{ + BlobStorageBackend, Board, BootOrderEntry, BootSettings, + Chipset, ComponentV0, CrucibleStorageBackend, + InstanceInitializationMethod, NvmeDisk, QemuPvpanic, + ReplacementComponent, SerialPort, SerialPortNumber, VirtioDisk, + VirtioNetworkBackend, VirtioNic, + }, + PciPath, SpecKey, + }; + + // Define some local helper structs for unpacking hardware descriptions + // into the types Propolis wants to see in its specifications. + struct DiskComponents { + id: Uuid, + device: NvmeDisk, + backend: CrucibleStorageBackend, + } + + struct NicComponents { + id: Uuid, + device: VirtioNic, + backend: VirtioNetworkBackend, + } + + // Assemble the list of NVMe disks associated with this instance. + let disks: Vec = self + .requested_disks + .iter() + .map(|disk| -> Result { + // One of the details Propolis asks clients to fill in is the + // serial number for each NVMe disk. 
It's important that this + // number remain stable because it can be written to an + // instance's nonvolatile EFI variables, specifically its boot + // order variables, which can be undercut if a serial number + // changes. + // + // The old version of the Propolis API generated serial numbers + // by taking the control plane disk name and padding it with + // zero bytes. Match this behavior here. + // + // Note that this scheme violates version 1.2.1 of the NVMe + // specification: section 1.5 says that string fields like this + // one should be left-justified and padded with spaces, not + // zeroes. Future versions of this logic may switch to this + // behavior. + let serial_number = + propolis_client::support::nvme_serial_from_str( + &disk.name, 0, + ); + + Ok(DiskComponents { + id: disk.disk_id, + device: NvmeDisk { + backend_id: SpecKey::Uuid(disk.disk_id), + // The old Propolis API added 16 to disk slot numbers to + // get their PCI device numbers. + pci_path: PciPath::new(0, disk.slot + 0x10, 0)?, + serial_number, + }, + backend: CrucibleStorageBackend { + readonly: disk.read_only, + request_json: disk.vcr_json.0.clone(), + }, + }) + }) + .collect::, _>>()?; + + // Next, assemble the list of guest NICs. + let nics: Vec = running_zone .opte_ports() - .map(|port| { - self.requested_nics + .map(|port| -> Result { + let nic = self + .requested_nics .iter() // We expect to match NIC slots to OPTE port slots. // Error out if we can't find a NIC for a port. - .position(|nic| nic.slot == port.slot()) + .find(|nic| nic.slot == port.slot()) .ok_or(Error::Opte( illumos_utils::opte::Error::NoNicforPort( port.name().into(), port.slot().into(), ), + ))?; + + Ok(NicComponents { + id: nic.id, + device: VirtioNic { + backend_id: SpecKey::Uuid(nic.id), + interface_id: nic.id, + // The old Propolis API added 8 to NIC slot numbers to + // get their PCI device numbers. + pci_path: PciPath::new(0, nic.slot + 8, 0)?, + }, + backend: VirtioNetworkBackend { + vnic_name: port.name().to_string(), + }, + }) + }) + .collect::, _>>()?; + + // When a VM migrates, the migration target inherits most of its + // configuration directly from the migration source. The exceptions are + // cases where the target VM needs new parameters in order to interface + // with services on its sled or in the rest of the rack. These include + // + // - Crucible disks: the target needs to connect to its downstairs + // instances with new generation numbers supplied from Nexus + // - OPTE ports: the target needs to bind its VNICs to the correct + // devices for its new host; those devices may have different names + // than their counterparts on the source host + // + // If this is a request to migrate, construct a list of source component + // specifications that this caller intends to replace. Otherwise, + // construct a complete instance specification for a new VM. + let request = if let Some(params) = migrate { + // TODO(#6073): The migration ID is sent in the VMM registration + // request and isn't part of the migration target params (despite + // being known when the migration-start request is sent). If it were + // sent here it would no longer be necessary to read the ID back + // from the saved VMM/instance state. + // + // In practice, Nexus should (by the construction of the instance + // migration saga) always initialize a migration-destination VMM + // with a valid migration ID. But since that invariant is + // technically part of a different component, return an error here + // instead of unwrapping if it's violated. 
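(Editorial aside, not part of the patch: the comment above prefers returning a descriptive error over unwrapping when the registered migration ID is missing. A tiny, self-contained sketch of that pattern with hypothetical stand-in types follows.)

```rust
// Hypothetical stand-ins for the real VMM state and error types.
struct MigrationIn {
    migration_id: u64,
}

fn require_migration_id(
    migration_in: Option<&MigrationIn>,
) -> Result<u64, String> {
    // Propagate an error rather than panicking if the invariant (a migration
    // ID supplied at registration time) does not hold.
    migration_in.map(|m| m.migration_id).ok_or_else(|| {
        "migration requested but no migration ID was supplied \
         when this VMM was registered"
            .to_string()
    })
}

fn main() {
    let registered = MigrationIn { migration_id: 7 };
    assert_eq!(require_migration_id(Some(&registered)), Ok(7));
    assert!(require_migration_id(None).is_err());
}
```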
+ let migration_id = self + .state + .migration_in() + .ok_or_else(|| { + Error::Migration(anyhow::anyhow!( + "migration requested but no migration ID was \ + supplied when this VMM was registered" )) - .map(|pos| { - let nic = &self.requested_nics[pos]; - propolis_client::types::NetworkInterfaceRequest { - interface_id: nic.id, - name: port.name().to_string(), - slot: propolis_client::types::Slot(port.slot()), - } + })? + .migration_id; + + // Add the new Crucible backends to the list of source instance spec + // elements that should be replaced in the target's spec. + let mut elements_to_replace: std::collections::HashMap<_, _> = + disks + .into_iter() + .map(|disk| { + ( + // N.B. This ID must match the one supplied when the + // instance was started. + disk.id.to_string(), + ReplacementComponent::CrucibleStorageBackend( + disk.backend, + ), + ) }) - }) - .collect::, _>>()?; - - let migrate = match migrate { - Some(params) => { - let migration_id = self.state - .migration_in() - // TODO(eliza): This is a bit of an unfortunate dance: the - // initial instance-ensure-registered request is what sends - // the migration ID, but it's the subsequent - // instance-ensure-state request (which we're handling here) - // that includes migration the source VMM's UUID and IP - // address. Because the API currently splits the migration - // IDs between the instance-ensure-registered and - // instance-ensure-state requests, we have to stash the - // migration ID in an `Option` and `expect()` it here, - // panicking if we get an instance-ensure-state request with - // a source Propolis ID if the instance wasn't registered - // with a migration in ID. - // - // This is kind of a shame. Eventually, we should consider - // reworking the API ensure-state request contains the - // migration ID, and we don't have to unwrap here. See: - // https://github.com/oxidecomputer/omicron/issues/6073 - .expect("if we have migration target params, we should also have a migration in") - .migration_id; - Some(propolis_client::types::InstanceMigrateInitiateRequest { - src_addr: params.src_propolis_addr.to_string(), - src_uuid: params.src_propolis_id, + .collect(); + + // Add new OPTE backend configuration to the replacement list. + elements_to_replace.extend(nics.into_iter().map(|nic| { + ( + // N.B. This ID must match the one supplied when the + // instance was started. + nic.id.to_string(), + ReplacementComponent::VirtioNetworkBackend(nic.backend), + ) + })); + + propolis_client::types::InstanceEnsureRequest { + properties: self.properties.clone(), + init: InstanceInitializationMethod::MigrationTarget { migration_id, - }) + replace_components: elements_to_replace, + src_addr: params.src_propolis_addr.to_string(), + }, } - None => None, - }; + } else { + // This is not a request to migrate, so construct a brand new spec + // to send to Propolis. + // + // Spec components must all have unique names. This routine ensures + // that names are unique as follows: + // + // 1. Backend components corresponding to specific control plane + // objects (e.g. Crucible disks, network interfaces) are + // identified by their control plane record IDs, which are UUIDs. + // (If these UUIDs collide, Nexus has many other problems.) + // 2. Devices attached to those backends are identified by a string + // that includes the backend UUID; see `id_to_device_name` below. + // 3. Other components that are implicitly added to all VMs are + // assigned unique component names within this function. 
+ // + // Because *Nexus object names* (which *can* alias) are never used + // directly to name spec components, there should never be a + // conflict, so this helper asserts that all keys in the component + // map are unique. + fn add_component( + spec: &mut propolis_client::types::InstanceSpecV0, + key: String, + component: ComponentV0, + ) { + assert!(spec.components.insert(key, component).is_none()); + } + + fn id_to_device_name(id: &Uuid) -> String { + format!("{id}:device") + } + + let mut spec = propolis_client::types::InstanceSpecV0 { + board: Board { + chipset: Chipset::default(), + cpus: self.vcpus, + memory_mb: self.memory_mib, + cpuid: None, + }, + components: Default::default(), + }; + + for (name, num) in [ + ("com1", SerialPortNumber::Com1), + ("com2", SerialPortNumber::Com2), + ("com3", SerialPortNumber::Com3), + ("com4", SerialPortNumber::Com4), + ] { + add_component( + &mut spec, + name.to_string(), + ComponentV0::SerialPort(SerialPort { num }), + ); + } + + for DiskComponents { id, device, backend } in disks.into_iter() { + add_component( + &mut spec, + id_to_device_name(&id), + ComponentV0::NvmeDisk(device), + ); - let request = propolis_client::types::InstanceEnsureRequest { - properties: self.properties.clone(), - vcpus: self.vcpus, - memory: self.memory_mib, - nics, - disks: self - .requested_disks - .iter() - .cloned() - .map(Into::into) - .collect(), - boot_settings: self.boot_settings.clone(), - migrate, - cloud_init_bytes: self.cloud_init_bytes.clone().map(|x| x.0), + add_component( + &mut spec, + id.to_string(), + ComponentV0::CrucibleStorageBackend(backend), + ); + } + + for NicComponents { id, device, backend } in nics.into_iter() { + add_component( + &mut spec, + id_to_device_name(&id), + ComponentV0::VirtioNic(device), + ); + add_component( + &mut spec, + id.to_string(), + ComponentV0::VirtioNetworkBackend(backend), + ); + } + + add_component( + &mut spec, + "pvpanic".to_owned(), + ComponentV0::QemuPvpanic(QemuPvpanic { enable_isa: true }), + ); + + if let Some(settings) = &self.boot_settings { + // The boot order contains a list of disk IDs. Propolis matches + // boot order entries based on device component names, so pass + // the ID through `id_to_device_name` when generating the + // Propolis boot order. + let settings = ComponentV0::BootSettings(BootSettings { + order: settings + .order + .iter() + .map(|id| BootOrderEntry { + id: SpecKey::Name(id_to_device_name(&id)), + }) + .collect(), + }); + + add_component(&mut spec, "boot_settings".to_string(), settings); + } + + if let Some(cloud_init) = &self.cloud_init_bytes { + let device_name = "cloud-init-dev"; + let backend_name = "cloud-init-backend"; + + // The old Propolis API (and sled-agent's arguments to it) + // always attached cloud-init drives at BDF 0.24.0. 
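(Editorial aside, not part of the patch: the comments in this function describe the legacy slot-to-PCI-device-number conventions: disks at 0x10 + slot, NICs at 8 + slot, and the cloud-init drive at device 0x18, i.e. BDF 0.24.0. A small sketch of that arithmetic, for reference only.)

```rust
// Illustrative only: these offsets are taken from the comments in the patch,
// not from the Propolis API itself.
fn disk_pci_device(slot: u8) -> u8 {
    // "The old Propolis API added 16 to disk slot numbers."
    0x10 + slot
}

fn nic_pci_device(slot: u8) -> u8 {
    // "The old Propolis API added 8 to NIC slot numbers."
    8 + slot
}

fn main() {
    assert_eq!(disk_pci_device(0), 16);
    assert_eq!(nic_pci_device(1), 9);
    // Cloud-init drive: bus 0, device 0x18 (decimal 24), function 0.
    assert_eq!(0x18, 24);
}
```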
+ let device = ComponentV0::VirtioDisk(VirtioDisk { + backend_id: SpecKey::Name(backend_name.to_string()), + pci_path: PciPath::new(0, 0x18, 0).unwrap(), + }); + + let backend = + ComponentV0::BlobStorageBackend(BlobStorageBackend { + base64: cloud_init.0.clone(), + readonly: true, + }); + + add_component(&mut spec, device_name.to_string(), device); + add_component(&mut spec, backend_name.to_string(), backend); + } + + propolis_client::types::InstanceEnsureRequest { + properties: self.properties.clone(), + init: InstanceInitializationMethod::Spec { spec }, + } }; debug!(self.log, "Sending ensure request to propolis: {:?}", request); diff --git a/sled-agent/src/sim/http_entrypoints_pantry.rs b/sled-agent/src/sim/http_entrypoints_pantry.rs index c98c7db665..e879cea70f 100644 --- a/sled-agent/src/sim/http_entrypoints_pantry.rs +++ b/sled-agent/src/sim/http_entrypoints_pantry.rs @@ -9,7 +9,7 @@ use dropshot::{ HttpResponseDeleted, HttpResponseOk, HttpResponseUpdatedNoContent, Path as TypedPath, RequestContext, TypedBody, }; -use propolis_client::types::VolumeConstructionRequest; +use propolis_client::VolumeConstructionRequest; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::sync::Arc; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 1f099fc036..0653b52508 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -45,7 +45,11 @@ use omicron_uuid_kinds::{ }; use oxnet::Ipv6Net; use propolis_client::{ - types::VolumeConstructionRequest, Client as PropolisClient, + types::{ + Board, Chipset, ComponentV0, InstanceInitializationMethod, + InstanceSpecV0, SerialPort, SerialPortNumber, + }, + Client as PropolisClient, VolumeConstructionRequest, }; use sled_agent_api::SupportBundleMetadata; use sled_agent_api::SupportBundleState; @@ -125,7 +129,7 @@ fn extract_targets_from_volume_construction_request( VolumeConstructionRequest::Region { opts, .. } => { for target in &opts.target { - res.push(SocketAddr::from_str(&target)?); + res.push(*target); } } @@ -301,10 +305,7 @@ impl SledAgent { // Ensure that any disks that are in this request are attached to // this instance. - let id = match disk.volume_construction_request { - VolumeConstructionRequest::Volume { id, .. 
} => id, - _ => panic!("Unexpected construction type"), - }; + let id = disk.disk_id; self.disks .sim_ensure( &id, @@ -353,15 +354,30 @@ impl SledAgent { description: "sled-agent-sim created instance".to_string(), metadata, }; + let body = propolis_client::types::InstanceEnsureRequest { properties, - memory: hardware.properties.memory.to_whole_mebibytes(), - vcpus: hardware.properties.ncpus.0 as u8, - nics: vec![], - disks: vec![], - boot_settings: None, - migrate: None, - cloud_init_bytes: None, + init: InstanceInitializationMethod::Spec { + spec: InstanceSpecV0 { + board: Board { + cpus: hardware.properties.ncpus.0 as u8, + chipset: Chipset::default(), + memory_mb: hardware + .properties + .memory + .to_whole_mebibytes(), + cpuid: None, + }, + components: [( + "com1".to_string(), + ComponentV0::SerialPort(SerialPort { + num: SerialPortNumber::Com1, + }), + )] + .into_iter() + .collect(), + }, + }, }; // Try to create the instance client.instance_ensure().body(body).send().await.map_err( @@ -397,7 +413,7 @@ impl SledAgent { .await?; for disk_request in &hardware.disks { - let vcr = &disk_request.volume_construction_request; + let vcr = serde_json::from_str(&disk_request.vcr_json.0)?; self.map_disk_ids_to_region_ids(&vcr).await?; } diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 2299ba9db9..c706c05b14 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -36,7 +36,7 @@ use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolUuid; -use propolis_client::types::VolumeConstructionRequest; +use propolis_client::VolumeConstructionRequest; use serde::Serialize; use sled_agent_api::SupportBundleMetadata; use sled_agent_api::SupportBundleState; diff --git a/sled-agent/types/src/instance.rs b/sled-agent/types/src/instance.rs index 39726030b0..3ff73925e4 100644 --- a/sled-agent/types/src/instance.rs +++ b/sled-agent/types/src/instance.rs @@ -47,6 +47,39 @@ pub struct InstanceEnsureBody { pub metadata: InstanceMetadata, } +/// A request to attach a disk to an instance. +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +pub struct InstanceDisk { + /// The disk's UUID. + pub disk_id: Uuid, + /// The logical slot number assigned to the disk in its database record. + pub slot: u8, + /// True if the disk is read-only. + pub read_only: bool, + /// A JSON representation of the Crucible volume construction request for + /// this attachment. + // + // This is marked as `NoDebug` because the VCR contains the volume's + // encryption keys. + pub vcr_json: NoDebug, + + /// The disk's name, used to generate the serial number for the virtual disk + /// exposed to the guest. + // + // TODO(#7153): Making this depend on the disk name means that a disk's ID + // may change if it is renamed or if a snapshot of it is used to create a + // new disk. + pub name: String, +} + +/// Configures how an instance is told to try to boot. +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +pub struct InstanceBootSettings { + /// Propolis should tell guest firmware to try to boot from devices in this + /// order. + pub order: Vec, +} + /// Describes the instance hardware. 
#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct InstanceHardware { @@ -59,9 +92,8 @@ pub struct InstanceHardware { pub floating_ips: Vec, pub firewall_rules: Vec, pub dhcp_config: DhcpConfig, - // TODO: replace `propolis_client::*` with locally-modeled request type - pub disks: Vec, - pub boot_settings: Option, + pub disks: Vec, + pub boot_settings: Option, pub cloud_init_bytes: Option>, } @@ -152,9 +184,6 @@ pub struct VmmUnregisterResponse { /// migration. #[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct InstanceMigrationTargetParams { - /// The Propolis ID of the migration source. - pub src_propolis_id: Uuid, - /// The address of the Propolis server that will serve as the migration /// source. pub src_propolis_addr: SocketAddr, From 591e2e9ef83f0f8fb78046069f1db8d24a899174 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 20 Dec 2024 20:54:16 -0500 Subject: [PATCH 08/11] Support diffing blueprints via Diffus (#7261) This is the second step in https://github.com/oxidecomputer/omicron/issues/7240. The next steps will use this support to replace our handrolled code for generating diffs and generate output from the diffus based diffs. --- Cargo.lock | 35 +++++++++++++++-- Cargo.toml | 3 +- clickhouse-admin/types/Cargo.toml | 3 +- clickhouse-admin/types/src/lib.rs | 3 ++ common/Cargo.toml | 1 + common/src/api/external/mod.rs | 16 ++++++++ common/src/api/internal/shared.rs | 8 +++- common/src/disk.rs | 4 ++ common/src/zpool_name.rs | 5 ++- nexus-sled-agent-shared/Cargo.toml | 1 + nexus-sled-agent-shared/src/inventory.rs | 2 + nexus/reconfigurator/planning/Cargo.toml | 1 + nexus/types/Cargo.toml | 1 + nexus/types/src/deployment.rs | 39 ++++++++++++++++--- nexus/types/src/deployment/clickhouse.rs | 5 ++- .../types/src/deployment/network_resources.rs | 4 ++ nexus/types/src/deployment/planning_input.rs | 12 +++++- nexus/types/src/deployment/zone_type.rs | 15 +++++++ nexus/types/src/external_api/views.rs | 2 + uuid-kinds/Cargo.toml | 1 + uuid-kinds/src/lib.rs | 2 + workspace-hack/Cargo.toml | 6 ++- 22 files changed, 152 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1aec1ccc89..fe8ac9704f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1437,6 +1437,7 @@ dependencies = [ "camino-tempfile", "chrono", "derive_more", + "diffus", "expectorate", "itertools 0.13.0", "omicron-common", @@ -2392,6 +2393,28 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" +[[package]] +name = "diffus" +version = "0.10.0" +source = "git+https://github.com/oxidecomputer/diffus?branch=oxide/main#f6abe39bffd875b5fb1ebabf8144da15e3effe16" +dependencies = [ + "diffus-derive", + "itertools 0.10.5", + "newtype-uuid", + "oxnet", + "uuid", +] + +[[package]] +name = "diffus-derive" +version = "0.10.0" +source = "git+https://github.com/oxidecomputer/diffus?branch=oxide/main#f6abe39bffd875b5fb1ebabf8144da15e3effe16" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "digest" version = "0.10.7" @@ -6104,6 +6127,7 @@ dependencies = [ "chrono", "clickhouse-admin-types", "debug-ignore", + "diffus", "expectorate", "gateway-client", "illumos-utils", @@ -6206,6 +6230,7 @@ dependencies = [ name = "nexus-sled-agent-shared" version = "0.1.0" dependencies = [ + "diffus", "illumos-utils", "omicron-common", "omicron-passwords", @@ -6308,6 +6333,7 @@ dependencies = [ "cookie", "derive-where", "derive_more", + "diffus", 
"dropshot 0.13.0", "futures", "gateway-client", @@ -6732,6 +6758,7 @@ dependencies = [ "camino", "camino-tempfile", "chrono", + "diffus", "dropshot 0.13.0", "expectorate", "futures", @@ -7383,6 +7410,7 @@ dependencies = [ name = "omicron-uuid-kinds" version = "0.1.0" dependencies = [ + "diffus", "newtype-uuid", "paste", "schemars", @@ -7459,6 +7487,7 @@ dependencies = [ "managed", "memchr", "mio", + "newtype-uuid", "nom", "num-bigint-dig", "num-integer", @@ -10423,9 +10452,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "itoa", "memchr", @@ -13280,7 +13309,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.48.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 8906ab4d70..535afb94b7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -369,6 +369,7 @@ derive-where = "1.2.7" # Having the i-implement-... feature here makes diesel go away from the workspace-hack diesel = { version = "2.2.4", features = ["i-implement-a-third-party-backend-and-opt-into-breaking-changes", "postgres", "r2d2", "chrono", "serde_json", "network-address", "uuid"] } diesel-dtrace = "0.4.2" +diffus = { git = "https://github.com/oxidecomputer/diffus", branch = "oxide/main", features = ["uuid-impl", "derive", "newtype-uuid-impl", "oxnet-impl"] } dns-server = { path = "dns-server" } dns-server-api = { path = "dns-server-api" } dns-service-client = { path = "clients/dns-service-client" } @@ -582,7 +583,7 @@ secrecy = "0.8.0" semver = { version = "1.0.23", features = ["std", "serde"] } serde = { version = "1.0", default-features = false, features = [ "derive", "rc" ] } serde_human_bytes = { git = "https://github.com/oxidecomputer/serde_human_bytes", branch = "main" } -serde_json = "1.0.132" +serde_json = "1.0.133" serde_path_to_error = "0.1.16" serde_tokenstream = "0.2" serde_urlencoded = "0.7.1" diff --git a/clickhouse-admin/types/Cargo.toml b/clickhouse-admin/types/Cargo.toml index f57b1c5052..c8ee984abe 100644 --- a/clickhouse-admin/types/Cargo.toml +++ b/clickhouse-admin/types/Cargo.toml @@ -14,6 +14,7 @@ camino.workspace = true camino-tempfile.workspace = true chrono.workspace = true derive_more.workspace = true +diffus.workspace = true itertools.workspace = true omicron-common.workspace = true omicron-workspace-hack.workspace = true @@ -25,4 +26,4 @@ expectorate.workspace = true [dev-dependencies] slog-async.workspace = true -slog-term.workspace = true \ No newline at end of file +slog-term.workspace = true diff --git a/clickhouse-admin/types/src/lib.rs b/clickhouse-admin/types/src/lib.rs index aa1437bedc..b4dd21652c 100644 --- a/clickhouse-admin/types/src/lib.rs +++ b/clickhouse-admin/types/src/lib.rs @@ -7,6 +7,7 @@ use atomicwrites::AtomicFile; use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use derive_more::{Add, AddAssign, Display, From}; +use diffus::Diffus; use itertools::Itertools; use omicron_common::api::external::Generation; use schemars::{ @@ -56,6 +57,7 @@ pub const OXIMETER_CLUSTER: &str = "oximeter_cluster"; JsonSchema, Serialize, Deserialize, + Diffus, )] pub struct KeeperId(pub u64); @@ -75,6 +77,7 @@ pub struct KeeperId(pub u64); JsonSchema, 
Serialize, Deserialize, + Diffus, )] pub struct ServerId(pub u64); diff --git a/common/Cargo.toml b/common/Cargo.toml index 0d122e602b..f2797ed2b4 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -20,6 +20,7 @@ async-trait.workspace = true backoff.workspace = true camino.workspace = true chrono.workspace = true +diffus.workspace = true dropshot.workspace = true futures.workspace = true hex.workspace = true diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index ab46f9f7f6..4c8f032fcb 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -18,6 +18,7 @@ use anyhow::Context; use api_identity::ObjectIdentity; use chrono::DateTime; use chrono::Utc; +use diffus::{edit, Diffable, Diffus}; use dropshot::HttpError; pub use dropshot::PaginationOrder; pub use error::*; @@ -211,6 +212,7 @@ impl<'a> TryFrom<&DataPageParams<'a, NameOrId>> for DataPageParams<'a, Uuid> { )] #[display("{0}")] #[serde(try_from = "String")] +#[derive(Diffus)] pub struct Name(String); /// `Name::try_from(String)` is the primary method for constructing an Name @@ -614,6 +616,7 @@ impl JsonSchema for RoleName { Eq, PartialOrd, Ord, + Diffus, )] pub struct ByteCount(u64); @@ -748,6 +751,7 @@ impl From for i64 { PartialEq, PartialOrd, Serialize, + Diffus, )] pub struct Generation(u64); @@ -1937,6 +1941,17 @@ impl JsonSchema for L4PortRange { )] pub struct MacAddr(pub macaddr::MacAddr6); +impl<'a> Diffable<'a> for MacAddr { + type Diff = (&'a Self, &'a Self); + fn diff(&'a self, other: &'a Self) -> edit::Edit<'a, Self> { + if self == other { + edit::Edit::Copy(self) + } else { + edit::Edit::Change((self, other)) + } + } +} + impl MacAddr { // Guest MAC addresses begin with the Oxide OUI A8:40:25. Further, guest // address are constrained to be in the virtual address range @@ -2100,6 +2115,7 @@ impl JsonSchema for MacAddr { Deserialize, Serialize, JsonSchema, + Diffus, )] pub struct Vni(u32); diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs index a3f28a258a..588117c71c 100644 --- a/common/src/api/internal/shared.rs +++ b/common/src/api/internal/shared.rs @@ -8,6 +8,7 @@ use crate::{ address::NUM_SOURCE_NAT_PORTS, api::external::{self, BfdMode, ImportExportPolicy, Name, Vni}, }; +use diffus::Diffus; use oxnet::{IpNet, Ipv4Net, Ipv6Net}; use schemars::JsonSchema; use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; @@ -35,6 +36,7 @@ use super::nexus::HostIdentifier; Serialize, JsonSchema, Hash, + Diffus, )] #[serde(tag = "type", rename_all = "snake_case")] pub enum NetworkInterfaceKind { @@ -58,6 +60,7 @@ pub enum NetworkInterfaceKind { PartialOrd, Ord, Hash, + Diffus, )] pub struct NetworkInterface { pub id: Uuid, @@ -88,6 +91,7 @@ pub struct NetworkInterface { PartialOrd, Ord, Hash, + Diffus, )] pub struct SourceNatConfig { /// The external address provided to the instance or service. @@ -891,7 +895,9 @@ pub struct ExternalIpGatewayMap { } /// Describes the purpose of the dataset. 
-#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash, EnumCount)] +#[derive( + Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash, EnumCount, Diffus, +)] #[cfg_attr(feature = "testing", derive(test_strategy::Arbitrary))] pub enum DatasetKind { // Durable datasets for zones diff --git a/common/src/disk.rs b/common/src/disk.rs index 99c2b2db7b..df6efe3196 100644 --- a/common/src/disk.rs +++ b/common/src/disk.rs @@ -6,6 +6,7 @@ use anyhow::bail; use camino::{Utf8Path, Utf8PathBuf}; +use diffus::Diffus; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::ZpoolUuid; @@ -176,6 +177,7 @@ impl DatasetName { Hash, PartialOrd, Ord, + Diffus, )] pub struct GzipLevel(u8); @@ -218,6 +220,7 @@ impl FromStr for GzipLevel { Hash, PartialOrd, Ord, + Diffus, )] #[serde(tag = "type", rename_all = "snake_case")] pub enum CompressionAlgorithm { @@ -412,6 +415,7 @@ impl DatasetsManagementResult { Serialize, Deserialize, JsonSchema, + Diffus, )] pub struct DiskIdentity { pub vendor: String, diff --git a/common/src/zpool_name.rs b/common/src/zpool_name.rs index df5ca8ea31..9852b46120 100644 --- a/common/src/zpool_name.rs +++ b/common/src/zpool_name.rs @@ -5,6 +5,7 @@ //! Zpool labels and kinds shared between Nexus and Sled Agents use camino::{Utf8Path, Utf8PathBuf}; +use diffus::Diffus; use omicron_uuid_kinds::ZpoolUuid; use schemars::JsonSchema; use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -15,7 +16,7 @@ pub const ZPOOL_INTERNAL_PREFIX: &str = "oxi_"; /// Describes the different classes of Zpools. #[derive( - Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, JsonSchema, + Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, JsonSchema, Diffus, )] #[serde(rename_all = "snake_case")] pub enum ZpoolKind { @@ -29,7 +30,7 @@ pub enum ZpoolKind { /// /// This expects that the format will be: `ox{i,p}_` - we parse the prefix /// when reading the structure, and validate that the UUID can be utilized. 
-#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Diffus)] pub struct ZpoolName { id: ZpoolUuid, kind: ZpoolKind, diff --git a/nexus-sled-agent-shared/Cargo.toml b/nexus-sled-agent-shared/Cargo.toml index 504cd92c37..ea83cb67ae 100644 --- a/nexus-sled-agent-shared/Cargo.toml +++ b/nexus-sled-agent-shared/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" workspace = true [dependencies] +diffus.workspace = true illumos-utils.workspace = true omicron-common.workspace = true omicron-passwords.workspace = true diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 3b9daf583e..b2b1001f9b 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -6,6 +6,7 @@ use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; +use diffus::Diffus; use omicron_common::{ api::{ external::{ByteCount, Generation}, @@ -183,6 +184,7 @@ impl OmicronZoneConfig { PartialOrd, Ord, Hash, + Diffus, )] pub struct OmicronZoneDataset { pub pool_name: ZpoolName, diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml index 6656693a90..c7d978b387 100644 --- a/nexus/reconfigurator/planning/Cargo.toml +++ b/nexus/reconfigurator/planning/Cargo.toml @@ -11,6 +11,7 @@ anyhow.workspace = true clickhouse-admin-types.workspace = true chrono.workspace = true debug-ignore.workspace = true +diffus.workspace = true gateway-client.workspace = true illumos-utils.workspace = true indexmap.workspace = true diff --git a/nexus/types/Cargo.toml b/nexus/types/Cargo.toml index 8990b0b83b..edf22679ef 100644 --- a/nexus/types/Cargo.toml +++ b/nexus/types/Cargo.toml @@ -17,6 +17,7 @@ clickhouse-admin-types.workspace = true cookie.workspace = true derive-where.workspace = true derive_more.workspace = true +diffus.workspace = true dropshot.workspace = true futures.workspace = true http.workspace = true diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index afe906e781..8915ac8746 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -18,6 +18,7 @@ use crate::inventory::Collection; pub use crate::inventory::SourceNatConfig; pub use crate::inventory::ZpoolName; use blueprint_diff::ClickhouseClusterConfigDiffTablesForSingleBlueprint; +use diffus::Diffus; use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use nexus_sled_agent_shared::inventory::OmicronZonesConfig; use nexus_sled_agent_shared::inventory::ZoneKind; @@ -137,7 +138,9 @@ pub use blueprint_diff::BlueprintDiff; // zones deployed on each host and some supporting configuration (e.g., DNS). // This is aimed at supporting add/remove sleds. The plan is to grow this to // include more of the system as we support more use cases. -#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +#[derive( + Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize, Diffus, +)] pub struct Blueprint { /// unique identifier for this blueprint pub id: Uuid, @@ -187,6 +190,7 @@ pub struct Blueprint { pub clickhouse_cluster_config: Option, /// when this blueprint was generated (for debugging) + #[diffus(ignore)] pub time_created: chrono::DateTime, /// identity of the component that generated the blueprint (for debugging) /// This would generally be the Uuid of a Nexus instance. @@ -538,7 +542,9 @@ impl<'a> fmt::Display for BlueprintDisplay<'a> { /// per-zone [`BlueprintZoneDisposition`]. /// /// Part of [`Blueprint`]. 
-#[derive(Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +#[derive( + Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, Diffus, +)] pub struct BlueprintZonesConfig { /// Generation number of this configuration. /// @@ -642,6 +648,7 @@ fn zone_sort_key(z: &T) -> impl Ord { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct BlueprintZoneConfig { /// The disposition (desired state) of this zone recorded in the blueprint. @@ -653,6 +660,12 @@ pub struct BlueprintZoneConfig { pub zone_type: BlueprintZoneType, } +impl diffus::Same for BlueprintZoneConfig { + fn same(&self, other: &Self) -> bool { + self == other + } +} + impl BlueprintZoneConfig { /// Returns the underlay IP address associated with this zone. /// @@ -704,6 +717,7 @@ impl From for OmicronZoneConfig { Deserialize, Serialize, EnumIter, + Diffus, )] #[serde(rename_all = "snake_case")] pub enum BlueprintZoneDisposition { @@ -847,6 +861,7 @@ pub enum BlueprintDatasetFilter { Deserialize, Serialize, EnumIter, + Diffus, )] #[serde(rename_all = "snake_case")] pub enum BlueprintPhysicalDiskDisposition { @@ -878,7 +893,9 @@ impl BlueprintPhysicalDiskDisposition { } /// Information about an Omicron physical disk as recorded in a bluerprint. -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)] +#[derive( + Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Diffus, +)] pub struct BlueprintPhysicalDiskConfig { pub disposition: BlueprintPhysicalDiskDisposition, pub identity: DiskIdentity, @@ -889,12 +906,20 @@ pub struct BlueprintPhysicalDiskConfig { /// Information about Omicron physical disks as recorded in a blueprint. /// /// Part of [`Blueprint`]. -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)] +#[derive( + Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Diffus, +)] pub struct BlueprintPhysicalDisksConfig { pub generation: Generation, pub disks: Vec, } +impl diffus::Same for BlueprintPhysicalDiskConfig { + fn same(&self, other: &Self) -> bool { + self == other + } +} + // Required by RSS impl Default for BlueprintPhysicalDisksConfig { fn default() -> Self { @@ -929,7 +954,9 @@ impl From for OmicronPhysicalDisksConfig { } /// Information about Omicron datasets as recorded in a blueprint. -#[derive(Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +#[derive( + Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, Diffus, +)] pub struct BlueprintDatasetsConfig { pub generation: Generation, pub datasets: BTreeMap, @@ -964,6 +991,7 @@ impl From for DatasetsConfig { Deserialize, Serialize, EnumIter, + Diffus, )] #[serde(rename_all = "snake_case")] pub enum BlueprintDatasetDisposition { @@ -1002,6 +1030,7 @@ impl BlueprintDatasetDisposition { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct BlueprintDatasetConfig { // TODO: Display this in diffs - leave for now, for backwards compat diff --git a/nexus/types/src/deployment/clickhouse.rs b/nexus/types/src/deployment/clickhouse.rs index d0612e2ddf..f8443c6afe 100644 --- a/nexus/types/src/deployment/clickhouse.rs +++ b/nexus/types/src/deployment/clickhouse.rs @@ -5,6 +5,7 @@ //! 
Types used in blueprints related to clickhouse configuration use clickhouse_admin_types::{KeeperId, ServerId}; +use diffus::Diffus; use omicron_common::api::external::Generation; use omicron_uuid_kinds::OmicronZoneUuid; use schemars::JsonSchema; @@ -12,7 +13,9 @@ use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; /// Global configuration for all clickhouse servers (replicas) and keepers -#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +#[derive( + Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize, Diffus, +)] pub struct ClickhouseClusterConfig { /// The last update to the clickhouse cluster configuration /// diff --git a/nexus/types/src/deployment/network_resources.rs b/nexus/types/src/deployment/network_resources.rs index f11d739d03..b08e5a2582 100644 --- a/nexus/types/src/deployment/network_resources.rs +++ b/nexus/types/src/deployment/network_resources.rs @@ -5,6 +5,7 @@ use super::tri_map::TriMap; use super::tri_map::TriMapEntry; use anyhow::anyhow; +use diffus::Diffus; use omicron_common::api::external::MacAddr; use omicron_common::api::internal::shared::SourceNatConfig; use omicron_uuid_kinds::ExternalIpUuid; @@ -221,6 +222,7 @@ pub enum OmicronZoneExternalIpKey { JsonSchema, Serialize, Deserialize, + Diffus, )] pub struct OmicronZoneExternalFloatingIp { pub id: ExternalIpUuid, @@ -239,6 +241,7 @@ pub struct OmicronZoneExternalFloatingIp { JsonSchema, Serialize, Deserialize, + Diffus, )] pub struct OmicronZoneExternalFloatingAddr { pub id: ExternalIpUuid, @@ -268,6 +271,7 @@ impl OmicronZoneExternalFloatingAddr { JsonSchema, Serialize, Deserialize, + Diffus, )] pub struct OmicronZoneExternalSnatIp { pub id: ExternalIpUuid, diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index 0dda916509..1876440d47 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -17,6 +17,7 @@ use crate::external_api::views::SledState; use chrono::DateTime; use chrono::Utc; use clap::ValueEnum; +use diffus::Diffus; use ipnetwork::IpNetwork; use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; @@ -376,6 +377,7 @@ impl CockroachDbSettings { Deserialize, Serialize, JsonSchema, + Diffus, )] pub enum CockroachDbClusterVersion { #[display("22.1")] @@ -408,7 +410,15 @@ impl CockroachDbClusterVersion { /// Whether to set `cluster.preserve_downgrade_option` and what to set it to. #[derive( - Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize, JsonSchema, + Clone, + Copy, + Debug, + Eq, + PartialEq, + Deserialize, + Serialize, + JsonSchema, + Diffus, )] #[serde(tag = "action", content = "data", rename_all = "snake_case")] pub enum CockroachDbPreserveDowngrade { diff --git a/nexus/types/src/deployment/zone_type.rs b/nexus/types/src/deployment/zone_type.rs index ffb4bd5a17..c1ecd18158 100644 --- a/nexus/types/src/deployment/zone_type.rs +++ b/nexus/types/src/deployment/zone_type.rs @@ -9,6 +9,7 @@ //! that is not needed by sled-agent. 
use super::OmicronZoneExternalIp; +use diffus::Diffus; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; use nexus_sled_agent_shared::inventory::OmicronZoneType; use nexus_sled_agent_shared::inventory::ZoneKind; @@ -31,6 +32,7 @@ use std::net::SocketAddrV6; JsonSchema, Deserialize, Serialize, + Diffus, )] #[serde(tag = "type", rename_all = "snake_case")] pub enum BlueprintZoneType { @@ -335,6 +337,7 @@ pub mod blueprint_zone_type { use crate::deployment::OmicronZoneExternalFloatingAddr; use crate::deployment::OmicronZoneExternalFloatingIp; use crate::deployment::OmicronZoneExternalSnatIp; + use diffus::Diffus; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; use omicron_common::api::internal::shared::NetworkInterface; use schemars::JsonSchema; @@ -354,6 +357,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct BoundaryNtp { pub address: SocketAddrV6, @@ -376,6 +380,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct Clickhouse { pub address: SocketAddrV6, @@ -392,6 +397,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct ClickhouseKeeper { pub address: SocketAddrV6, @@ -409,6 +415,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct ClickhouseServer { pub address: SocketAddrV6, @@ -425,6 +432,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct CockroachDb { pub address: SocketAddrV6, @@ -441,6 +449,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct Crucible { pub address: SocketAddrV6, @@ -457,6 +466,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct CruciblePantry { pub address: SocketAddrV6, @@ -472,6 +482,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct ExternalDns { pub dataset: OmicronZoneDataset, @@ -493,6 +504,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct InternalDns { pub dataset: OmicronZoneDataset, @@ -521,6 +533,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct InternalNtp { pub address: SocketAddrV6, @@ -536,6 +549,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct Nexus { /// The address at which the internal nexus server is reachable. @@ -560,6 +574,7 @@ pub mod blueprint_zone_type { JsonSchema, Deserialize, Serialize, + Diffus, )] pub struct Oximeter { pub address: SocketAddrV6, diff --git a/nexus/types/src/external_api/views.rs b/nexus/types/src/external_api/views.rs index 0fd45c0666..94b2279906 100644 --- a/nexus/types/src/external_api/views.rs +++ b/nexus/types/src/external_api/views.rs @@ -11,6 +11,7 @@ use crate::identity::AssetIdentityMetadata; use api_identity::ObjectIdentity; use chrono::DateTime; use chrono::Utc; +use diffus::Diffus; use omicron_common::api::external::{ AllowedSourceIps as ExternalAllowedSourceIps, ByteCount, Digest, Error, IdentityMetadata, InstanceState, Name, ObjectIdentity, RoleName, @@ -704,6 +705,7 @@ impl fmt::Display for SledPolicy { PartialEq, Eq, EnumIter, + Diffus, )] #[serde(rename_all = "snake_case")] pub enum SledState { diff --git a/uuid-kinds/Cargo.toml b/uuid-kinds/Cargo.toml index 9ea2f8223c..0a80fab9c9 100644 --- a/uuid-kinds/Cargo.toml +++ b/uuid-kinds/Cargo.toml @@ -12,6 +12,7 @@ workspace = true # within omicron. 
[dependencies] +diffus.workspace = true newtype-uuid.workspace = true schemars = { workspace = true, optional = true } paste.workspace = true diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 42c50379ce..e4f65d7039 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -14,6 +14,7 @@ pub use newtype_uuid::{ GenericUuid, ParseError, TagError, TypedUuid, TypedUuidKind, TypedUuidTag, }; +use diffus::Diffus; #[cfg(feature = "schemars08")] use schemars::JsonSchema; @@ -22,6 +23,7 @@ macro_rules! impl_typed_uuid_kind { $( paste::paste! { #[cfg_attr(feature = "schemars08", derive(JsonSchema))] + #[derive(Diffus)] pub enum [< $kind Kind>] {} impl TypedUuidKind for [< $kind Kind >] { diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 678170b25e..9b5de946f9 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -77,6 +77,7 @@ libc = { version = "0.2.162", features = ["extra_traits"] } log = { version = "0.4.22", default-features = false, features = ["kv_unstable", "std"] } managed = { version = "0.8.0", default-features = false, features = ["alloc", "map"] } memchr = { version = "2.7.4" } +newtype-uuid = { version = "1.1.3" } nom = { version = "7.1.3" } num-bigint-dig = { version = "0.8.4", default-features = false, features = ["i128", "prime", "serde", "u64_digit", "zeroize"] } num-integer = { version = "0.1.46", features = ["i128"] } @@ -107,7 +108,7 @@ schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } serde = { version = "1.0.215", features = ["alloc", "derive", "rc"] } -serde_json = { version = "1.0.132", features = ["raw_value", "unbounded_depth"] } +serde_json = { version = "1.0.133", features = ["raw_value", "unbounded_depth"] } sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.6.0", features = ["bytes", "inline", "unicode"] } @@ -197,6 +198,7 @@ libc = { version = "0.2.162", features = ["extra_traits"] } log = { version = "0.4.22", default-features = false, features = ["kv_unstable", "std"] } managed = { version = "0.8.0", default-features = false, features = ["alloc", "map"] } memchr = { version = "2.7.4" } +newtype-uuid = { version = "1.1.3" } nom = { version = "7.1.3" } num-bigint-dig = { version = "0.8.4", default-features = false, features = ["i128", "prime", "serde", "u64_digit", "zeroize"] } num-integer = { version = "0.1.46", features = ["i128"] } @@ -227,7 +229,7 @@ schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } serde = { version = "1.0.215", features = ["alloc", "derive", "rc"] } -serde_json = { version = "1.0.132", features = ["raw_value", "unbounded_depth"] } +serde_json = { version = "1.0.133", features = ["raw_value", "unbounded_depth"] } sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.6.0", features = ["bytes", "inline", "unicode"] } From ef65ae6ae9bc9e8fca68ff2b3cc69765f0e75b6b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 20 Dec 2024 18:32:40 -0800 Subject: [PATCH 09/11] [tests] make instance reincarnation tests less racy (#7295) --- nexus/tests/integration_tests/instances.rs | 90 +++++++++++++++++++--- 1 file changed, 79 insertions(+), 11 deletions(-) diff --git a/nexus/tests/integration_tests/instances.rs 
b/nexus/tests/integration_tests/instances.rs index 163869896f..8ca2f9a396 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1374,15 +1374,20 @@ async fn test_instance_failed_when_on_expunged_sled( // The restarted instance should now transition back to `Running`, on its // new sled. - instance_wait_for_vmm_registration(cptestctx, &instance2_id).await; - instance_simulate(nexus, &instance2_id).await; - instance_wait_for_state(client, instance2_id, InstanceState::Running).await; + instance_wait_for_simulated_transition( + &cptestctx, + &instance2_id, + InstanceState::Running, + ) + .await; // The auto-restartable instance should be...restarted automatically. - - instance_wait_for_vmm_registration(cptestctx, &instance3_id).await; - instance_simulate(nexus, &instance3_id).await; - instance_wait_for_state(client, instance3_id, InstanceState::Running).await; + instance_wait_for_simulated_transition( + &cptestctx, + &instance3_id, + InstanceState::Running, + ) + .await; } // Verifies that the instance-watcher background task transitions an instance @@ -1393,7 +1398,6 @@ async fn test_instance_failed_by_instance_watcher_automatically_reincarnates( cptestctx: &ControlPlaneTestContext, ) { let client = &cptestctx.external_client; - let nexus = &cptestctx.server.server_context().nexus; let instance_id = dbg!( make_forgotten_instance( &cptestctx, @@ -1430,10 +1434,13 @@ async fn test_instance_failed_by_instance_watcher_automatically_reincarnates( // it. dbg!(instance_wait_for_vmm_registration(cptestctx, &instance_id).await); // Now, we can actually poke the instance. - dbg!(instance_simulate(nexus, &instance_id).await); dbg!( - instance_wait_for_state(client, instance_id, InstanceState::Running) - .await + instance_wait_for_simulated_transition( + &cptestctx, + &instance_id, + InstanceState::Running + ) + .await ); } @@ -6672,6 +6679,67 @@ pub async fn instance_simulate_with_opctx( sled_info.sled_client.vmm_finish_transition(sled_info.propolis_id).await; } +/// Wait for an instance to complete a simulated state transition, repeatedly +/// poking the simulated sled-agent until the transition occurs. +/// +/// This can be used to avoid races between Nexus processes (like sagas) which +/// trigger a state transition but cannot be easily awaited by the test, and the +/// actual request to simulate the state transition. However, it should be used +/// cautiously to avoid simulating multiple state transitions accidentally. +async fn instance_wait_for_simulated_transition( + cptestctx: &ControlPlaneTestContext, + id: &InstanceUuid, + state: InstanceState, +) -> Instance { + const MAX_WAIT: Duration = Duration::from_secs(120); + let client = &cptestctx.external_client; + slog::info!( + &client.client_log, + "waiting for instance {id} transition to {state} \ + (and poking simulated sled-agent)..."; + ); + let url = format!("/v1/instances/{id}"); + let result = wait_for_condition( + || async { + let instance: Instance = NexusRequest::object_get(&client, &url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await? 
+ .parsed_body()?; + if instance.runtime.run_state == state { + Ok(instance) + } else { + slog::info!( + &client.client_log, + "instance {id} has not transitioned to {state}, \ + poking sled-agent"; + "instance_id" => %instance.identity.id, + "instance_runtime_state" => ?instance.runtime, + ); + instance_simulate(&cptestctx.server.server_context().nexus, id) + .await; + Err(CondCheckError::::NotYet) + } + }, + &Duration::from_secs(1), + &MAX_WAIT, + ) + .await; + match result { + Ok(instance) => { + slog::info!( + &client.client_log, + "instance {id} has transitioned to {state}" + ); + instance + } + Err(e) => panic!( + "instance {id} did not transition to {state:?} \ + after {MAX_WAIT:?}: {e}" + ), + } +} + /// Simulates state transitions for the incarnation of the instance on the /// supplied sled (which may not be the sled ID currently stored in the /// instance's CRDB record). From c1408170715de64017c2b5908501d7e84e19ac92 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 20 Dec 2024 20:04:44 -0800 Subject: [PATCH 10/11] document a pattern for making Sled-Agent-owned config to be Nexus-owned instead (#7279) --- docs/control-plane-architecture.adoc | 2 + docs/reconfigurator.adoc | 163 +++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) diff --git a/docs/control-plane-architecture.adoc b/docs/control-plane-architecture.adoc index 7d050e392b..e6b449fb7d 100644 --- a/docs/control-plane-architecture.adoc +++ b/docs/control-plane-architecture.adoc @@ -236,6 +236,8 @@ It is also important to note why the previous strategy works well and is largely Now we get to what has been the hairiest of the problems for data compatibility across versions. As we add more features, and make our system more consistent in its promise that Nexus manages state for the control plane instead of sled-agent, we have realized that Nexus sometimes doesn't have enough information to take over this responsibility. In such cases when performing https://github.com/oxidecomputer/omicron/blob/5b865b74208ce0a11b8aec1bca12e2a6ea538bb6/sled-agent/src/sim/server.rs#L254[RSS handoff to Nexus], we have had to add new state to the handoff message so that Nexus can create a blueprint to drive the rest of the system to its desired state via https://github.com/oxidecomputer/omicron/blob/main/docs/reconfigurator.adoc[Reconfigurator]. However, this only works for new rack deployments when we actually run RSS. For existing deployments that have already gone through initial rack setup, the new Nexus code does not have enough information to proceed with running reconfigurator. In this case we must **backfill** that information. This can, and has, been done a variety of ways. We sometimes may have to add new data to CRDB, and sometimes modify a schema and backfill columns. Othertimes, we may need to retrieve important data from sled-agent and store it in existing placeholders in blueprints. In any event, doing this is tricky and influences how legible the code is to read, how testable it is, and how correct it is under all circumstances. It's for this reason that we proposed the rules for data compatibility in the prior section, which largely align with how we do ledger updates. +For more on this, see the xref:reconfigurator.adoc#_incorporating_existing_configuration_into_reconfigurator[Reconfigurator docs on this subject]. + == Cold start "Cold start" refers to starting the control plane from a rack that's completely powered off. 
Achieving this requires careful consideration of where configuration is stored and how configuration changes flow through the system.
diff --git a/docs/reconfigurator.adoc b/docs/reconfigurator.adoc
index d7aa929677..0aa7ce78cc 100644
--- a/docs/reconfigurator.adoc
+++ b/docs/reconfigurator.adoc
@@ -179,6 +179,169 @@ We're being cautious about rolling out that kind of automation. Instead, today, To get to the long term vision where the system is doing all this on its own in response to operator input, we'll need to get confidence that continually executing the planner will have no ill effects on working systems. This might involve more operational experience with it, more safeties, and tools for pausing execution, previewing what it _would_ do, etc.
+== Design patterns
+
+=== Incorporating existing configuration into Reconfigurator
+
+Something we've done several times now is taking some existing piece of configuration that was managed outside the control plane (i.e., not known to Nexus or CockroachDB) and bringing it under the ownership of the control plane. Examples:
+
+* Control plane zones: when we initially built the system, RSS deployed control plane zones to sleds and Nexus/CockroachDB was largely unaware of them. Nexus/CockroachDB did know about them, but did not have enough information to reconstruct the configuration on each sled. But of course the control plane _needs_ to be able to manage these components for upgrade, fault management, scale-out, etc. Migrating to a system that can do these things required that the control plane learn what zones were deployed already, where, and with what configuration.
+* ZFS datasets: in releases prior to R12, Sled Agent automatically created ZFS datasets for zones that were requested by the control plane. Concretely, `PUT /omicron-zones` would create ZFS datasets for zones that needed a persistent dataset. Work is ongoing to make this more explicit so that the control plane manages datasets separately and then specifies with each zone which dataset it should use. Migrating to this approach on deployed systems requires that the control plane learn what datasets exist in the first place and what zones they're associated with.
+* In the medium term, for online upgrade, we will be incorporating an image id or artifact id into the Omicron zone configuration. Currently, the artifact id is implied: sled agent uses whatever artifact was delivered by the last MUPdate. For online upgrade, the control plane will need to be able to specify a particular artifact.
+
+In all of these cases:
+
+* There's a piece of configuration managed outside of CockroachDB/Nexus (e.g., by Sled Agent, RSS, and/or MUPdate).
+* We want to transition to a world where the configuration is owned by CockroachDB/Nexus.
+* We need to bootstrap the initial control-plane-managed copy based on what's currently deployed.
+
+The general pattern here is:
+
+* Use the inventory system to collect the current configuration.
+* Incorporate that configuration into the next blueprint generated by Reconfigurator.
+* Incorporate the configuration from the blueprint into the next request to Sled Agent.
+
+```mermaid
+sequenceDiagram
+    participant SledAgent as Sled Agent
+    participant Nexus
+
+    Note over SledAgent: owns some piece<br/>of configuration
+    SledAgent ->> Nexus: reports current configuration<br/>via inventory
+    Note over Nexus: incorporates latest inventory<br/>into next blueprint
+    Note over Nexus: now owns the configuration
+
+    loop
+        Note over Nexus: changes configuration as needed
+        Nexus ->> SledAgent: sends new configuration<br/>
+    end
+```
+
+Below is a proposed pattern for doing this over two releases. We'll call the new piece of config `my_config`, represented with type `MyConfig`. This could be arbitrarily complicated. In our examples above, this could be a list of zones and their detailed configurations (IP addresses, ports, and all the properties they need to start), a list of ZFS dataset structs, a `dataset_id` property on an existing struct, an `artifact_id` property on an existing struct, etc. It may hang directly off the Sled Agent `Inventory` or it might be embedded in some existing struct.
+
+NOTE: This is a work in progress. We hope this closely enough matches what we've done in the past that it should work. We should update this documentation as we discover better ways to do this.
+
+CAUTION: This isn't the _only_ way to do it. However, many other ways to do it come with non-obvious problems. When we diverge from this, we should first try to understand why this procedure looks the way it does.
+
+**In the first release** (we'll call it "release 1"): the configuration is totally managed in Sled Agent and unknown to Nexus and CockroachDB.
+
+**In the next release** (we'll call it "release 2"):
+
+. Add `my_config: MyConfig` to the appropriate spot in Sled Agent inventory.
+** In the inventory API that Sled Agent exposes, this field can be non-optional. In this use case, it's assumed that Sled Agent can know what the current value is. That is, the code in this release must be aware that this value, which may previously have been hardcoded or even absent altogether, is now a variable to be reported in inventory (and eventually controlled by Nexus -- see below).
+** In the Nexus inventory structures and database inventory structures, the field still needs to be optional (`my_config: Option<MyConfig>` or equivalent) because Nexus generally needs to be able to read inventory structures written by the previous release.
+. Add `my_config: Option<MyConfig>` to the blueprint structures (both in-memory and in the database). This field has to be optional so that when updating to this release, the system can still read the current target blueprint (that was written in the previous release that didn't have this field).
+. In the Reconfigurator planner, when generating a blueprint based on a parent blueprint where `my_config` is `None`, fill in `my_config` (using a `Some` value) based on the contents in inventory.
+. Add `my_config` to the Sled Agent request that will be used by Reconfigurator to _configure_ this on each sled.
+** If a request already exists (e.g., if this will be part of `OmicronZoneConfig` that already gets sent by Reconfigurator to Sled Agent, as in the case of ZFS dataset id or artifact id): the new field should be optional: `my_config: Option<MyConfig>`. This is required for the system to be able to execute the last target blueprint that was written in the _previous_ release. This is typically also necessary because it's usually the same struct that Sled Agent records persistently. See the next item.
+** If no request already exists for this purpose, then you'll be adding a whole new one (e.g., when we added a new `PUT /datasets`). The body of this request will generally be type `MyConfig` (_not_ optional). During execution, Reconfigurator can avoid making this request altogether if the blueprint does not specify it.
+. Add `my_config` to the Sled Agent ledger that will store this information persistently. _This will almost always be the same as the previous step_. The structure that Sled Agent stores is generally the same one it accepts from Nexus.
++
+This explains another reason why `my_config` should be optional in this structure: Sled Agent _must_ be able to read ledgers written by a previous release and those won't have this field.
+
+**During the upgrade to this release:**
+
+. Wait for at least one inventory cycle to complete successfully and verify that it contains the expected `my_config` field.
+. Generate a new blueprint, make it the current target, and ensure that it executes successfully. It should make no actual changes to the system, but it will propagate the current values for `my_config` to the blueprint system and to sled ledgers.
+. Verify that:
+** the new blueprint has `my_config` filled in
+** all Sled Agent ledgers have `my_config` filled in (value `Some`)
+
+**In the next release** (we'll call it "release 3"): all the optional fields can be made non-optional:
+
+* Blueprints' in-memory structure can go from `my_config: Option<MyConfig>` to `my_config: MyConfig`.
+* Nexus's in-memory structure for inventory can go from `my_config: Option<MyConfig>` to `my_config: MyConfig`.
+* Blueprints' and inventory collections' database representations can go from NULL-able columns to non-NULL-able ones, though only if we can populate the value or drop old blueprints and collections. More work is needed here (see below).
+* The Sled Agent API input types and ledgers that refer to `my_config` can go from `my_config: Option<MyConfig>` to `my_config: MyConfig`. No on-disk changes are needed for this.
+
+**During the upgrade to the next release**: Blueprints and inventory collections that do not have `my_config` set will need to be deleted from the database prior to the upgrade. See https://github.com/oxidecomputer/omicron/issues/7278[omicron#7278] for more on operationalizing this.
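To make the release-2 procedure above concrete, here is a minimal Rust sketch of the planner rule it describes: keep the value once a parent blueprint already carries it, and otherwise adopt what the sled reported in inventory. `MyConfig`, `Inventory`, `Blueprint`, and `plan_my_config` are illustrative stand-ins under those assumptions, not the actual omicron types.

```rust
// Illustrative stand-ins -- not the real omicron/Reconfigurator types.
#[derive(Clone, Debug, PartialEq)]
struct MyConfig {
    // Arbitrary sled-owned configuration.
    value: String,
}

// In release 2, Sled Agent always reports the current value in inventory.
struct Inventory {
    my_config: MyConfig,
}

// Blueprints written by release 1 won't have the field yet, so it stays
// optional in release 2.
struct Blueprint {
    my_config: Option<MyConfig>,
}

/// Planner rule: once a parent blueprint carries the value, keep it (or change
/// it deliberately); otherwise adopt the value the sled reported in inventory.
fn plan_my_config(parent: &Blueprint, inventory: &Inventory) -> MyConfig {
    match &parent.my_config {
        Some(existing) => existing.clone(),
        None => inventory.my_config.clone(),
    }
}

fn main() {
    let inventory =
        Inventory { my_config: MyConfig { value: "from-sled".into() } };
    let parent = Blueprint { my_config: None };
    // The first blueprint generated after the upgrade adopts the sled-reported value.
    assert_eq!(plan_my_config(&parent, &inventory), inventory.my_config);
}
```

Release 3 then drops the `Option` wrapper (and this fallback) once no parent blueprint can still lack the field.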
+ +Visually: + +```mermaid +flowchart TD + subgraph R1 [Release 1] + Initial["**Config owned by Sled Agent**"] + end + + subgraph R2 [Release 2] + Inventory["Sled Agent: reports current config in inventory"] + Blueprint["Reconfigurator Planner: incorporates latest inventory into blueprint"] + SledAgent["Reconfigurator Executor: sends blueprint config (unchanged) as configuration to Sled Agent"] + Handoff["**Config owned by Nexus**"] + Change21["Nexus wants to change the config"] + Change22["Reconfigurator Planner: uses new value in blueprint"] + Change23["Reconfigurator Executor: sends new value as new configuration to Sled Agent"] + + Inventory --> Blueprint + Blueprint --> SledAgent + SledAgent --> Handoff + Handoff --> Change21 + Change21 --> Change22 + Change22 --> Change23 + Change23 --> Change21 + end + + subgraph R3 [Release 3] + Owned["**Config owned by Nexus**"] + Cleanup["**Blueprint field, Sled Agent field are now required**"] + Change31["Nexus wants to change the config"] + Change32["Reconfigurator Planner: uses new value in blueprint"] + Change33["Reconfigurator Executor: sends new value as new configuration to Sled Agent"] + Owned --> Cleanup + Cleanup --> Change31 + Change31 --> Change32 + Change32 --> Change33 + Change33 --> Change31 + end + + R1 --> R2 + R2 --> R3 +``` + +During release 1 and during release 2 _before_ Sled Agent has reported the configuration in inventory, things look like this: + +```mermaid +sequenceDiagram + box Nexus + participant Planner as Reconfigurator Planner + participant Executor as Reconfigurator Executor + end + participant SledAgent as Sled Agent + participant Database + + + loop while config is not part of inventory + Database ->> Planner: load latest inventory: config NOT present + Planner ->> Executor: generate blueprint:
config NOT present + Executor ->> SledAgent: write config:
config NOT present + Note over SledAgent: missing config
treated as
"no change" + end +``` + +Shortly after the system comes up in release 2, Sled Agent starts reporting the config in inventory. After that point, things look like this: + +```mermaid +sequenceDiagram + box Nexus + participant Planner as Reconfigurator Planner + participant Executor as Reconfigurator Executor + end + participant SledAgent as Sled Agent + participant Database + + loop + SledAgent ->> Database: report config
in inventory + end + + loop + Database ->> Planner: load latest inventory: config IS present + Planner ->> Executor: generate blueprint:
config IS present + Executor ->> SledAgent: write config:
config IS present + Note over SledAgent: config is present
and honored + end +``` + [bibliography] == References From 0afbd6e95d98106eabceca60c6eb1d54f82127fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Mon, 6 Jan 2025 16:47:07 +1300 Subject: [PATCH 11/11] [sled-agent] Use zone-network-setup service after underlay is up (#7260) This commit removes the need to zlogin into the switch zone to set the underlay IP address and new static address after the underlay is up. Instead it adds the new values to the SMF properties and refreshes the zone-network-setup service. This service is safe to rerun as all it's commands ensure things are there, but does not throw an error if they are. Closes: https://github.com/oxidecomputer/omicron/issues/6157 --- illumos-utils/src/smf_helper.rs | 32 ++++++++++ sled-agent/src/services.rs | 91 +++++++++++++++-------------- smf/zone-network-setup/manifest.xml | 6 +- 3 files changed, 85 insertions(+), 44 deletions(-) diff --git a/illumos-utils/src/smf_helper.rs b/illumos-utils/src/smf_helper.rs index 2d29376950..9bf782ce1e 100644 --- a/illumos-utils/src/smf_helper.rs +++ b/illumos-utils/src/smf_helper.rs @@ -165,6 +165,38 @@ impl<'t> SmfHelper<'t> { Ok(()) } + pub fn delpropvalue(&self, prop: P, val: V) -> Result<(), Error> + where + P: ToString, + V: ToString, + { + match self + .running_zone + .run_cmd(&[ + SVCCFG, + "-s", + &self.smf_name, + "delpropvalue", + &prop.to_string(), + &val.to_string(), + ]) + .map_err(|err| Error::ZoneCommand { + intent: format!("del {} smf property value", prop.to_string()), + err, + }) { + Ok(_) => (), + Err(e) => { + // If a property already doesn't exist we don't need to + // return an error + if !e.to_string().contains("No such property") { + return Err(e); + } + } + }; + + Ok(()) + } + pub fn delpropvalue_default_instance( &self, prop: P, diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 7c5fb44d13..7536824da3 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -54,7 +54,6 @@ use illumos_utils::running_zone::{ }; use illumos_utils::smf_helper::SmfHelper; use illumos_utils::zfs::ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT; -use illumos_utils::zone::AddressRequest; use illumos_utils::zpool::{PathInPool, ZpoolName}; use illumos_utils::{execute, PFEXEC}; use internal_dns_resolver::Resolver; @@ -542,6 +541,18 @@ struct SwitchZoneConfigLocal { root: Utf8PathBuf, } +/// Service that sets up common networking across zones +struct ZoneNetworkSetupService {} + +impl illumos_utils::smf_helper::Service for ZoneNetworkSetupService { + fn service_name(&self) -> String { + "zone-network-setup".to_string() + } + fn smf_name(&self) -> String { + format!("svc:/oxide/{}", self.service_name()) + } +} + /// Describes either an Omicron-managed zone or the switch zone, used for /// functions that operate on either one or the other enum ZoneArgs<'a> { @@ -4287,59 +4298,53 @@ impl ServiceManager { ); *request = new_request; + // Add SMF properties here and restart zone-network-setup service let first_address = request.addresses.get(0); let address = first_address .map(|addr| addr.to_string()) .unwrap_or_else(|| "".to_string()); - for addr in &request.addresses { - if *addr == Ipv6Addr::LOCALHOST { - continue; - } - info!( - self.inner.log, - "Ensuring address {} exists", - addr.to_string() - ); - let addr_request = - AddressRequest::new_static(IpAddr::V6(*addr), None); - zone.ensure_address(addr_request).await?; - info!( - self.inner.log, - "Ensuring address {} exists - OK", - addr.to_string() - ); - } + // Set new properties for the network set 
up service and refresh + let nw_setup_svc = ZoneNetworkSetupService {}; + let nsmfh = SmfHelper::new(&zone, &nw_setup_svc); + + nsmfh.delpropvalue("config/gateway", "*")?; + nsmfh.delpropvalue("config/static_addr", "*")?; - // When the request addresses have changed this means the underlay is - // available now as well. if let Some(info) = self.inner.sled_info.get() { - info!( + nsmfh.addpropvalue_type( + "config/gateway", + &info.underlay_address, + "astring", + )?; + } else { + // It should be impossible for the `sled_info` not to be set here. + // When the request addresses have changed this means the underlay is + // available as well. + error!( self.inner.log, - "Ensuring there is a default route"; - "gateway" => ?info.underlay_address, + concat!( + "sled agent info is not present,", + " even though underlay address exists" + ) ); - match zone.add_default_route(info.underlay_address).map_err( - |err| Error::ZoneCommand { - intent: "Adding Route".to_string(), - err, - }, - ) { - Ok(_) => (), - Err(e) => { - if e.to_string().contains("entry exists") { - info!( - self.inner.log, - "Default route already exists"; - "gateway" => ?info.underlay_address, - ) - } else { - return Err(e); - } - } - }; } + for address in &request.addresses { + if *address != Ipv6Addr::LOCALHOST { + nsmfh.addpropvalue_type( + "config/static_addr", + &address, + "astring", + )?; + } + } + nsmfh.refresh()?; + info!( + self.inner.log, + "refreshed zone-network-setup service with new configuration" + ); + for service in &request.services { let smfh = SmfHelper::new(&zone, service); diff --git a/smf/zone-network-setup/manifest.xml b/smf/zone-network-setup/manifest.xml index e7dc1e496b..c78a7256e5 100644 --- a/smf/zone-network-setup/manifest.xml +++ b/smf/zone-network-setup/manifest.xml @@ -12,7 +12,7 @@ - + @@ -31,6 +31,10 @@ timeout_seconds='0' /> + +
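The "safe to rerun" claim in the last commit message boils down to one pattern: clearing SMF property values tolerates values that are already gone, and the current values are then re-added before the service is refreshed. Below is a rough, standalone sketch of that idempotent clear step; the `svccfg` path, the error-message check, and running it outside a zone are assumptions for illustration, not a substitute for the `SmfHelper` changes in the diff.

```rust
use std::process::Command;

// Sketch of a tolerant "delete property values" step: a value that is already
// absent is treated as success, so the whole clear/re-add/refresh sequence can
// be repeated without failing.
fn delete_prop_values(
    smf_name: &str,
    prop: &str,
    pattern: &str,
) -> Result<(), String> {
    let output = Command::new("/usr/sbin/svccfg")
        .args(["-s", smf_name, "delpropvalue", prop, pattern])
        .output()
        .map_err(|e| format!("failed to run svccfg: {e}"))?;

    if output.status.success() {
        return Ok(());
    }
    let stderr = String::from_utf8_lossy(&output.stderr);
    // Deleting a property value that does not exist is not an error here;
    // that is what makes the refresh path safe to rerun.
    if stderr.contains("No such property") {
        Ok(())
    } else {
        Err(stderr.into_owned())
    }
}

fn main() -> Result<(), String> {
    // Hypothetical usage mirroring the services.rs change: clear any stale
    // values before re-adding the current gateway and static addresses.
    delete_prop_values("svc:/oxide/zone-network-setup", "config/gateway", "*")?;
    delete_prop_values("svc:/oxide/zone-network-setup", "config/static_addr", "*")?;
    Ok(())
}
```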