From 7cf688c6346c88aba989dd65a5186648d868971d Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Tue, 12 Nov 2024 15:11:18 -0500
Subject: [PATCH] Implement record based Crucible reference counting (#6805)

Crucible volumes are created by layering read-write regions over a
hierarchy of read-only resources. Originally only a region snapshot
could be used as a read-only resource for a volume. With the
introduction of read-only regions (created during the region snapshot
replacement process) this is no longer true!

Read-only resources can be used by many volumes, and because of this
they need a reference count so they can be deleted when they are no
longer referenced. The region_snapshot table uses a `volume_references`
column, which counts how many uses there are. The region table does not
have this column, and moreover a simple integer works for reference
counting but does not tell you _what_ volume each use comes from. That
can be determined (see omdb's validate volume references command), but
it is information that is otherwise tossed out, even though Nexus knows
which volumes use which resources! Instead, record what read-only
resources a volume uses in a new table.

As part of the schema change to add the new `volume_resource_usage`
table, a migration is included that will create the appropriate records
for all existing region snapshots.

In testing, a few bugs were found, the worst being that read-only
regions did not have their read_only column set to true. This would be
a problem if read-only regions had been created, but they are currently
only created during region snapshot replacement. To detect whether any
such regions were created, find all regions that were allocated for a
snapshot volume:

    SELECT id FROM region WHERE volume_id IN (SELECT volume_id FROM snapshot);

A similar bug was found in the simulated Crucible agent.

This commit also reverts #6728, re-enabling region snapshot
replacement: it had been disabled due to the lack of read-only region
reference counting, which this commit adds.
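As an illustration only (this query is not part of the patch): with the
new table, "which volumes reference this read-only resource?" becomes a
direct lookup instead of an opaque counter. The column and enum values
are those defined by the new `volume_resource_usage` table in this
change; the angle-bracketed values are placeholders:

    SELECT volume_id FROM volume_resource_usage
        WHERE usage_type = 'region_snapshot'
          AND region_snapshot_dataset_id = '<dataset id>'
          AND region_snapshot_region_id = '<region id>'
          AND region_snapshot_snapshot_id = '<snapshot id>';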
--- dev-tools/omdb/src/bin/omdb/db.rs | 12 +- nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/region.rs | 8 + nexus/db-model/src/schema.rs | 18 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/volume_resource_usage.rs | 142 + nexus/db-queries/src/db/datastore/mod.rs | 4 +- nexus/db-queries/src/db/datastore/region.rs | 8 +- .../src/db/datastore/region_snapshot.rs | 17 +- nexus/db-queries/src/db/datastore/snapshot.rs | 19 + nexus/db-queries/src/db/datastore/volume.rs | 3635 ++++++++++++----- nexus/db-queries/src/db/queries/mod.rs | 1 - .../src/db/queries/region_allocation.rs | 5 +- nexus/db-queries/src/db/queries/volume.rs | 114 - .../output/region_allocate_distinct_sleds.sql | 21 +- .../output/region_allocate_random_sleds.sql | 21 +- ..._allocate_with_snapshot_distinct_sleds.sql | 21 +- ...on_allocate_with_snapshot_random_sleds.sql | 21 +- nexus/src/app/background/init.rs | 32 +- .../region_snapshot_replacement_finish.rs | 12 +- ...on_snapshot_replacement_garbage_collect.rs | 23 +- .../region_snapshot_replacement_start.rs | 15 +- .../tasks/region_snapshot_replacement_step.rs | 44 +- nexus/src/app/sagas/disk_create.rs | 29 + .../region_snapshot_replacement_start.rs | 57 +- nexus/src/app/sagas/volume_delete.rs | 45 +- nexus/test-utils/src/background.rs | 2 +- nexus/tests/integration_tests/disks.rs | 77 - .../integration_tests/volume_management.rs | 2891 ++++++++++++- .../crdb/crucible-ref-count-records/up01.sql | 1 + .../crdb/crucible-ref-count-records/up02.sql | 3 + .../crdb/crucible-ref-count-records/up03.sql | 4 + .../crdb/crucible-ref-count-records/up04.sql | 37 + .../crdb/crucible-ref-count-records/up05.sql | 3 + .../crdb/crucible-ref-count-records/up06.sql | 3 + .../crdb/crucible-ref-count-records/up07.sql | 8 + .../crdb/crucible-ref-count-records/up08.sql | 31 + .../crdb/crucible-ref-count-records/up09.sql | 2 + .../crdb/crucible-ref-count-records/up10.sql | 1 + .../crdb/crucible-ref-count-records/up11.sql | 1 + schema/crdb/dbinit.sql | 87 +- sled-agent/src/sim/storage.rs | 2 +- 42 files changed, 5963 insertions(+), 1519 deletions(-) create mode 100644 nexus/db-model/src/volume_resource_usage.rs delete mode 100644 nexus/db-queries/src/db/queries/volume.rs create mode 100644 schema/crdb/crucible-ref-count-records/up01.sql create mode 100644 schema/crdb/crucible-ref-count-records/up02.sql create mode 100644 schema/crdb/crucible-ref-count-records/up03.sql create mode 100644 schema/crdb/crucible-ref-count-records/up04.sql create mode 100644 schema/crdb/crucible-ref-count-records/up05.sql create mode 100644 schema/crdb/crucible-ref-count-records/up06.sql create mode 100644 schema/crdb/crucible-ref-count-records/up07.sql create mode 100644 schema/crdb/crucible-ref-count-records/up08.sql create mode 100644 schema/crdb/crucible-ref-count-records/up09.sql create mode 100644 schema/crdb/crucible-ref-count-records/up10.sql create mode 100644 schema/crdb/crucible-ref-count-records/up11.sql diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index fc9bb9cc2f..3ceba3fc25 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -2565,26 +2565,22 @@ async fn cmd_db_region_find_deleted( struct Row { dataset_id: Uuid, region_id: Uuid, - region_snapshot_id: String, - volume_id: Uuid, + volume_id: String, } let rows: Vec = datasets_regions_volumes .into_iter() .map(|row| { - let (dataset, region, region_snapshot, volume) = row; + let (dataset, region, volume) = row; Row { dataset_id: dataset.id(), region_id: region.id(), - 
region_snapshot_id: if let Some(region_snapshot) = - region_snapshot - { - region_snapshot.snapshot_id.to_string() + volume_id: if let Some(volume) = volume { + volume.id().to_string() } else { String::from("") }, - volume_id: volume.id(), } }) .collect(); diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 9137b0b3c3..66723e0b18 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -109,6 +109,7 @@ mod vmm; mod vni; mod volume; mod volume_repair; +mod volume_resource_usage; mod vpc; mod vpc_firewall_rule; mod vpc_route; @@ -219,6 +220,7 @@ pub use vmm_state::*; pub use vni::*; pub use volume::*; pub use volume_repair::*; +pub use volume_resource_usage::*; pub use vpc::*; pub use vpc_firewall_rule::*; pub use vpc_route::*; diff --git a/nexus/db-model/src/region.rs b/nexus/db-model/src/region.rs index e38c5543ef..02c7db8120 100644 --- a/nexus/db-model/src/region.rs +++ b/nexus/db-model/src/region.rs @@ -46,6 +46,10 @@ pub struct Region { // A region may be read-only read_only: bool, + + // Shared read-only regions require a "deleting" flag to avoid a + // use-after-free scenario + deleting: bool, } impl Region { @@ -67,6 +71,7 @@ impl Region { extent_count: extent_count as i64, port: Some(port.into()), read_only, + deleting: false, } } @@ -99,4 +104,7 @@ impl Region { pub fn read_only(&self) -> bool { self.read_only } + pub fn deleting(&self) -> bool { + self.deleting + } } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 8a97e43639..6ca7171793 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1075,6 +1075,8 @@ table! { port -> Nullable, read_only -> Bool, + + deleting -> Bool, } } @@ -2087,3 +2089,19 @@ joinable!(instance_ssh_key -> instance (instance_id)); allow_tables_to_appear_in_same_query!(sled, sled_instance); joinable!(network_interface -> probe (parent_id)); + +table! { + volume_resource_usage (usage_id) { + usage_id -> Uuid, + + volume_id -> Uuid, + + usage_type -> crate::VolumeResourceUsageTypeEnum, + + region_id -> Nullable, + + region_snapshot_dataset_id -> Nullable, + region_snapshot_region_id -> Nullable, + region_snapshot_snapshot_id -> Nullable, + } +} diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 14eba1cb1c..70450a7776 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(113, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(114, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(114, "crucible-ref-count-records"), KnownVersion::new(113, "add-tx-eq"), KnownVersion::new(112, "blueprint-dataset"), KnownVersion::new(111, "drop-omicron-zone-underlay-address"), diff --git a/nexus/db-model/src/volume_resource_usage.rs b/nexus/db-model/src/volume_resource_usage.rs new file mode 100644 index 0000000000..c55b053c62 --- /dev/null +++ b/nexus/db-model/src/volume_resource_usage.rs @@ -0,0 +1,142 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use crate::schema::volume_resource_usage; +use uuid::Uuid; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel( + postgres_type(name = "volume_resource_usage_type", schema = "public") + )] + pub struct VolumeResourceUsageTypeEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq, Eq, Hash)] + #[diesel(sql_type = VolumeResourceUsageTypeEnum)] + pub enum VolumeResourceUsageType; + + ReadOnlyRegion => b"read_only_region" + RegionSnapshot => b"region_snapshot" +); + +/// Crucible volumes are created by layering read-write regions over a hierarchy +/// of read-only resources. Originally only a region snapshot could be used as a +/// read-only resource for a volume. With the introduction of read-only regions +/// (created during the region snapshot replacement process) this is no longer +/// true. +/// +/// Read-only resources can be used by many volumes, and because of this they +/// need to have a reference count so they can be deleted when they're not +/// referenced anymore. The region_snapshot table used a `volume_references` +/// column, which counts how many uses there are. The region table does not have +/// this column, and moreover a simple integer works for reference counting but +/// does not tell you _what_ volume that use is from. This can be determined +/// (see omdb's validate volume references command) but it's information that is +/// tossed out, as Nexus knows what volumes use what resources! Instead of +/// throwing away that knowledge and only incrementing and decrementing an +/// integer, record what read-only resources a volume uses in this table. +/// +/// Note: users should not use this object directly, and instead use the +/// [`VolumeResourceUsage`] enum, which is type-safe and will convert to and +/// from a [`VolumeResourceUsageRecord`] when interacting with the DB. 
+#[derive( + Queryable, Insertable, Debug, Clone, Selectable, PartialEq, Eq, Hash, +)] +#[diesel(table_name = volume_resource_usage)] +pub struct VolumeResourceUsageRecord { + pub usage_id: Uuid, + + pub volume_id: Uuid, + + pub usage_type: VolumeResourceUsageType, + + pub region_id: Option, + + pub region_snapshot_dataset_id: Option, + pub region_snapshot_region_id: Option, + pub region_snapshot_snapshot_id: Option, +} + +#[derive(Debug, Clone)] +pub enum VolumeResourceUsage { + ReadOnlyRegion { region_id: Uuid }, + + RegionSnapshot { dataset_id: Uuid, region_id: Uuid, snapshot_id: Uuid }, +} + +impl VolumeResourceUsageRecord { + pub fn new(volume_id: Uuid, usage: VolumeResourceUsage) -> Self { + match usage { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + VolumeResourceUsageRecord { + usage_id: Uuid::new_v4(), + volume_id, + usage_type: VolumeResourceUsageType::ReadOnlyRegion, + + region_id: Some(region_id), + + region_snapshot_dataset_id: None, + region_snapshot_region_id: None, + region_snapshot_snapshot_id: None, + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => VolumeResourceUsageRecord { + usage_id: Uuid::new_v4(), + volume_id, + usage_type: VolumeResourceUsageType::RegionSnapshot, + + region_id: None, + + region_snapshot_dataset_id: Some(dataset_id), + region_snapshot_region_id: Some(region_id), + region_snapshot_snapshot_id: Some(snapshot_id), + }, + } + } +} + +impl TryFrom for VolumeResourceUsage { + type Error = String; + + fn try_from( + record: VolumeResourceUsageRecord, + ) -> Result { + match record.usage_type { + VolumeResourceUsageType::ReadOnlyRegion => { + let Some(region_id) = record.region_id else { + return Err("valid read-only region usage record".into()); + }; + + Ok(VolumeResourceUsage::ReadOnlyRegion { region_id }) + } + + VolumeResourceUsageType::RegionSnapshot => { + let Some(dataset_id) = record.region_snapshot_dataset_id else { + return Err("valid region snapshot usage record".into()); + }; + + let Some(region_id) = record.region_snapshot_region_id else { + return Err("valid region snapshot usage record".into()); + }; + + let Some(snapshot_id) = record.region_snapshot_snapshot_id + else { + return Err("valid region snapshot usage record".into()); + }; + + Ok(VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }) + } + } + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 5344ffc1d9..059d43b8c7 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -811,7 +811,7 @@ mod test { } #[derive(Debug)] - struct TestDatasets { + pub(crate) struct TestDatasets { // eligible and ineligible aren't currently used, but are probably handy // for the future. 
#[allow(dead_code)] @@ -828,7 +828,7 @@ mod test { type SledToDatasetMap = HashMap>; impl TestDatasets { - async fn create( + pub(crate) async fn create( opctx: &OpContext, datastore: Arc, num_eligible_sleds: usize, diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index df733af5b5..da32494d6f 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -264,7 +264,7 @@ impl DataStore { block_size, blocks_per_extent, extent_count, - read_only: false, + read_only: maybe_snapshot_id.is_some(), }, allocation_strategy, num_regions_required, @@ -362,6 +362,12 @@ impl DataStore { ) .execute_async(&conn).await?; } + + // Whenever a region is hard-deleted, validate invariants + // for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(()) } }) diff --git a/nexus/db-queries/src/db/datastore/region_snapshot.rs b/nexus/db-queries/src/db/datastore/region_snapshot.rs index 242560a415..4a5db14bd6 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot.rs @@ -64,14 +64,25 @@ impl DataStore { ) -> DeleteResult { use db::schema::region_snapshot::dsl; - diesel::delete(dsl::region_snapshot) + let conn = self.pool_connection_unauthorized().await?; + + let result = diesel::delete(dsl::region_snapshot) .filter(dsl::dataset_id.eq(dataset_id)) .filter(dsl::region_id.eq(region_id)) .filter(dsl::snapshot_id.eq(snapshot_id)) - .execute_async(&*self.pool_connection_unauthorized().await?) + .execute_async(&*conn) .await .map(|_rows_deleted| ()) - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)); + + // Whenever a region snapshot is hard-deleted, validate invariants for + // all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + result } /// Find region snapshots on expunged disks diff --git a/nexus/db-queries/src/db/datastore/snapshot.rs b/nexus/db-queries/src/db/datastore/snapshot.rs index 9d4900e2a4..b9f6b589d7 100644 --- a/nexus/db-queries/src/db/datastore/snapshot.rs +++ b/nexus/db-queries/src/db/datastore/snapshot.rs @@ -322,4 +322,23 @@ impl DataStore { .optional() .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + /// Get a snapshot, returning None if it does not exist (instead of a + /// NotFound error). 
+ pub async fn snapshot_get( + &self, + opctx: &OpContext, + snapshot_id: Uuid, + ) -> LookupResult> { + let conn = self.pool_connection_authorized(opctx).await?; + + use db::schema::snapshot::dsl; + dsl::snapshot + .filter(dsl::id.eq(snapshot_id)) + .select(Snapshot::as_select()) + .first_async(&*conn) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } } diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 88b40fcf64..4cd83fff3f 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -7,6 +7,8 @@ use super::DataStore; use crate::db; use crate::db::datastore::OpContext; +use crate::db::datastore::RunnableQuery; +use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD; use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; @@ -22,11 +24,14 @@ use crate::db::model::UpstairsRepairNotification; use crate::db::model::UpstairsRepairNotificationType; use crate::db::model::UpstairsRepairProgress; use crate::db::model::Volume; +use crate::db::model::VolumeResourceUsage; +use crate::db::model::VolumeResourceUsageRecord; +use crate::db::model::VolumeResourceUsageType; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; -use crate::db::queries::volume::*; use crate::db::DbConnection; use crate::transaction_retry::OptionalError; +use anyhow::anyhow; use anyhow::bail; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -53,6 +58,7 @@ use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; use std::collections::VecDeque; +use std::net::AddrParseError; use std::net::SocketAddr; use std::net::SocketAddrV6; use uuid::Uuid; @@ -91,19 +97,252 @@ enum VolumeGetError { InvalidVolume(String), } +#[derive(Debug, thiserror::Error)] +enum VolumeCreationError { + #[error("Error from Volume creation: {0}")] + Public(Error), + + #[error("Serde error during Volume creation: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Address parsing error during Volume creation: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(SocketAddrV6), +} + +enum RegionType { + ReadWrite, + ReadOnly, +} + +#[derive(Debug, thiserror::Error)] +enum RemoveRopError { + #[error("Error removing read-only parent: {0}")] + Public(Error), + + #[error("Serde error removing read-only parent: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Updated {0} database rows, expected {1}")] + UnexpectedDatabaseUpdate(usize, usize), + + #[error("Address parsing error during ROP removal: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), +} + +#[derive(Debug, thiserror::Error)] +enum ReplaceRegionError { + #[error("Error from Volume region replacement: {0}")] + Public(Error), + + #[error("Serde error during Volume region replacement: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Region replacement error: {0}")] + RegionReplacementError(#[from] anyhow::Error), +} + +#[derive(Debug, thiserror::Error)] +enum ReplaceSnapshotError { + #[error("Error from Volume snapshot replacement: {0}")] + Public(Error), + + #[error("Serde error during Volume snapshot replacement: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Snapshot replacement error: {0}")] + 
SnapshotReplacementError(#[from] anyhow::Error), + + #[error("Replaced {0} targets, expected {1}")] + UnexpectedReplacedTargets(usize, usize), + + #[error("Updated {0} database rows, expected {1}")] + UnexpectedDatabaseUpdate(usize, usize), + + #[error( + "Address parsing error during Volume snapshot \ + replacement: {0}" + )] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), + + #[error("Multiple volume resource usage records for {0}")] + MultipleResourceUsageRecords(String), +} + impl DataStore { - pub async fn volume_create(&self, volume: Volume) -> CreateResult { + async fn volume_create_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume: Volume, + crucible_targets: CrucibleTargets, + ) -> Result { use db::schema::volume::dsl; - #[derive(Debug, thiserror::Error)] - enum VolumeCreationError { - #[error("Error from Volume creation: {0}")] - Public(Error), + let maybe_volume: Option = dsl::volume + .filter(dsl::id.eq(volume.id())) + .select(Volume::as_select()) + .first_async(conn) + .await + .optional()?; + + // If the volume existed already, return it and do not increase usage + // counts. + if let Some(volume) = maybe_volume { + return Ok(volume); + } + + let volume: Volume = diesel::insert_into(dsl::volume) + .values(volume.clone()) + .returning(Volume::as_returning()) + .get_result_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeCreationError::Public(public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Volume, + volume.id().to_string().as_str(), + ), + )) + }) + })?; + + // Increase the usage count for the read-only Crucible resources + use db::schema::volume_resource_usage::dsl as ru_dsl; + + for read_only_target in crucible_targets.read_only_targets { + let read_only_target = read_only_target.parse().map_err(|e| { + err.bail(VolumeCreationError::AddressParseError(e)) + })?; + + let maybe_usage = Self::read_only_target_to_volume_resource_usage( + conn, + &read_only_target, + ) + .await?; + + match maybe_usage { + Some(usage) => { + diesel::insert_into(ru_dsl::volume_resource_usage) + .values(VolumeResourceUsageRecord::new( + volume.id(), + usage, + )) + .execute_async(conn) + .await?; + } + + None => { + // Something went wrong, bail out - we can't create this + // Volume if we can't record its resource usage correctly + return Err(err.bail( + VolumeCreationError::CouldNotFindResource( + read_only_target, + ), + )); + } + } + } + + // After volume creation, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + + Ok(volume) + } + + /// Return a region by matching against the dataset's address and region's + /// port + async fn target_to_region( + conn: &async_bb8_diesel::Connection, + target: &SocketAddrV6, + region_type: RegionType, + ) -> Result, diesel::result::Error> { + let ip: db::model::Ipv6Addr = target.ip().into(); + + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::region::dsl as region_dsl; + + let read_only = match region_type { + RegionType::ReadWrite => false, + RegionType::ReadOnly => true, + }; + + dataset_dsl::dataset + .inner_join( + region_dsl::region + .on(region_dsl::dataset_id.eq(dataset_dsl::id)), + ) + .filter(dataset_dsl::ip.eq(ip)) + .filter( + region_dsl::port + .eq(Some::(target.port().into())), + ) + .filter(region_dsl::read_only.eq(read_only)) + .filter(region_dsl::deleting.eq(false)) + 
.select(Region::as_select()) + .get_result_async::(conn) + .await + .optional() + } + + async fn read_only_target_to_volume_resource_usage( + conn: &async_bb8_diesel::Connection, + read_only_target: &SocketAddrV6, + ) -> Result, diesel::result::Error> { + // Easy case: it's a region snapshot, and we can match by the snapshot + // address directly + + let maybe_region_snapshot = { + use db::schema::region_snapshot::dsl; + dsl::region_snapshot + .filter(dsl::snapshot_addr.eq(read_only_target.to_string())) + .filter(dsl::deleting.eq(false)) + .select(RegionSnapshot::as_select()) + .get_result_async::(conn) + .await + .optional()? + }; + + if let Some(region_snapshot) = maybe_region_snapshot { + return Ok(Some(VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + })); + } + + // Less easy case: it's a read-only region, and we have to match by + // dataset ip and region port + + let maybe_region = Self::target_to_region( + conn, + read_only_target, + RegionType::ReadOnly, + ) + .await?; - #[error("Serde error during Volume creation: {0}")] - SerdeError(#[from] serde_json::Error), + if let Some(region) = maybe_region { + return Ok(Some(VolumeResourceUsage::ReadOnlyRegion { + region_id: region.id(), + })); } + // If the resource was hard-deleted, or in the off chance that the + // region didn't have an assigned port, return None here. + Ok(None) + } + + pub async fn volume_create(&self, volume: Volume) -> CreateResult { // Grab all the targets that the volume construction request references. // Do this outside the transaction, as the data inside volume doesn't // change and this would simply add to the transaction time. @@ -132,67 +371,13 @@ impl DataStore { let crucible_targets = crucible_targets.clone(); let volume = volume.clone(); async move { - let maybe_volume: Option = dsl::volume - .filter(dsl::id.eq(volume.id())) - .select(Volume::as_select()) - .first_async(&conn) - .await - .optional()?; - - // If the volume existed already, return it and do not increase - // usage counts. - if let Some(volume) = maybe_volume { - return Ok(volume); - } - - // TODO do we need on_conflict do_nothing here? if the transaction - // model is read-committed, the SELECT above could return nothing, - // and the INSERT here could still result in a conflict. - // - // See also https://github.com/oxidecomputer/omicron/issues/1168 - let volume: Volume = diesel::insert_into(dsl::volume) - .values(volume.clone()) - .on_conflict(dsl::id) - .do_nothing() - .returning(Volume::as_returning()) - .get_result_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeCreationError::Public( - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Volume, - volume.id().to_string().as_str(), - ), - ), - ) - }) - })?; - - // Increase the usage count for Crucible resources according to the - // contents of the volume. - - // Increase the number of uses for each referenced region snapshot. 
- use db::schema::region_snapshot::dsl as rs_dsl; - for read_only_target in &crucible_targets.read_only_targets - { - diesel::update(rs_dsl::region_snapshot) - .filter( - rs_dsl::snapshot_addr - .eq(read_only_target.clone()), - ) - .filter(rs_dsl::deleting.eq(false)) - .set( - rs_dsl::volume_references - .eq(rs_dsl::volume_references + 1), - ) - .execute_async(&conn) - .await?; - } - - Ok(volume) + Self::volume_create_in_txn( + &conn, + err, + volume, + crucible_targets, + ) + .await } }) .await @@ -200,10 +385,22 @@ impl DataStore { if let Some(err) = err.take() { match err { VolumeCreationError::Public(err) => err, + VolumeCreationError::SerdeError(err) => { Error::internal_error(&format!( - "Transaction error: {}", - err + "SerdeError error: {err}" + )) + } + + VolumeCreationError::CouldNotFindResource(s) => { + Error::internal_error(&format!( + "CouldNotFindResource error: {s}" + )) + } + + VolumeCreationError::AddressParseError(err) => { + Error::internal_error(&format!( + "AddressParseError error: {err}" )) } } @@ -213,18 +410,27 @@ impl DataStore { }) } - /// Return a `Option` based on id, even if it's soft deleted. - pub async fn volume_get( - &self, + async fn volume_get_impl( + conn: &async_bb8_diesel::Connection, volume_id: Uuid, - ) -> LookupResult> { + ) -> Result, diesel::result::Error> { use db::schema::volume::dsl; dsl::volume .filter(dsl::id.eq(volume_id)) .select(Volume::as_select()) - .first_async::(&*self.pool_connection_unauthorized().await?) + .first_async::(conn) .await .optional() + } + + /// Return a `Option` based on id, even if it's soft deleted. + pub async fn volume_get( + &self, + volume_id: Uuid, + ) -> LookupResult> { + let conn = self.pool_connection_unauthorized().await?; + Self::volume_get_impl(&conn, volume_id) + .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -241,6 +447,105 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + fn volume_usage_records_for_resource_query( + resource: VolumeResourceUsage, + ) -> impl RunnableQuery { + use db::schema::volume_resource_usage::dsl; + + match resource { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + dsl::volume_resource_usage + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(dsl::region_id.eq(region_id)) + .into_boxed() + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => dsl::volume_resource_usage + .filter( + dsl::usage_type.eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_dataset_id.eq(dataset_id)) + .filter(dsl::region_snapshot_region_id.eq(region_id)) + .filter(dsl::region_snapshot_snapshot_id.eq(snapshot_id)) + .into_boxed(), + } + } + + /// For a given VolumeResourceUsage, return all found usage records for it. 
+ pub async fn volume_usage_records_for_resource( + &self, + resource: VolumeResourceUsage, + ) -> ListResultVec { + let conn = self.pool_connection_unauthorized().await?; + + Self::volume_usage_records_for_resource_query(resource) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// When moving a resource from one volume to another, call this to update + /// the corresponding volume resource usage record + pub async fn swap_volume_usage_records_for_resources( + conn: &async_bb8_diesel::Connection, + resource: VolumeResourceUsage, + from_volume_id: Uuid, + to_volume_id: Uuid, + ) -> Result<(), diesel::result::Error> { + use db::schema::volume_resource_usage::dsl; + + match resource { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + let updated_rows = diesel::update(dsl::volume_resource_usage) + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::volume_id.eq(from_volume_id)) + .set(dsl::volume_id.eq(to_volume_id)) + .execute_async(conn) + .await?; + + if updated_rows == 0 { + return Err(diesel::result::Error::NotFound); + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => { + let updated_rows = diesel::update(dsl::volume_resource_usage) + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_dataset_id.eq(dataset_id)) + .filter(dsl::region_snapshot_region_id.eq(region_id)) + .filter(dsl::region_snapshot_snapshot_id.eq(snapshot_id)) + .filter(dsl::volume_id.eq(from_volume_id)) + .set(dsl::volume_id.eq(to_volume_id)) + .execute_async(conn) + .await?; + + if updated_rows == 0 { + return Err(diesel::result::Error::NotFound); + } + } + } + + Ok(()) + } + async fn volume_checkout_allowed( reason: &VolumeCheckoutReason, vcr: &VolumeConstructionRequest, @@ -259,9 +564,14 @@ impl DataStore { match volume_is_read_only(&vcr) { Ok(read_only) => { if !read_only { - return Err(VolumeGetError::CheckoutConditionFailed( - String::from("Non-read-only Volume Checkout for use Copy!") - )); + return Err( + VolumeGetError::CheckoutConditionFailed( + String::from( + "Non-read-only Volume Checkout \ + for use Copy!", + ), + ), + ); } Ok(()) @@ -300,49 +610,49 @@ impl DataStore { let runtime = instance.runtime(); match (runtime.propolis_id, runtime.dst_propolis_id) { (Some(_), Some(_)) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} is undergoing migration", - vmm_id, - instance.id(), - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceStart {}: instance {} is undergoing \ + migration", + vmm_id, + instance.id(), + ))) } (None, None) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} has no propolis ids", - vmm_id, - instance.id(), - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceStart {}: instance {} has no \ + propolis ids", + vmm_id, + instance.id(), + ))) } (Some(propolis_id), None) => { if propolis_id != vmm_id.into_untyped_uuid() { - return Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} propolis id {} mismatch", + return Err( + VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceStart {}: instance {} propolis \ + id {} mismatch", vmm_id, instance.id(), propolis_id, - ) - )); + ), + ), + ); } Ok(()) } (None, Some(dst_propolis_id)) => { - 
Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} has no propolis id but dst propolis id {}", - vmm_id, - instance.id(), - dst_propolis_id, - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceStart {}: instance {} has no \ + propolis id but dst propolis id {}", + vmm_id, + instance.id(), + dst_propolis_id, + ))) } } } @@ -364,60 +674,67 @@ impl DataStore { let runtime = instance.runtime(); match (runtime.propolis_id, runtime.dst_propolis_id) { (Some(propolis_id), Some(dst_propolis_id)) => { - if propolis_id != vmm_id.into_untyped_uuid() || dst_propolis_id != target_vmm_id.into_untyped_uuid() { - return Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} propolis id mismatches {} {}", - vmm_id, - target_vmm_id, - instance.id(), - propolis_id, - dst_propolis_id, - ) - )); + if propolis_id != vmm_id.into_untyped_uuid() + || dst_propolis_id + != target_vmm_id.into_untyped_uuid() + { + return Err( + VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance {} \ + propolis id mismatches {} {}", + vmm_id, + target_vmm_id, + instance.id(), + propolis_id, + dst_propolis_id, + ), + ), + ); } Ok(()) } (None, None) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} has no propolis ids", - vmm_id, - target_vmm_id, - instance.id(), - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceMigrate {} {}: instance {} has no \ + propolis ids", + vmm_id, + target_vmm_id, + instance.id(), + ))) } (Some(propolis_id), None) => { // XXX is this right? if propolis_id != vmm_id.into_untyped_uuid() { - return Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} propolis id {} mismatch", - vmm_id, - target_vmm_id, - instance.id(), - propolis_id, - ) - )); + return Err( + VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance {} \ + propolis id {} mismatch", + vmm_id, + target_vmm_id, + instance.id(), + propolis_id, + ), + ), + ); } Ok(()) } (None, Some(dst_propolis_id)) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} has no propolis id but dst propolis id {}", - vmm_id, - target_vmm_id, - instance.id(), - dst_propolis_id, - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceMigrate {} {}: instance {} has no \ + propolis id but dst propolis id {}", + vmm_id, + target_vmm_id, + instance.id(), + dst_propolis_id, + ))) } } } @@ -453,10 +770,11 @@ impl DataStore { // XXX this is a Nexus bug! return Err(VolumeGetError::CheckoutConditionFailed( format!( - "Pantry: instance {} backing disk {} does not exist?", + "Pantry: instance {} backing disk {} does not \ + exist?", attach_instance_id, disk.id(), - ) + ), )); }; @@ -479,6 +797,181 @@ impl DataStore { } } + async fn volume_checkout_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume_id: Uuid, + reason: VolumeCheckoutReason, + ) -> Result { + use db::schema::volume::dsl; + + // Grab the volume in question. 
+ let volume = dsl::volume + .filter(dsl::id.eq(volume_id)) + .select(Volume::as_select()) + .get_result_async(conn) + .await?; + + // Turn the volume.data into the VolumeConstructionRequest + let vcr: VolumeConstructionRequest = + serde_json::from_str(volume.data()) + .map_err(|e| err.bail(VolumeGetError::SerdeError(e)))?; + + // The VolumeConstructionRequest resulting from this checkout will have + // its generation numbers bumped, and as result will (if it has + // non-read-only sub-volumes) take over from previous read/write + // activations when sent to a place that will `construct` a new Volume. + // Depending on the checkout reason, prevent creating multiple + // read/write Upstairs acting on the same Volume, except where the take + // over is intended. + + let (maybe_disk, maybe_instance) = { + use db::schema::disk::dsl as disk_dsl; + use db::schema::instance::dsl as instance_dsl; + + let maybe_disk: Option = disk_dsl::disk + .filter(disk_dsl::time_deleted.is_null()) + .filter(disk_dsl::volume_id.eq(volume_id)) + .select(Disk::as_select()) + .get_result_async(conn) + .await + .optional()?; + + let maybe_instance: Option = + if let Some(disk) = &maybe_disk { + if let Some(attach_instance_id) = + disk.runtime().attach_instance_id + { + instance_dsl::instance + .filter(instance_dsl::time_deleted.is_null()) + .filter(instance_dsl::id.eq(attach_instance_id)) + .select(Instance::as_select()) + .get_result_async(conn) + .await + .optional()? + } else { + // Disk not attached to an instance + None + } + } else { + // Volume not associated with disk + None + }; + + (maybe_disk, maybe_instance) + }; + + if let Err(e) = Self::volume_checkout_allowed( + &reason, + &vcr, + maybe_disk, + maybe_instance, + ) + .await + { + return Err(err.bail(e)); + } + + // Look to see if the VCR is a Volume type, and if so, look at its + // sub_volumes. If they are of type Region, then we need to update their + // generation numbers and record that update back to the database. We + // return to the caller whatever the original volume data was we pulled + // from the database. + match vcr { + VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent, + } => { + let mut update_needed = false; + let mut new_sv = Vec::new(); + for sv in sub_volumes { + match sv { + VolumeConstructionRequest::Region { + block_size, + blocks_per_extent, + extent_count, + opts, + gen, + } => { + update_needed = true; + new_sv.push(VolumeConstructionRequest::Region { + block_size, + blocks_per_extent, + extent_count, + opts, + gen: gen + 1, + }); + } + _ => { + new_sv.push(sv); + } + } + } + + // Only update the volume data if we found the type of volume + // that needed it. + if update_needed { + // Create a new VCR and fill in the contents from what the + // original volume had, but with our updated sub_volume + // records. + let new_vcr = VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes: new_sv, + read_only_parent, + }; + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(VolumeGetError::SerdeError(e)))?; + + // Update the original volume_id with the new volume.data. + use db::schema::volume::dsl as volume_dsl; + let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await?; + + // This should update just one row. If it does not, then + // something is terribly wrong in the database. 
+ if num_updated != 1 { + return Err(err.bail( + VolumeGetError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + )); + } + } + } + VolumeConstructionRequest::Region { + block_size: _, + blocks_per_extent: _, + extent_count: _, + opts: _, + gen: _, + } => { + // We don't support a pure Region VCR at the volume level in the + // database, so this choice should never be encountered, but I + // want to know if it is. + return Err(err.bail(VolumeGetError::InvalidVolume( + String::from("Region not supported as a top level volume"), + ))); + } + VolumeConstructionRequest::File { + id: _, + block_size: _, + path: _, + } + | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => + {} + } + + Ok(volume) + } + /// Checkout a copy of the Volume from the database. /// This action (getting a copy) will increase the generation number /// of Volumes of the VolumeConstructionRequest::Volume type that have @@ -490,8 +983,6 @@ impl DataStore { volume_id: Uuid, reason: VolumeCheckoutReason, ) -> LookupResult { - use db::schema::volume::dsl; - // We perform a transaction here, to be sure that on completion // of this, the database contains an updated version of the // volume with the generation number incremented (for the volume @@ -505,176 +996,8 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - // Grab the volume in question. - let volume = dsl::volume - .filter(dsl::id.eq(volume_id)) - .select(Volume::as_select()) - .get_result_async(&conn) - .await?; - - // Turn the volume.data into the VolumeConstructionRequest - let vcr: VolumeConstructionRequest = - serde_json::from_str(volume.data()).map_err(|e| { - err.bail(VolumeGetError::SerdeError(e)) - })?; - - // The VolumeConstructionRequest resulting from this checkout will have its - // generation numbers bumped, and as result will (if it has non-read-only - // sub-volumes) take over from previous read/write activations when sent to a - // place that will `construct` a new Volume. Depending on the checkout reason, - // prevent creating multiple read/write Upstairs acting on the same Volume, - // except where the take over is intended. - - let (maybe_disk, maybe_instance) = { - use db::schema::instance::dsl as instance_dsl; - use db::schema::disk::dsl as disk_dsl; - - let maybe_disk: Option = disk_dsl::disk - .filter(disk_dsl::time_deleted.is_null()) - .filter(disk_dsl::volume_id.eq(volume_id)) - .select(Disk::as_select()) - .get_result_async(&conn) - .await - .optional()?; - - let maybe_instance: Option = if let Some(disk) = &maybe_disk { - if let Some(attach_instance_id) = disk.runtime().attach_instance_id { - instance_dsl::instance - .filter(instance_dsl::time_deleted.is_null()) - .filter(instance_dsl::id.eq(attach_instance_id)) - .select(Instance::as_select()) - .get_result_async(&conn) - .await - .optional()? - } else { - // Disk not attached to an instance - None - } - } else { - // Volume not associated with disk - None - }; - - (maybe_disk, maybe_instance) - }; - - if let Err(e) = Self::volume_checkout_allowed( - &reason, - &vcr, - maybe_disk, - maybe_instance, - ) - .await { - return Err(err.bail(e)); - } - - // Look to see if the VCR is a Volume type, and if so, look at - // its sub_volumes. If they are of type Region, then we need - // to update their generation numbers and record that update - // back to the database. We return to the caller whatever the - // original volume data was we pulled from the database. 
- match vcr { - VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent, - } => { - let mut update_needed = false; - let mut new_sv = Vec::new(); - for sv in sub_volumes { - match sv { - VolumeConstructionRequest::Region { - block_size, - blocks_per_extent, - extent_count, - opts, - gen, - } => { - update_needed = true; - new_sv.push( - VolumeConstructionRequest::Region { - block_size, - blocks_per_extent, - extent_count, - opts, - gen: gen + 1, - }, - ); - } - _ => { - new_sv.push(sv); - } - } - } - - // Only update the volume data if we found the type - // of volume that needed it. - if update_needed { - // Create a new VCR and fill in the contents - // from what the original volume had, but with our - // updated sub_volume records. - let new_vcr = VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes: new_sv, - read_only_parent, - }; - - let new_volume_data = serde_json::to_string( - &new_vcr, - ) - .map_err(|e| { - err.bail(VolumeGetError::SerdeError(e)) - })?; - - // Update the original volume_id with the new - // volume.data. - use db::schema::volume::dsl as volume_dsl; - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await?; - - // This should update just one row. If it does - // not, then something is terribly wrong in the - // database. - if num_updated != 1 { - return Err(err.bail( - VolumeGetError::UnexpectedDatabaseUpdate( - num_updated, - 1, - ), - )); - } - } - } - VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _, - } => { - // We don't support a pure Region VCR at the volume - // level in the database, so this choice should - // never be encountered, but I want to know if it is. - panic!("Region not supported as a top level volume"); - } - VolumeConstructionRequest::File { - id: _, - block_size: _, - path: _, - } - | VolumeConstructionRequest::Url { - id: _, - block_size: _, - url: _, - } => {} - } - Ok(volume) + Self::volume_checkout_in_txn(&conn, err, volume_id, reason) + .await } }) .await @@ -686,7 +1009,10 @@ impl DataStore { } _ => { - return Error::internal_error(&format!("Transaction error: {}", err)); + return Error::internal_error(&format!( + "Transaction error: {}", + err + )); } } } @@ -731,11 +1057,12 @@ impl DataStore { VolumeConstructionRequest::Region { opts, .. } => { if !opts.read_only { // Only one volume can "own" a Region, and that volume's - // UUID is recorded in the region table accordingly. It is - // an error to make a copy of a volume construction request - // that references non-read-only Regions. + // UUID is recorded in the region table accordingly. It + // is an error to make a copy of a volume construction + // request that references non-read-only Regions. bail!( - "only one Volume can reference a Region non-read-only!" + "only one Volume can reference a Region \ + non-read-only!" ); } @@ -781,18 +1108,35 @@ impl DataStore { } /// Find regions for deleted volumes that do not have associated region - /// snapshots. 
+ /// snapshots and are not being used by any other non-deleted volumes, and + /// return them for garbage collection pub async fn find_deleted_volume_regions( &self, - ) -> ListResultVec<(Dataset, Region, Option, Volume)> { + ) -> ListResultVec<(Dataset, Region, Option)> { + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("find_deleted_volume_regions") + .transaction(&conn, |conn| async move { + Self::find_deleted_volume_regions_in_txn(&conn).await + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + async fn find_deleted_volume_regions_in_txn( + conn: &async_bb8_diesel::Connection, + ) -> Result)>, diesel::result::Error> + { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; use db::schema::region_snapshot::dsl; use db::schema::volume::dsl as volume_dsl; - // Find all regions and datasets - region_dsl::region - .inner_join( + // Find all read-write regions (read-only region cleanup is taken care + // of in soft_delete_volume_in_txn!) and their associated datasets + let unfiltered_deleted_regions = region_dsl::region + .filter(region_dsl::read_only.eq(false)) + // the volume may be hard deleted, so use a left join here + .left_join( volume_dsl::volume.on(region_dsl::volume_id.eq(volume_dsl::id)), ) .inner_join( @@ -800,33 +1144,70 @@ impl DataStore { .on(region_dsl::dataset_id.eq(dataset_dsl::id)), ) // where there either are no region snapshots, or the region - // snapshot volume references have gone to zero + // snapshot volume has deleted = true .left_join( dsl::region_snapshot.on(dsl::region_id .eq(region_dsl::id) .and(dsl::dataset_id.eq(dataset_dsl::id))), ) - .filter( - dsl::volume_references - .eq(0) - // Despite the SQL specifying that this column is NOT NULL, - // this null check is required for this function to work! - // It's possible that the left join of region_snapshot above - // could join zero rows, making this null. - .or(dsl::volume_references.is_null()), - ) - // where the volume has already been soft-deleted - .filter(volume_dsl::time_deleted.is_not_null()) + .filter(dsl::deleting.eq(true).or(dsl::deleting.is_null())) // and return them (along with the volume so it can be hard deleted) .select(( Dataset::as_select(), Region::as_select(), Option::::as_select(), - Volume::as_select(), + // Diesel can't express a difference between + // + // a) the volume record existing and the nullable + // volume.time_deleted column being set to null + // b) the volume record does not exist (null due to left join) + // + // so return an Option and check below + Option::::as_select(), )) - .load_async(&*self.pool_connection_unauthorized().await?) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .load_async(conn) + .await?; + + let mut deleted_regions = + Vec::with_capacity(unfiltered_deleted_regions.len()); + + for (dataset, region, region_snapshot, volume) in + unfiltered_deleted_regions + { + // only operate on soft deleted volumes + let soft_deleted = match &volume { + Some(volume) => volume.time_deleted.is_some(), + None => false, + }; + + if !soft_deleted { + continue; + } + + if region_snapshot.is_some() { + // We cannot delete this region: the presence of the region + // snapshot record means that the Crucible agent's snapshot has + // not been deleted yet (as the lifetime of the region snapshot + // record is equal to or longer than the lifetime of the + // Crucible agent's snapshot). 
+ // + // This condition can occur when multiple volume delete sagas + // run concurrently: one will decrement the crucible resources + // (but hasn't made the appropriate DELETE calls to the + // appropriate Agents to tombstone the running snapshots and + // snapshots yet), and the other will be in the "delete freed + // regions" saga node trying to delete the region. Without this + // check, This race results in the Crucible Agent returning + // "must delete snapshots first" and causing saga unwinds. + // + // Another volume delete will pick up this region and remove it. + continue; + } + + deleted_regions.push((dataset, region, volume)); + } + + Ok(deleted_regions) } pub async fn read_only_resources_associated_with_volume( @@ -853,20 +1234,30 @@ impl DataStore { } #[derive(Debug, thiserror::Error)] -enum SoftDeleteTransactionError { +enum SoftDeleteError { #[error("Serde error decreasing Crucible resources: {0}")] SerdeError(#[from] serde_json::Error), - #[error("Updated {0} database rows, expected 1")] - UnexpectedDatabaseUpdate(usize), + #[error("Updated {0} database rows in {1}, expected 1")] + UnexpectedDatabaseUpdate(usize, String), + + // XXX is this an error? delete volume anyway, else we're stuck? + #[error("Could not match resource to {0}")] + CouldNotFindResource(String), + + #[error("Address parsing error during Volume soft-delete: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Invalid Volume: {0}")] + InvalidVolume(String), } impl DataStore { // See comment for `soft_delete_volume` - async fn soft_delete_volume_txn( + async fn soft_delete_volume_in_txn( conn: &async_bb8_diesel::Connection, volume_id: Uuid, - err: OptionalError, + err: OptionalError, ) -> Result { // Grab the volume, and check if the volume was already soft-deleted. // We have to guard against the case where this function is called @@ -923,8 +1314,8 @@ impl DataStore { .map_err(|e| err.bail(e.into()))?; // Grab all the targets that the volume construction request references. - // Do this _inside_ the transaction, as the read-only parent data inside - // volume can change as a result of region snapshot replacement. + // Do this _inside_ the transaction, as the data inside volumes can + // change as a result of region / region snapshot replacement. let crucible_targets = { let mut crucible_targets = CrucibleTargets::default(); read_only_resources_associated_with_volume( @@ -934,52 +1325,270 @@ impl DataStore { crucible_targets }; - // Decrease the number of references for each region snapshot that a - // volume references, returning the updated rows. - let updated_region_snapshots = ConditionallyDecreaseReferences::new( - crucible_targets.read_only_targets, - ) - .get_results_async::(conn) - .await?; + // Decrease the number of references for each resource that a volume + // references, collecting the regions and region snapshots that were + // freed up for deletion. + + let num_read_write_subvolumes = + match count_read_write_sub_volumes(&vcr) { + Ok(v) => v, + Err(e) => { + return Err(err.bail(SoftDeleteError::InvalidVolume( + format!("volume {} invalid: {e}", volume.id()), + ))); + } + }; - // Return all regions for the volume to be cleaned up. 
- let regions: Vec = { - use db::schema::region::dsl as region_dsl; - use db::schema::region_snapshot::dsl as region_snapshot_dsl; + let mut regions: Vec = Vec::with_capacity( + REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, + ); - region_dsl::region - .left_join( - region_snapshot_dsl::region_snapshot - .on(region_snapshot_dsl::region_id.eq(region_dsl::id)), - ) - .filter( - region_snapshot_dsl::volume_references - .eq(0) - .or(region_snapshot_dsl::volume_references.is_null()), - ) - .filter(region_dsl::volume_id.eq(volume_id)) - .select(Region::as_select()) - .get_results_async(conn) - .await? - .into_iter() - .map(|region| region.id()) - .collect() - }; + let mut region_snapshots: Vec = + Vec::with_capacity(crucible_targets.read_only_targets.len()); - // Return the region snapshots that had their volume_references updated - // to 0 (and had the deleting flag set) to be cleaned up. - let region_snapshots = updated_region_snapshots - .into_iter() - .filter(|region_snapshot| { - region_snapshot.volume_references == 0 - && region_snapshot.deleting - }) - .map(|region_snapshot| RegionSnapshotV3 { - dataset: region_snapshot.dataset_id, - region: region_snapshot.region_id, - snapshot: region_snapshot.snapshot_id, - }) - .collect(); + // First, grab read-write regions - they're not shared, but they are + // not candidates for deletion if there are region snapshots + let mut read_write_targets = Vec::with_capacity( + REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, + ); + + read_write_resources_associated_with_volume( + &vcr, + &mut read_write_targets, + ); + + for target in read_write_targets { + let target = target + .parse() + .map_err(|e| err.bail(SoftDeleteError::AddressParseError(e)))?; + + let maybe_region = + Self::target_to_region(conn, &target, RegionType::ReadWrite) + .await?; + + let Some(region) = maybe_region else { + return Err(err.bail(SoftDeleteError::CouldNotFindResource( + format!("could not find resource for {target}"), + ))); + }; + + // Filter out regions that have any region-snapshots + let region_snapshot_count: i64 = { + use db::schema::region_snapshot::dsl; + dsl::region_snapshot + .filter(dsl::region_id.eq(region.id())) + .count() + .get_result_async::(conn) + .await? + }; + + if region_snapshot_count == 0 { + regions.push(region.id()); + } + } + + for read_only_target in &crucible_targets.read_only_targets { + use db::schema::volume_resource_usage::dsl as ru_dsl; + + let read_only_target = read_only_target + .parse() + .map_err(|e| err.bail(SoftDeleteError::AddressParseError(e)))?; + + let maybe_usage = Self::read_only_target_to_volume_resource_usage( + conn, + &read_only_target, + ) + .await?; + + let Some(usage) = maybe_usage else { + return Err(err.bail(SoftDeleteError::CouldNotFindResource( + format!("could not find resource for {read_only_target}"), + ))); + }; + + // For each read-only resource, remove the associated volume + // resource usage record for this volume. Only return a resource for + // deletion if no more associated volume usage records are found. 
+ match usage { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + let updated_rows = + diesel::delete(ru_dsl::volume_resource_usage) + .filter(ru_dsl::volume_id.eq(volume_id)) + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::ReadOnlyRegion, + ), + ) + .filter(ru_dsl::region_id.eq(Some(region_id))) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteError::UnexpectedDatabaseUpdate( + updated_rows, + "volume_resource_usage (region)".into(), + ), + )); + } + + let region_usage_left = ru_dsl::volume_resource_usage + .filter( + ru_dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(ru_dsl::region_id.eq(region_id)) + .count() + .get_result_async::(conn) + .await?; + + if region_usage_left == 0 { + // There are several factors that mean Nexus does _not_ + // have to check here for region snapshots taken of + // read-only regions: + // + // - When a Crucible Volume receives a flush, it is only + // propagated to the subvolumes of that Volume, not + // the read-only parent. There's a few reasons for + // this, but the main one is that a Crucible flush + // changes the on-disk data, and the directory that + // the downstairs of a read-only parent is serving out + // of may be on read-only storage, as is the case when + // serving out of a .zfs/snapshot/ directory. + // + // - Even if a Crucible flush _did_ propagate to the + // read-only parent, Nexus should never directly send + // the snapshot volume to a place where it will be + // constructed, meaning the read-only regions of the + // snapshot volume's subvolumes will never themselves + // receive a flush. + // + // If either of these factors change, then that check is + // required here. The `validate_volume_invariants` + // function will return an error if a read-only region + // has an associated region snapshot during testing. + // + // However, don't forget to set `deleting`! These + // regions will be returned to the calling function for + // garbage collection. 
+ use db::schema::region::dsl; + let updated_rows = diesel::update(dsl::region) + .filter(dsl::id.eq(region_id)) + .filter(dsl::read_only.eq(true)) + .filter(dsl::deleting.eq(false)) + .set(dsl::deleting.eq(true)) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteError::UnexpectedDatabaseUpdate( + updated_rows, + "setting deleting (region)".into(), + ), + )); + } + + regions.push(region_id); + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => { + let updated_rows = + diesel::delete(ru_dsl::volume_resource_usage) + .filter(ru_dsl::volume_id.eq(volume_id)) + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::RegionSnapshot, + ), + ) + .filter( + ru_dsl::region_snapshot_dataset_id + .eq(Some(dataset_id)), + ) + .filter( + ru_dsl::region_snapshot_region_id + .eq(Some(region_id)), + ) + .filter( + ru_dsl::region_snapshot_snapshot_id + .eq(Some(snapshot_id)), + ) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteError::UnexpectedDatabaseUpdate( + updated_rows, + "volume_resource_usage \ + (region_snapshot)" + .into(), + ), + )); + } + + let region_snapshot_usage_left = + ru_dsl::volume_resource_usage + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::RegionSnapshot, + ), + ) + .filter( + ru_dsl::region_snapshot_dataset_id + .eq(Some(dataset_id)), + ) + .filter( + ru_dsl::region_snapshot_region_id + .eq(Some(region_id)), + ) + .filter( + ru_dsl::region_snapshot_snapshot_id + .eq(Some(snapshot_id)), + ) + .count() + .get_result_async::(conn) + .await?; + + if region_snapshot_usage_left == 0 { + // Don't forget to set `deleting`! see: omicron#4095 + use db::schema::region_snapshot::dsl; + let updated_rows = diesel::update(dsl::region_snapshot) + .filter(dsl::dataset_id.eq(dataset_id)) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::snapshot_id.eq(snapshot_id)) + .filter( + dsl::snapshot_addr + .eq(read_only_target.to_string()), + ) + .filter(dsl::deleting.eq(false)) + .set(dsl::deleting.eq(true)) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteError::UnexpectedDatabaseUpdate( + updated_rows, + "setting deleting (region snapshot)".into(), + ), + )); + } + + region_snapshots.push(RegionSnapshotV3 { + dataset: dataset_id, + region: region_id, + snapshot: snapshot_id, + }); + } + } + } + } let resources_to_delete = CrucibleResources::V3(CrucibleResourcesV3 { regions, @@ -1003,13 +1612,18 @@ impl DataStore { if updated_rows != 1 { return Err(err.bail( - SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + SoftDeleteError::UnexpectedDatabaseUpdate( updated_rows, + "volume".into(), ), )); } } + // After volume deletion, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(resources_to_delete) } @@ -1027,7 +1641,7 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - Self::soft_delete_volume_txn(&conn, volume_id, err).await + Self::soft_delete_volume_in_txn(&conn, volume_id, err).await } }) .await @@ -1040,6 +1654,191 @@ impl DataStore { }) } + async fn volume_remove_rop_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume_id: Uuid, + temp_volume_id: Uuid, + ) -> Result { + // Grab the volume in question. If the volume record was already deleted + // then we can just return. 
+ let volume = { + use db::schema::volume::dsl; + + let volume = dsl::volume + .filter(dsl::id.eq(volume_id)) + .select(Volume::as_select()) + .get_result_async(conn) + .await + .optional()?; + + let volume = if let Some(v) = volume { + v + } else { + // the volume does not exist, nothing to do. + return Ok(false); + }; + + if volume.time_deleted.is_some() { + // this volume is deleted, so let whatever is deleting it clean + // it up. + return Ok(false); + } else { + // A volume record exists, and was not deleted, we can attempt + // to remove its read_only_parent. + volume + } + }; + + // If a read_only_parent exists, remove it from volume_id, and attach it + // to temp_volume_id. + let vcr: VolumeConstructionRequest = + serde_json::from_str(volume.data()) + .map_err(|e| err.bail(RemoveRopError::SerdeError(e)))?; + + match vcr { + VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent, + } => { + if read_only_parent.is_none() { + // This volume has no read_only_parent + Ok(false) + } else { + // Create a new VCR and fill in the contents from what the + // original volume had. + let new_vcr = VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent: None, + }; + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(RemoveRopError::SerdeError(e)))?; + + // Update the original volume_id with the new volume.data. + use db::schema::volume::dsl as volume_dsl; + let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await?; + + // This should update just one row. If it does not, then + // something is terribly wrong in the database. + if num_updated != 1 { + return Err(err.bail( + RemoveRopError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + )); + } + + // Make a new VCR, with the information from our + // temp_volume_id, but the read_only_parent from the + // original volume. + let rop_vcr = VolumeConstructionRequest::Volume { + id: temp_volume_id, + block_size, + sub_volumes: vec![], + read_only_parent, + }; + + let rop_volume_data = serde_json::to_string(&rop_vcr) + .map_err(|e| err.bail(RemoveRopError::SerdeError(e)))?; + + // Update the temp_volume_id with the volume data that + // contains the read_only_parent. 
+ let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(temp_volume_id)) + .filter(volume_dsl::time_deleted.is_null()) + .set(volume_dsl::data.eq(rop_volume_data)) + .execute_async(conn) + .await?; + + if num_updated != 1 { + return Err(err.bail( + RemoveRopError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + )); + } + + // Update the volume resource usage record for every + // read-only resource in the ROP + let crucible_targets = { + let mut crucible_targets = CrucibleTargets::default(); + read_only_resources_associated_with_volume( + &rop_vcr, + &mut crucible_targets, + ); + crucible_targets + }; + + for read_only_target in crucible_targets.read_only_targets { + let read_only_target = + read_only_target.parse().map_err(|e| { + err.bail(RemoveRopError::AddressParseError(e)) + })?; + + let maybe_usage = + Self::read_only_target_to_volume_resource_usage( + conn, + &read_only_target, + ) + .await?; + + let Some(usage) = maybe_usage else { + return Err(err.bail( + RemoveRopError::CouldNotFindResource(format!( + "could not find resource for \ + {read_only_target}" + )), + )); + }; + + Self::swap_volume_usage_records_for_resources( + conn, + usage, + volume_id, + temp_volume_id, + ) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + RemoveRopError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ), + ) + }) + })?; + } + + // After read-only parent removal, validate invariants for + // all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(conn).await?; + + Ok(true) + } + } + + VolumeConstructionRequest::File { .. } + | VolumeConstructionRequest::Region { .. } + | VolumeConstructionRequest::Url { .. } => { + // Volume has a format that does not contain ROPs + Ok(false) + } + } + } + // Here we remove the read only parent from volume_id, and attach it // to temp_volume_id. // @@ -1051,15 +1850,6 @@ impl DataStore { volume_id: Uuid, temp_volume_id: Uuid, ) -> Result { - #[derive(Debug, thiserror::Error)] - enum RemoveReadOnlyParentError { - #[error("Serde error removing read only parent: {0}")] - SerdeError(#[from] serde_json::Error), - - #[error("Updated {0} database rows, expected {1}")] - UnexpectedDatabaseUpdate(usize, usize), - } - // In this single transaction: // - Get the given volume from the volume_id from the database // - Extract the volume.data into a VolumeConstructionRequest (VCR) @@ -1080,147 +1870,22 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - // Grab the volume in question. If the volume record was already - // deleted then we can just return. - let volume = { - use db::schema::volume::dsl; - - let volume = dsl::volume - .filter(dsl::id.eq(volume_id)) - .select(Volume::as_select()) - .get_result_async(&conn) - .await - .optional()?; - - let volume = if let Some(v) = volume { - v - } else { - // the volume does not exist, nothing to do. - return Ok(false); - }; - - if volume.time_deleted.is_some() { - // this volume is deleted, so let whatever is deleting - // it clean it up. - return Ok(false); - } else { - // A volume record exists, and was not deleted, we - // can attempt to remove its read_only_parent. - volume - } - }; - - // If a read_only_parent exists, remove it from volume_id, and - // attach it to temp_volume_id. 
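Both the inline code being removed here and its replacement `volume_remove_rop_in_txn` perform the same structural move: detach the read-only parent from the volume and park it under a scratch volume so that deletion can be handled separately. A minimal sketch of that shape, using a simplified stand-in type rather than the real `VolumeConstructionRequest`:

    /// Simplified stand-in for `VolumeConstructionRequest::Volume`; enough to
    /// show the shape of the operation, not the real type.
    #[derive(Clone, Debug)]
    struct Vcr {
        id: u128,
        sub_volumes: Vec<String>,
        read_only_parent: Option<Box<Vcr>>,
    }

    /// Detach the read-only parent from `volume` and wrap it in a scratch
    /// volume with `temp_id`, so the normal volume-delete path can reclaim it.
    fn remove_rop(volume: &mut Vcr, temp_id: u128) -> Option<Vcr> {
        volume.read_only_parent.take().map(|rop| Vcr {
            id: temp_id,
            sub_volumes: Vec::new(),
            read_only_parent: Some(rop),
        })
    }

The new version additionally walks the parked read-only parent's targets and calls `swap_volume_usage_records_for_resources`, so the usage records follow each resource to the temporary volume.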
- let vcr: VolumeConstructionRequest = - serde_json::from_str( - volume.data() - ) - .map_err(|e| { - err.bail( - RemoveReadOnlyParentError::SerdeError( - e, - ) - ) - })?; - - match vcr { - VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent, - } => { - if read_only_parent.is_none() { - // This volume has no read_only_parent - Ok(false) - } else { - // Create a new VCR and fill in the contents - // from what the original volume had. - let new_vcr = VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent: None, - }; - - let new_volume_data = - serde_json::to_string( - &new_vcr - ) - .map_err(|e| { - err.bail(RemoveReadOnlyParentError::SerdeError( - e, - )) - })?; - - // Update the original volume_id with the new - // volume.data. - use db::schema::volume::dsl as volume_dsl; - let num_updated = diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await?; - - // This should update just one row. If it does - // not, then something is terribly wrong in the - // database. - if num_updated != 1 { - return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); - } - - // Make a new VCR, with the information from - // our temp_volume_id, but the read_only_parent - // from the original volume. - let rop_vcr = VolumeConstructionRequest::Volume { - id: temp_volume_id, - block_size, - sub_volumes: vec![], - read_only_parent, - }; - let rop_volume_data = - serde_json::to_string( - &rop_vcr - ) - .map_err(|e| { - err.bail(RemoveReadOnlyParentError::SerdeError( - e, - )) - })?; - // Update the temp_volume_id with the volume - // data that contains the read_only_parent. - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(temp_volume_id)) - .filter(volume_dsl::time_deleted.is_null()) - .set(volume_dsl::data.eq(rop_volume_data)) - .execute_async(&conn) - .await?; - if num_updated != 1 { - return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); - } - Ok(true) - } - } - VolumeConstructionRequest::File { id: _, block_size: _, path: _ } - | VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _ } - | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => { - // Volume has a format that does not contain ROPs - Ok(false) - } - } + Self::volume_remove_rop_in_txn( + &conn, + err, + volume_id, + temp_volume_id, + ) + .await } }) .await .map_err(|e| { if let Some(err) = err.take() { - return Error::internal_error(&format!("Transaction error: {}", err)); + return Error::internal_error(&format!( + "Transaction error: {}", + err + )); } public_error_from_diesel(e, ErrorHandler::Server) }) @@ -1305,9 +1970,9 @@ impl DataStore { if mismatched_record_type_count > 0 { return Err(err.bail(Error::conflict(&format!( - "existing repair type for id {} does not match {:?}!", - record.repair_id, - record.repair_type, + "existing repair type for id {} does not \ + match {:?}!", + record.repair_id, record.repair_type, )))); } @@ -1368,17 +2033,65 @@ impl DataStore { .execute_async(&conn) .await?; - Ok(()) - } - }) - .await - .map_err(|e| { - if let Some(err) = err.take() { - err - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } + 
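The repair-notification changes here, and the `upstairs_repair_progress_in_txn` function that follows, use the refactoring pattern applied throughout this file: the body of each retryable transaction is hoisted out of the closure into an `*_in_txn` associated function that takes the connection and the shared `OptionalError`, leaving the public method as a thin retry wrapper. A rough sketch of that split, with placeholder types (`Conn`, `AppError`) rather than the real async_bb8_diesel and Nexus error types:

    struct Conn; // placeholder for the pooled database connection
    #[derive(Debug)]
    struct AppError(String); // placeholder for the per-operation error enum

    /// The extracted body: all queries live here, and it can be exercised
    /// directly by tests without standing up the retry machinery.
    async fn operation_in_txn(_conn: &Conn, id: u64) -> Result<u64, AppError> {
        // ... queries that previously lived inline in the closure ...
        Ok(id)
    }

    /// The public wrapper: acquire a connection, run the body inside the
    /// retryable transaction, and map errors for callers.
    async fn operation(conn: &Conn, id: u64) -> Result<u64, AppError> {
        operation_in_txn(conn, id).await
    }

One benefit visible later in this diff is that the extracted bodies can call `validate_volume_invariants` before committing when built with the testing feature.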
+ async fn upstairs_repair_progress_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + upstairs_id: TypedUuid, + repair_id: TypedUuid, + repair_progress: RepairProgress, + ) -> Result<(), diesel::result::Error> { + use db::schema::upstairs_repair_notification::dsl as notification_dsl; + use db::schema::upstairs_repair_progress::dsl; + + // Check that there is a repair id for the upstairs id + let matching_repair: Option = + notification_dsl::upstairs_repair_notification + .filter( + notification_dsl::repair_id + .eq(nexus_db_model::to_db_typed_uuid(repair_id)), + ) + .filter( + notification_dsl::upstairs_id + .eq(nexus_db_model::to_db_typed_uuid(upstairs_id)), + ) + .filter( + notification_dsl::notification_type + .eq(UpstairsRepairNotificationType::Started), + ) + .get_result_async(conn) + .await + .optional()?; + + if matching_repair.is_none() { + return Err(err.bail(Error::non_resourcetype_not_found(&format!( + "upstairs {upstairs_id} repair {repair_id} not found" + )))); + } + + diesel::insert_into(dsl::upstairs_repair_progress) + .values(UpstairsRepairProgress { + repair_id: repair_id.into(), + time: repair_progress.time, + current_item: repair_progress.current_item, + total_items: repair_progress.total_items, }) + .execute_async(conn) + .await?; + + Ok(()) } /// Record Upstairs repair progress @@ -1389,9 +2102,6 @@ impl DataStore { repair_id: TypedUuid, repair_progress: RepairProgress, ) -> Result<(), Error> { - use db::schema::upstairs_repair_notification::dsl as notification_dsl; - use db::schema::upstairs_repair_progress::dsl; - let conn = self.pool_connection_authorized(opctx).await?; let err = OptionalError::new(); @@ -1401,33 +2111,14 @@ impl DataStore { let err = err.clone(); async move { - // Check that there is a repair id for the upstairs id - let matching_repair: Option = - notification_dsl::upstairs_repair_notification - .filter(notification_dsl::repair_id.eq(nexus_db_model::to_db_typed_uuid(repair_id))) - .filter(notification_dsl::upstairs_id.eq(nexus_db_model::to_db_typed_uuid(upstairs_id))) - .filter(notification_dsl::notification_type.eq(UpstairsRepairNotificationType::Started)) - .get_result_async(&conn) - .await - .optional()?; - - if matching_repair.is_none() { - return Err(err.bail(Error::non_resourcetype_not_found(&format!( - "upstairs {upstairs_id} repair {repair_id} not found" - )))); - } - - diesel::insert_into(dsl::upstairs_repair_progress) - .values(UpstairsRepairProgress { - repair_id: repair_id.into(), - time: repair_progress.time, - current_item: repair_progress.current_item, - total_items: repair_progress.total_items, - }) - .execute_async(&conn) - .await?; - - Ok(()) + Self::upstairs_repair_progress_in_txn( + &conn, + err, + upstairs_id, + repair_id, + repair_progress, + ) + .await } }) .await @@ -1608,7 +2299,7 @@ pub struct CrucibleResourcesV2 { pub snapshots_to_delete: Vec, } -#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Serialize, Deserialize, PartialEq, Eq, Hash)] pub struct RegionSnapshotV3 { dataset: Uuid, region: Uuid, @@ -1883,6 +2574,7 @@ fn read_only_target_in_vcr( Ok(false) } +#[derive(Clone)] pub struct VolumeReplacementParams { pub volume_id: Uuid, pub region_id: Uuid, @@ -1920,12 +2612,12 @@ pub enum VolumeReplaceResult { } impl DataStore { - /// Replace a read-write region in a Volume with a new region. 
- pub async fn volume_replace_region( - &self, + async fn volume_replace_region_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, existing: VolumeReplacementParams, replacement: VolumeReplacementParams, - ) -> Result { + ) -> Result { // In a single transaction: // // - set the existing region's volume id to the replacement's volume id @@ -2016,191 +2708,546 @@ impl DataStore { // referenced (via its socket address) in NEW_VOL. For an example, this // is done as part of the region replacement start saga. - #[derive(Debug, thiserror::Error)] - enum VolumeReplaceRegionError { - #[error("Error from Volume region replacement: {0}")] - Public(Error), + // Grab the old volume first + let maybe_old_volume = { + volume_dsl::volume + .filter(volume_dsl::id.eq(existing.volume_id)) + .select(Volume::as_select()) + .first_async::(conn) + .await + .optional() + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceRegionError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })? + }; + + let old_volume = if let Some(old_volume) = maybe_old_volume { + old_volume + } else { + // Existing volume was hard-deleted, so return here. We can't + // perform the region replacement now, and this will short-circuit + // the rest of the process. + + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + }; + + if old_volume.time_deleted.is_some() { + // Existing volume was soft-deleted, so return here for the same + // reason: the region replacement process should be short-circuited + // now. + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + } + + let old_vcr: VolumeConstructionRequest = + match serde_json::from_str(&old_volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(err.bail(ReplaceRegionError::SerdeError(e))); + } + }; - #[error("Serde error during Volume region replacement: {0}")] - SerdeError(#[from] serde_json::Error), + // Does it look like this replacement already happened? + let old_region_in_vcr = + match region_in_vcr(&old_vcr, &existing.region_addr) { + Ok(v) => v, + Err(e) => { + return Err( + err.bail(ReplaceRegionError::RegionReplacementError(e)) + ); + } + }; + let new_region_in_vcr = + match region_in_vcr(&old_vcr, &replacement.region_addr) { + Ok(v) => v, + Err(e) => { + return Err( + err.bail(ReplaceRegionError::RegionReplacementError(e)) + ); + } + }; - #[error("Region replacement error: {0}")] - RegionReplacementError(#[from] anyhow::Error), + if !old_region_in_vcr && new_region_in_vcr { + // It does seem like the replacement happened - if this function is + // called twice in a row then this can happen. + return Ok(VolumeReplaceResult::AlreadyHappened); + } else if old_region_in_vcr && !new_region_in_vcr { + // The replacement hasn't happened yet, but can proceed + } else if old_region_in_vcr && new_region_in_vcr { + // Both the old region and new region exist in this VCR. Regions are + // not reused, so this is an illegal state: if the replacement of + // the old region occurred, then the new region would be present + // multiple times in the volume. We have to bail out here. + // + // The guards against this happening are: + // + // - only one replacement can occur for a volume at a time (due to + // the volume repair lock), and + // + // - region replacement does not delete the old region until the + // "region replacement finish" saga, which happens at the very end + // of the process. 
If it eagerly deleted the region, the crucible + // agent would be free to reuse the port for another region + // allocation, and an identical target (read: ip and port) could + // be confusing. Most of the time, we assume that the dataset + // containing that agent has been expunged, so the agent is gone, + // so this port reuse cannot occur + return Err(err.bail(ReplaceRegionError::RegionReplacementError( + anyhow!("old_region_in_vcr && new_region_in_vcr"), + ))); + } else if !old_region_in_vcr && !new_region_in_vcr { + // Neither the region we've been asked to replace or the new region + // is in the VCR. This is an illegal state, as this function would + // be performing a no-op. We have to bail out here. + // + // The guard against this happening is again that only one + // replacement can occur for a volume at a time: if it was possible + // for multiple region replacements to occur, then both would be + // attempting to swap out the same old region for different new + // regions: + // + // region replacement one: + // + // volume_replace_region_in_txn( + // .., + // existing = [fd00:1122:3344:145::10]:40001, + // replacement = [fd00:1122:3344:322::4]:3956, + // ) + // + // region replacement two: + // + // volume_replace_region_in_txn( + // .., + // existing = [fd00:1122:3344:145::10]:40001, + // replacement = [fd00:1122:3344:fd1::123]:27001, + // ) + // + // The one that replaced second would always land in this branch. + return Err(err.bail(ReplaceRegionError::RegionReplacementError( + anyhow!("!old_region_in_vcr && !new_region_in_vcr"), + ))); } + + use db::schema::region::dsl as region_dsl; + use db::schema::volume::dsl as volume_dsl; + + // Set the existing region's volume id to the replacement's volume id + diesel::update(region_dsl::region) + .filter(region_dsl::id.eq(existing.region_id)) + .set(region_dsl::volume_id.eq(replacement.volume_id)) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceRegionError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + + // Set the replacement region's volume id to the existing's volume id + diesel::update(region_dsl::region) + .filter(region_dsl::id.eq(replacement.region_id)) + .set(region_dsl::volume_id.eq(existing.volume_id)) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceRegionError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + + // Update the existing volume's construction request to replace the + // existing region's SocketAddrV6 with the replacement region's + + // Copy the old volume's VCR, changing out the old region for the new. 
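The chain of checks above boils down to a small decision table over whether the old and new targets appear in the volume's VCR; only one of the four combinations lets the swap proceed. A condensed sketch of that classification (the enum is illustrative, not a type introduced by this change):

    /// Outcome of the idempotency check before rewriting the VCR.
    enum ReplaceCheck {
        Proceed,         // old target present, new absent: do the swap
        AlreadyHappened, // new present, old absent: a previous call finished
        IllegalState,    // both or neither present: bail out loudly
    }

    fn classify(old_in_vcr: bool, new_in_vcr: bool) -> ReplaceCheck {
        match (old_in_vcr, new_in_vcr) {
            (true, false) => ReplaceCheck::Proceed,
            (false, true) => ReplaceCheck::AlreadyHappened,
            (true, true) | (false, false) => ReplaceCheck::IllegalState,
        }
    }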
+ let new_vcr = match replace_region_in_vcr( + &old_vcr, + existing.region_addr, + replacement.region_addr, + ) { + Ok(new_vcr) => new_vcr, + Err(e) => { + return Err( + err.bail(ReplaceRegionError::RegionReplacementError(e)) + ); + } + }; + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(ReplaceRegionError::SerdeError(e)))?; + + // Update the existing volume's data + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(existing.volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceRegionError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + + // After region replacement, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(conn).await?; + + Ok(VolumeReplaceResult::Done) + } + + /// Replace a read-write region in a Volume with a new region. + pub async fn volume_replace_region( + &self, + existing: VolumeReplacementParams, + replacement: VolumeReplacementParams, + ) -> Result { let err = OptionalError::new(); let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("volume_replace_region") .transaction(&conn, |conn| { let err = err.clone(); + let existing = existing.clone(); + let replacement = replacement.clone(); async move { - // Grab the old volume first - let maybe_old_volume = { - volume_dsl::volume - .filter(volume_dsl::id.eq(existing.volume_id)) - .select(Volume::as_select()) - .first_async::(&conn) - .await - .optional() - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })? - }; - - let old_volume = if let Some(old_volume) = maybe_old_volume { - old_volume - } else { - // Existing volume was hard-deleted, so return here. We - // can't perform the region replacement now, and this - // will short-circuit the rest of the process. + Self::volume_replace_region_in_txn( + &conn, + err, + existing, + replacement, + ) + .await + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + ReplaceRegionError::Public(e) => e, - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); - }; + ReplaceRegionError::SerdeError(_) => { + Error::internal_error(&err.to_string()) + } - if old_volume.time_deleted.is_some() { - // Existing volume was soft-deleted, so return here for - // the same reason: the region replacement process - // should be short-circuited now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + ReplaceRegionError::RegionReplacementError(_) => { + Error::internal_error(&err.to_string()) + } } + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } - let old_vcr: VolumeConstructionRequest = - match serde_json::from_str(&old_volume.data()) { - Ok(vcr) => vcr, - Err(e) => { - return Err(err.bail(VolumeReplaceRegionError::SerdeError(e))); - }, - }; + async fn volume_replace_snapshot_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume_id: VolumeWithTarget, + existing: ExistingTarget, + replacement: ReplacementTarget, + volume_to_delete_id: VolumeToDelete, + ) -> Result { + use db::schema::volume::dsl as volume_dsl; + use db::schema::volume_resource_usage::dsl as ru_dsl; - // Does it look like this replacement already happened? 
- let old_region_in_vcr = match region_in_vcr(&old_vcr, &existing.region_addr) { - Ok(v) => v, - Err(e) => { - return Err(err.bail(VolumeReplaceRegionError::RegionReplacementError(e))); - }, - }; - let new_region_in_vcr = match region_in_vcr(&old_vcr, &replacement.region_addr) { - Ok(v) => v, - Err(e) => { - return Err(err.bail(VolumeReplaceRegionError::RegionReplacementError(e))); - }, - }; + // Grab the old volume first + let maybe_old_volume = { + volume_dsl::volume + .filter(volume_dsl::id.eq(volume_id.0)) + .select(Volume::as_select()) + .first_async::(conn) + .await + .optional() + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })? + }; - if !old_region_in_vcr && new_region_in_vcr { - // It does seem like the replacement happened - return Ok(VolumeReplaceResult::AlreadyHappened); - } + let old_volume = if let Some(old_volume) = maybe_old_volume { + old_volume + } else { + // Existing volume was hard-deleted, so return here. We can't + // perform the region replacement now, and this will short-circuit + // the rest of the process. - use db::schema::region::dsl as region_dsl; - use db::schema::volume::dsl as volume_dsl; + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + }; - // Set the existing region's volume id to the replacement's - // volume id - diesel::update(region_dsl::region) - .filter(region_dsl::id.eq(existing.region_id)) - .set(region_dsl::volume_id.eq(replacement.volume_id)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; + if old_volume.time_deleted.is_some() { + // Existing volume was soft-deleted, so return here for the same + // reason: the region replacement process should be short-circuited + // now. + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + } - // Set the replacement region's volume id to the existing's - // volume id - diesel::update(region_dsl::region) - .filter(region_dsl::id.eq(replacement.region_id)) - .set(region_dsl::volume_id.eq(existing.volume_id)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; + let old_vcr: VolumeConstructionRequest = + match serde_json::from_str(&old_volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(err.bail(ReplaceSnapshotError::SerdeError(e))); + } + }; - // Update the existing volume's construction request to - // replace the existing region's SocketAddrV6 with the - // replacement region's - - // Copy the old volume's VCR, changing out the old region - // for the new. - let new_vcr = match replace_region_in_vcr( - &old_vcr, - existing.region_addr, - replacement.region_addr, - ) { - Ok(new_vcr) => new_vcr, - Err(e) => { - return Err(err.bail( - VolumeReplaceRegionError::RegionReplacementError(e) - )); - } - }; + // Does it look like this replacement already happened? 
+ let old_target_in_vcr = + match read_only_target_in_vcr(&old_vcr, &existing.0) { + Ok(v) => v, + Err(e) => { + return Err(err.bail( + ReplaceSnapshotError::SnapshotReplacementError(e), + )); + } + }; - let new_volume_data = serde_json::to_string( - &new_vcr, - ) - .map_err(|e| { - err.bail(VolumeReplaceRegionError::SerdeError(e)) - })?; + let new_target_in_vcr = + match read_only_target_in_vcr(&old_vcr, &replacement.0) { + Ok(v) => v, + Err(e) => { + return Err(err.bail( + ReplaceSnapshotError::SnapshotReplacementError(e), + )); + } + }; - // Update the existing volume's data - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(existing.volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; + if !old_target_in_vcr && new_target_in_vcr { + // It does seem like the replacement happened + return Ok(VolumeReplaceResult::AlreadyHappened); + } - Ok(VolumeReplaceResult::Done) - } + // Update the existing volume's construction request to replace the + // existing target's SocketAddrV6 with the replacement target's + + // Copy the old volume's VCR, changing out the old target for the new. + let (new_vcr, replacements) = match replace_read_only_target_in_vcr( + &old_vcr, + existing, + replacement, + ) { + Ok(new_vcr) => new_vcr, + Err(e) => { + return Err( + err.bail(ReplaceSnapshotError::SnapshotReplacementError(e)) + ); + } + }; + + // Expect that this only happened once. If it happened multiple times, + // question everything: how would a snapshot be used twice?! + + if replacements != 1 { + return Err(err.bail( + ReplaceSnapshotError::UnexpectedReplacedTargets( + replacements, + 1, + ), + )); + } + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(ReplaceSnapshotError::SerdeError(e)))?; + + // Update the existing volume's data + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id.0)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + + // Make a new VCR that will stash the target to delete. The values here + // don't matter, just that it gets fed into the volume_delete machinery + // later. 
+ let vcr = VolumeConstructionRequest::Volume { + id: volume_to_delete_id.0, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: sled_agent_client::types::CrucibleOpts { + id: volume_to_delete_id.0, + target: vec![existing.0.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }; + + let volume_data = serde_json::to_string(&vcr) + .map_err(|e| err.bail(ReplaceSnapshotError::SerdeError(e)))?; + + // Update the volume to delete data + let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_to_delete_id.0)) + .filter(volume_dsl::time_deleted.is_null()) + .set(volume_dsl::data.eq(volume_data)) + .execute_async(conn) + .await?; + + if num_updated != 1 { + return Err(err.bail( + ReplaceSnapshotError::UnexpectedDatabaseUpdate(num_updated, 1), + )); + } + + // Update the appropriate volume resource usage records - it could + // either be a read-only region or a region snapshot, so determine what + // it is first + + let maybe_existing_usage = + Self::read_only_target_to_volume_resource_usage(conn, &existing.0) + .await?; + + let Some(existing_usage) = maybe_existing_usage else { + return Err(err.bail(ReplaceSnapshotError::CouldNotFindResource( + format!("could not find resource for {}", existing.0,), + ))); + }; + + // The "existing" target moved into the volume to delete + + Self::swap_volume_usage_records_for_resources( + conn, + existing_usage, + volume_id.0, + volume_to_delete_id.0, + ) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) }) + })?; + + let maybe_replacement_usage = + Self::read_only_target_to_volume_resource_usage( + conn, + &replacement.0, + ) + .await?; + + let Some(replacement_usage) = maybe_replacement_usage else { + return Err(err.bail(ReplaceSnapshotError::CouldNotFindResource( + format!("could not find resource for {}", existing.0,), + ))); + }; + + // The intention leaving this transaction is that the correct volume + // resource usage records exist, so: + // + // - if no usage record existed for the replacement usage, then create a + // new record that points to the volume id (this can happen if the + // volume to delete was blank when coming into this function) + // + // - if records exist for the "replacement" usage, then one of those + // will match the volume to delete id, so perform a swap instead to + // the volume id + + let existing_replacement_volume_usage_records = + Self::volume_usage_records_for_resource_query( + replacement_usage.clone(), + ) + .load_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })? 
+ // TODO be smart enough to .filter the above query + .into_iter() + .filter(|record| record.volume_id == volume_to_delete_id.0) + .count(); + + // The "replacement" target moved into the volume + + if existing_replacement_volume_usage_records == 0 { + // No matching record + let new_record = + VolumeResourceUsageRecord::new(volume_id.0, replacement_usage); + + diesel::insert_into(ru_dsl::volume_resource_usage) + .values(new_record) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + } else if existing_replacement_volume_usage_records == 1 { + // One matching record: perform swap + Self::swap_volume_usage_records_for_resources( + conn, + replacement_usage, + volume_to_delete_id.0, + volume_id.0, + ) .await .map_err(|e| { - if let Some(err) = err.take() { - match err { - VolumeReplaceRegionError::Public(e) => e, + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + } else { + // More than one matching record! + return Err(err.bail( + ReplaceSnapshotError::MultipleResourceUsageRecords(format!( + "{replacement_usage:?}" + )), + )); + } - VolumeReplaceRegionError::SerdeError(_) => { - Error::internal_error(&err.to_string()) - } + // After region snapshot replacement, validate invariants for all + // volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(conn).await?; - VolumeReplaceRegionError::RegionReplacementError(_) => { - Error::internal_error(&err.to_string()) - } - } - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + Ok(VolumeReplaceResult::Done) } /// Replace a read-only target in a Volume with a new region @@ -2226,235 +3273,46 @@ impl DataStore { replacement: ReplacementTarget, volume_to_delete_id: VolumeToDelete, ) -> Result { - #[derive(Debug, thiserror::Error)] - enum VolumeReplaceSnapshotError { - #[error("Error from Volume snapshot replacement: {0}")] - Public(Error), - - #[error("Serde error during Volume snapshot replacement: {0}")] - SerdeError(#[from] serde_json::Error), - - #[error("Snapshot replacement error: {0}")] - SnapshotReplacementError(#[from] anyhow::Error), - - #[error("Replaced {0} targets, expected {1}")] - UnexpectedReplacedTargets(usize, usize), - - #[error("Updated {0} database rows, expected {1}")] - UnexpectedDatabaseUpdate(usize, usize), - } let err = OptionalError::new(); let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("volume_replace_snapshot") .transaction(&conn, |conn| { let err = err.clone(); - async move { - use db::schema::volume::dsl as volume_dsl; - - // Grab the old volume first - let maybe_old_volume = { - volume_dsl::volume - .filter(volume_dsl::id.eq(volume_id.0)) - .select(Volume::as_select()) - .first_async::(&conn) - .await - .optional() - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceSnapshotError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })? - }; - - let old_volume = if let Some(old_volume) = maybe_old_volume { - old_volume - } else { - // Existing volume was hard-deleted, so return here. We - // can't perform the region replacement now, and this - // will short-circuit the rest of the process. 
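Taken together, the usage-record bookkeeping above leaves the replacement target attributed to the volume and the replaced target attributed to the volume-to-delete, regardless of whether the volume-to-delete started out blank. A toy model of that end state (a single owning volume per record here, which simplifies the real multi-record table):

    use std::collections::HashMap;

    /// target address -> volume id that the usage record points at
    fn swap_targets(
        usage: &mut HashMap<String, u128>,
        existing: &str,
        replacement: &str,
        volume_id: u128,
        volume_to_delete_id: u128,
    ) {
        // The replaced target is stashed in the volume-to-delete, so its
        // record follows it there.
        usage.insert(existing.to_string(), volume_to_delete_id);
        // The replacement target now backs the volume, whether it had a
        // record before (swap) or not (insert).
        usage.insert(replacement.to_string(), volume_id);
    }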
- return Ok(VolumeReplaceResult::ExistingVolumeDeleted); - }; - - if old_volume.time_deleted.is_some() { - // Existing volume was soft-deleted, so return here for - // the same reason: the region replacement process - // should be short-circuited now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); - } - - let old_vcr: VolumeConstructionRequest = - match serde_json::from_str(&old_volume.data()) { - Ok(vcr) => vcr, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SerdeError(e) - )); - }, - }; - - // Does it look like this replacement already happened? - let old_target_in_vcr = match read_only_target_in_vcr(&old_vcr, &existing.0) { - Ok(v) => v, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SnapshotReplacementError(e) - )); - }, - }; - - let new_target_in_vcr = match read_only_target_in_vcr(&old_vcr, &replacement.0) { - Ok(v) => v, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SnapshotReplacementError(e) - )); - }, - }; - - if !old_target_in_vcr && new_target_in_vcr { - // It does seem like the replacement happened - return Ok(VolumeReplaceResult::AlreadyHappened); - } - - // Update the existing volume's construction request to - // replace the existing target's SocketAddrV6 with the - // replacement target's - - // Copy the old volume's VCR, changing out the old target - // for the new. - let (new_vcr, replacements) = match replace_read_only_target_in_vcr( - &old_vcr, + async move { + Self::volume_replace_snapshot_in_txn( + &conn, + err, + volume_id, existing, replacement, - ) { - Ok(new_vcr) => new_vcr, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SnapshotReplacementError(e) - )); - } - }; - - // Expect that this only happened once. If it happened - // multiple times, question everything: how would a snapshot - // be used twice?! - - if replacements != 1 { - return Err(err.bail( - VolumeReplaceSnapshotError::UnexpectedReplacedTargets( - replacements, 1, - ) - )); - } - - let new_volume_data = serde_json::to_string( - &new_vcr, + volume_to_delete_id, ) - .map_err(|e| { - err.bail(VolumeReplaceSnapshotError::SerdeError(e)) - })?; - - // Update the existing volume's data - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id.0)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceSnapshotError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; - - // Make a new VCR that will stash the target to delete. The - // values here don't matter, just that it gets fed into the - // volume_delete machinery later. 
- let vcr = VolumeConstructionRequest::Volume { - id: volume_to_delete_id.0, - block_size: 512, - sub_volumes: vec![ - VolumeConstructionRequest::Region { - block_size: 512, - blocks_per_extent: 1, - extent_count: 1, - gen: 1, - opts: sled_agent_client::types::CrucibleOpts { - id: volume_to_delete_id.0, - target: vec![ - existing.0.to_string(), - ], - lossy: false, - flush_timeout: None, - key: None, - cert_pem: None, - key_pem: None, - root_cert_pem: None, - control: None, - read_only: true, - }, - } - ], - read_only_parent: None, - }; - - let volume_data = serde_json::to_string(&vcr) - .map_err(|e| { - err.bail(VolumeReplaceSnapshotError::SerdeError(e)) - })?; - - // Update the volume to delete data - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_to_delete_id.0)) - .filter(volume_dsl::time_deleted.is_null()) - .set(volume_dsl::data.eq(volume_data)) - .execute_async(&conn) - .await?; - - if num_updated != 1 { - return Err(err.bail( - VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate( - num_updated, 1, - ) - )); - } - - Ok(VolumeReplaceResult::Done) + .await } }) .await .map_err(|e| { if let Some(err) = err.take() { match err { - VolumeReplaceSnapshotError::Public(e) => e, - - VolumeReplaceSnapshotError::SerdeError(_) => { - Error::internal_error(&err.to_string()) - } - - VolumeReplaceSnapshotError::SnapshotReplacementError(_) => { - Error::internal_error(&err.to_string()) - } + ReplaceSnapshotError::Public(e) => e, - VolumeReplaceSnapshotError::UnexpectedReplacedTargets(_, _) => { - Error::internal_error(&err.to_string()) - } - - VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate(_, _) => { - Error::internal_error(&err.to_string()) - } + ReplaceSnapshotError::SerdeError(_) + | ReplaceSnapshotError::SnapshotReplacementError(_) + | ReplaceSnapshotError::UnexpectedReplacedTargets( + _, + _, + ) + | ReplaceSnapshotError::UnexpectedDatabaseUpdate( + _, + _, + ) + | ReplaceSnapshotError::AddressParseError(_) + | ReplaceSnapshotError::CouldNotFindResource(_) + | ReplaceSnapshotError::MultipleResourceUsageRecords( + _, + ) => Error::internal_error(&err.to_string()), } } else { public_error_from_diesel(e, ErrorHandler::Server) @@ -2463,7 +3321,7 @@ impl DataStore { } } -/// Return the targets from a VolumeConstructionRequest. +/// Return the read-only targets from a VolumeConstructionRequest. /// /// The targets of a volume construction request map to resources. pub fn read_only_resources_associated_with_volume( @@ -2508,6 +3366,67 @@ pub fn read_only_resources_associated_with_volume( } } +/// Return the read-write targets from a VolumeConstructionRequest. +/// +/// The targets of a volume construction request map to resources. +pub fn read_write_resources_associated_with_volume( + vcr: &VolumeConstructionRequest, + targets: &mut Vec, +) { + let mut parts: VecDeque<&VolumeConstructionRequest> = VecDeque::new(); + parts.push_back(&vcr); + + while let Some(vcr_part) = parts.pop_front() { + match vcr_part { + VolumeConstructionRequest::Volume { sub_volumes, .. } => { + for sub_volume in sub_volumes { + parts.push_back(sub_volume); + } + + // No need to look under read-only parent + } + + VolumeConstructionRequest::Url { .. } => { + // no action required + } + + VolumeConstructionRequest::Region { opts, .. } => { + if !opts.read_only { + for target in &opts.target { + targets.push(target.clone()); + } + } + } + + VolumeConstructionRequest::File { .. 
} => { + // no action required + } + } + } +} + +/// Return the number of read-write subvolumes in a VolumeConstructionRequest. +pub fn count_read_write_sub_volumes( + vcr: &VolumeConstructionRequest, +) -> anyhow::Result { + Ok(match vcr { + VolumeConstructionRequest::Volume { sub_volumes, .. } => { + sub_volumes.len() + } + + VolumeConstructionRequest::Url { .. } => 0, + + VolumeConstructionRequest::Region { .. } => { + // We don't support a pure Region VCR at the volume + // level in the database, so this choice should + // never be encountered. + bail!("Region not supported as a top level volume"); + } + + VolumeConstructionRequest::File { .. } => 0, + }) +} + /// Returns true if the sub-volumes of a Volume are all read-only pub fn volume_is_read_only( vcr: &VolumeConstructionRequest, @@ -2535,7 +3454,7 @@ pub fn volume_is_read_only( // We don't support a pure Region VCR at the volume // level in the database, so this choice should // never be encountered, but I want to know if it is. - panic!("Region not supported as a top level volume"); + bail!("Region not supported as a top level volume"); } VolumeConstructionRequest::File { .. } => { @@ -2713,66 +3632,253 @@ fn find_matching_rw_regions_in_volume( } } - VolumeConstructionRequest::Url { .. } => { - // nothing required + VolumeConstructionRequest::Url { .. } => { + // nothing required + } + + VolumeConstructionRequest::Region { opts, .. } => { + if !opts.read_only { + for target in &opts.target { + let parsed_target: SocketAddrV6 = target.parse()?; + if parsed_target.ip() == ip { + matched_targets.push(parsed_target); + } + } + } + } + + VolumeConstructionRequest::File { .. } => { + // nothing required + } + } + } + + Ok(()) +} + +impl DataStore { + pub async fn find_volumes_referencing_socket_addr( + &self, + opctx: &OpContext, + address: SocketAddr, + ) -> ListResultVec { + opctx.check_complex_operations_allowed()?; + + let mut volumes = Vec::new(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + let conn = self.pool_connection_authorized(opctx).await?; + + let needle = address.to_string(); + + while let Some(p) = paginator.next() { + use db::schema::volume::dsl; + + let haystack = + paginated(dsl::volume, dsl::id, &p.current_pagparams()) + .select(Volume::as_select()) + .get_results_async::(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + + paginator = p.found_batch(&haystack, &|r| r.id()); + + for volume in haystack { + if volume.data().contains(&needle) { + volumes.push(volume); + } + } + } + + Ok(volumes) + } +} + +// Add some validation that runs only for tests +#[cfg(any(test, feature = "testing"))] +impl DataStore { + fn volume_invariant_violated(msg: String) -> diesel::result::Error { + diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::CheckViolation, + Box::new(msg), + ) + } + + /// Tests each Volume to see if invariants hold + /// + /// If an invariant is violated, this function returns a `CheckViolation` + /// with the text of what invariant was violated. 
+ pub(crate) async fn validate_volume_invariants( + conn: &async_bb8_diesel::Connection, + ) -> Result<(), diesel::result::Error> { + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + + while let Some(p) = paginator.next() { + use db::schema::volume::dsl; + let haystack = + paginated(dsl::volume, dsl::id, &p.current_pagparams()) + .select(Volume::as_select()) + .get_results_async::(conn) + .await?; + + paginator = p.found_batch(&haystack, &|v| v.id()); + + for volume in haystack { + Self::validate_volume_has_all_resources(&conn, volume).await?; + } + } + + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + + while let Some(p) = paginator.next() { + use db::schema::region::dsl; + let haystack = + paginated(dsl::region, dsl::id, &p.current_pagparams()) + .select(Region::as_select()) + .get_results_async::(conn) + .await?; + + paginator = p.found_batch(&haystack, &|r| r.id()); + + for region in haystack { + Self::validate_read_only_region_has_no_snapshots(&conn, region) + .await?; + } + } + + Ok(()) + } + + /// Assert that the resources that comprise non-deleted volumes have not + /// been prematurely deleted. + async fn validate_volume_has_all_resources( + conn: &async_bb8_diesel::Connection, + volume: Volume, + ) -> Result<(), diesel::result::Error> { + if volume.time_deleted.is_some() { + // Do not need to validate resources for soft-deleted volumes + return Ok(()); + } + + let vcr: VolumeConstructionRequest = + serde_json::from_str(&volume.data()).unwrap(); + + // validate all read/write resources still exist + + let num_read_write_subvolumes = match count_read_write_sub_volumes(&vcr) + { + Ok(v) => v, + Err(e) => { + return Err(Self::volume_invariant_violated(format!( + "volume {} had error: {e}", + volume.id(), + ))); } + }; - VolumeConstructionRequest::Region { opts, .. } => { - if !opts.read_only { - for target in &opts.target { - let parsed_target: SocketAddrV6 = target.parse()?; - if parsed_target.ip() == ip { - matched_targets.push(parsed_target); - } - } + let mut read_write_targets = Vec::with_capacity( + REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, + ); + + read_write_resources_associated_with_volume( + &vcr, + &mut read_write_targets, + ); + + for target in read_write_targets { + let target = match target.parse() { + Ok(t) => t, + Err(e) => { + return Err(Self::volume_invariant_violated(format!( + "could not parse {target}: {e}" + ))); } - } + }; - VolumeConstructionRequest::File { .. 
} => { - // nothing required - } + let maybe_region = DataStore::target_to_region( + conn, + &target, + RegionType::ReadWrite, + ) + .await?; + + let Some(_region) = maybe_region else { + return Err(Self::volume_invariant_violated(format!( + "could not find resource for {target}" + ))); + }; } - } - Ok(()) -} + // validate all read-only resources still exist -impl DataStore { - pub async fn find_volumes_referencing_socket_addr( - &self, - opctx: &OpContext, - address: SocketAddr, - ) -> ListResultVec { - opctx.check_complex_operations_allowed()?; + let crucible_targets = { + let mut crucible_targets = CrucibleTargets::default(); + read_only_resources_associated_with_volume( + &vcr, + &mut crucible_targets, + ); + crucible_targets + }; - let mut volumes = Vec::new(); - let mut paginator = Paginator::new(SQL_BATCH_SIZE); - let conn = self.pool_connection_authorized(opctx).await?; + for read_only_target in &crucible_targets.read_only_targets { + let read_only_target = read_only_target.parse().map_err(|e| { + Self::volume_invariant_violated(format!( + "could not parse {read_only_target}: {e}" + )) + })?; - let needle = address.to_string(); + let maybe_usage = + DataStore::read_only_target_to_volume_resource_usage( + conn, + &read_only_target, + ) + .await?; - while let Some(p) = paginator.next() { - use db::schema::volume::dsl; + let Some(_usage) = maybe_usage else { + return Err(Self::volume_invariant_violated(format!( + "could not find resource for {read_only_target}" + ))); + }; + } - let haystack = - paginated(dsl::volume, dsl::id, &p.current_pagparams()) - .select(Volume::as_select()) - .get_results_async::(&*conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; + Ok(()) + } - paginator = p.found_batch(&haystack, &|r| r.id()); + /// Assert that read-only regions do not have any associated region + /// snapshots (see associated comment in `soft_delete_volume_in_txn`) + async fn validate_read_only_region_has_no_snapshots( + conn: &async_bb8_diesel::Connection, + region: Region, + ) -> Result<(), diesel::result::Error> { + if !region.read_only() { + return Ok(()); + } - for volume in haystack { - if volume.data().contains(&needle) { - volumes.push(volume); - } - } + use db::schema::volume_resource_usage::dsl; + + let matching_usage_records: Vec = + dsl::volume_resource_usage + .filter( + dsl::usage_type.eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_region_id.eq(region.id())) + .select(VolumeResourceUsageRecord::as_select()) + .get_results_async(conn) + .await? 
+ .into_iter() + .map(|r| r.try_into().unwrap()) + .collect(); + + if !matching_usage_records.is_empty() { + return Err(Self::volume_invariant_violated(format!( + "read-only region {} has matching usage records: {:?}", + region.id(), + matching_usage_records, + ))); } - Ok(volumes) + Ok(()) } } @@ -2780,7 +3886,13 @@ impl DataStore { mod tests { use super::*; + use crate::db::datastore::test::TestDatasets; + use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD; use crate::db::pub_test_utils::TestDatabase; + use nexus_config::RegionAllocationStrategy; + use nexus_db_model::SqlU16; + use nexus_types::external_api::params::DiskSource; + use omicron_common::api::external::ByteCount; use omicron_test_utils::dev; use sled_agent_client::types::CrucibleOpts; @@ -2811,8 +3923,8 @@ mod tests { .await .unwrap(); - // Add old CrucibleResources json in the `resources_to_clean_up` column - - // this was before the `deleting` column / field was added to + // Add old CrucibleResources json in the `resources_to_clean_up` column + // - this was before the `deleting` column / field was added to // ResourceSnapshot. { @@ -2894,47 +4006,82 @@ mod tests { let logctx = dev::test_setup_log("test_volume_replace_region"); let log = logctx.log.new(o!()); let db = TestDatabase::new_with_datastore(&log).await; + let opctx = db.opctx(); let datastore = db.datastore(); + let conn = datastore.pool_connection_for_tests().await.unwrap(); - // Insert four Region records (three, plus one additionally allocated) + let _test_datasets = TestDatasets::create( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; let volume_id = Uuid::new_v4(); - let new_volume_id = Uuid::new_v4(); - - let mut region_and_volume_ids = [ - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), new_volume_id), - ]; + let volume_to_delete_id = Uuid::new_v4(); - { - let conn = datastore.pool_connection_for_tests().await.unwrap(); + let datasets_and_regions = datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); - for i in 0..4 { - let (_, volume_id) = region_and_volume_ids[i]; + let mut region_addresses: Vec = + Vec::with_capacity(datasets_and_regions.len()); - let region = Region::new( - Uuid::new_v4(), // dataset id - volume_id, - 512_i64.try_into().unwrap(), - 10, - 10, - 10001, - false, - ); + for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + // `disk_region_allocate` won't put any ports in, so add fake ones + // here + use nexus_db_model::schema::region::dsl; + diesel::update(dsl::region) + .filter(dsl::id.eq(region.id())) + .set(dsl::port.eq(Some::((100 + i as u16).into()))) + .execute_async(&*conn) + .await + .unwrap(); - region_and_volume_ids[i].0 = region.id(); + let address: SocketAddrV6 = + datastore.region_addr(region.id()).await.unwrap().unwrap(); - use nexus_db_model::schema::region::dsl; - diesel::insert_into(dsl::region) - .values(region.clone()) - .execute_async(&*conn) - .await - .unwrap(); - } + region_addresses.push(address.to_string()); } + // Manually create a replacement region at the first dataset + let replacement_region = { + let (dataset, region) = &datasets_and_regions[0]; + let region = Region::new( + dataset.id(), + volume_to_delete_id, + region.block_size().try_into().unwrap(), + region.blocks_per_extent(), + 
region.extent_count(), + 111, + false, // read-write + ); + + use nexus_db_model::schema::region::dsl; + diesel::insert_into(dsl::region) + .values(region.clone()) + .execute_async(&*conn) + .await + .unwrap(); + + region + }; + + let replacement_region_addr: SocketAddrV6 = datastore + .region_addr(replacement_region.id()) + .await + .unwrap() + .unwrap(); + let _volume = datastore .volume_create(nexus_db_model::Volume::new( volume_id, @@ -2949,9 +4096,10 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), // target to replace - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + // target to replace + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -2972,26 +4120,19 @@ mod tests { // Replace one - let target = region_and_volume_ids[0]; - let replacement = region_and_volume_ids[3]; - let volume_replace_region_result = datastore .volume_replace_region( /* target */ db::datastore::VolumeReplacementParams { - volume_id: target.1, - region_id: target.0, - region_addr: "[fd00:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id, + region_id: datasets_and_regions[0].1.id(), + region_addr: region_addresses[0].parse().unwrap(), }, /* replacement */ db::datastore::VolumeReplacementParams { - volume_id: replacement.1, - region_id: replacement.0, - region_addr: "[fd55:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id: volume_to_delete_id, + region_id: replacement_region.id(), + region_addr: replacement_region_addr, }, ) .await @@ -3018,9 +4159,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd55:1122:3344:101::1]:11111"), // replaced - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + replacement_region_addr.to_string(), // replaced + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3041,19 +4182,15 @@ mod tests { .volume_replace_region( /* target */ db::datastore::VolumeReplacementParams { - volume_id: target.1, - region_id: replacement.0, - region_addr: "[fd55:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id, + region_id: replacement_region.id(), + region_addr: replacement_region_addr, }, /* replacement */ db::datastore::VolumeReplacementParams { - volume_id: replacement.1, - region_id: target.0, - region_addr: "[fd00:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id: volume_to_delete_id, + region_id: datasets_and_regions[0].1.id(), + region_addr: region_addresses[0].parse().unwrap(), }, ) .await @@ -3080,9 +4217,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), // back to what it was - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), // back to what it was + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3107,13 +4244,127 @@ mod tests { let logctx = dev::test_setup_log("test_volume_replace_snapshot"); let log = logctx.log.new(o!()); let db = TestDatabase::new_with_datastore(&log).await; + let opctx = db.opctx(); let datastore = db.datastore(); + let conn = datastore.pool_connection_for_tests().await.unwrap(); - // Insert two volumes: one with the target to replace, and one temporary - // "volume to delete" that's blank. 
+ let _test_datasets = TestDatasets::create( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; let volume_id = Uuid::new_v4(); let volume_to_delete_id = Uuid::new_v4(); + + let datasets_and_regions = datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); + + let mut region_addresses: Vec = + Vec::with_capacity(datasets_and_regions.len()); + + for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + // `disk_region_allocate` won't put any ports in, so add fake ones + // here + use nexus_db_model::schema::region::dsl; + diesel::update(dsl::region) + .filter(dsl::id.eq(region.id())) + .set(dsl::port.eq(Some::((100 + i as u16).into()))) + .execute_async(&*conn) + .await + .unwrap(); + + let address: SocketAddrV6 = + datastore.region_addr(region.id()).await.unwrap().unwrap(); + + region_addresses.push(address.to_string()); + } + + // Manually create a replacement region at the first dataset + let replacement_region = { + let (dataset, region) = &datasets_and_regions[0]; + let region = Region::new( + dataset.id(), + volume_to_delete_id, + region.block_size().try_into().unwrap(), + region.blocks_per_extent(), + region.extent_count(), + 111, + true, // read-only + ); + + use nexus_db_model::schema::region::dsl; + diesel::insert_into(dsl::region) + .values(region.clone()) + .execute_async(&*conn) + .await + .unwrap(); + + region + }; + + let replacement_region_addr: SocketAddrV6 = datastore + .region_addr(replacement_region.id()) + .await + .unwrap() + .unwrap(); + + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + let address_1 = String::from("[fd00:1122:3344:104::1]:400"); + let address_2 = String::from("[fd00:1122:3344:105::1]:401"); + let address_3 = String::from("[fd00:1122:3344:106::1]:402"); + + let region_snapshots = [ + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_1.clone(), + ), + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_2.clone(), + ), + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_3.clone(), + ), + ]; + + datastore + .region_snapshot_create(region_snapshots[0].clone()) + .await + .unwrap(); + datastore + .region_snapshot_create(region_snapshots[1].clone()) + .await + .unwrap(); + datastore + .region_snapshot_create(region_snapshots[2].clone()) + .await + .unwrap(); + + // Insert two volumes: one with the target to replace, and one temporary + // "volume to delete" that's blank. Validate the pre-replacement volume + // resource usage records. 
+ let rop_id = Uuid::new_v4(); datastore @@ -3130,9 +4381,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3154,9 +4405,9 @@ mod tests { id: rop_id, target: vec![ // target to replace - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1.clone(), + address_2.clone(), + address_3.clone(), ], lossy: false, flush_timeout: None, @@ -3175,6 +4426,22 @@ mod tests { .await .unwrap(); + for region_snapshot in ®ion_snapshots { + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + } + datastore .volume_create(nexus_db_model::Volume::new( volume_to_delete_id, @@ -3189,21 +4456,33 @@ mod tests { .await .unwrap(); + // `volume_create` above was called with a blank volume, so no usage + // record will have been created for the read-only region + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + // Do the replacement let volume_replace_snapshot_result = datastore .volume_replace_snapshot( VolumeWithTarget(volume_id), - ExistingTarget("[fd00:1122:3344:104::1]:400".parse().unwrap()), - ReplacementTarget( - "[fd55:1122:3344:101::1]:111".parse().unwrap(), - ), + ExistingTarget(address_1.parse().unwrap()), + ReplacementTarget(replacement_region_addr), VolumeToDelete(volume_to_delete_id), ) .await .unwrap(); - assert_eq!(volume_replace_snapshot_result, VolumeReplaceResult::Done,); + assert_eq!(volume_replace_snapshot_result, VolumeReplaceResult::Done); // Ensure the shape of the resulting VCRs @@ -3225,9 +4504,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3249,9 +4528,9 @@ mod tests { id: rop_id, target: vec![ // target replaced - String::from("[fd55:1122:3344:101::1]:111"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + replacement_region_addr.to_string(), + address_2.clone(), + address_3.clone(), ], lossy: false, flush_timeout: None, @@ -3291,7 +4570,7 @@ mod tests { id: volume_to_delete_id, target: vec![ // replaced target stashed here - String::from("[fd00:1122:3344:104::1]:400"), + address_1.clone(), ], lossy: false, flush_timeout: None, @@ -3307,15 +4586,54 @@ mod tests { }, ); + // Validate the post-replacement volume resource usage records + + for (i, region_snapshot) in region_snapshots.iter().enumerate() { + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, 
+ ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + + match i { + 0 => { + assert_eq!(usage[0].volume_id, volume_to_delete_id); + } + + 1 | 2 => { + assert_eq!(usage[0].volume_id, volume_id); + } + + _ => panic!("out of range"), + } + } + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + // Now undo the replacement. Note volume ID is not swapped. let volume_replace_snapshot_result = datastore .volume_replace_snapshot( VolumeWithTarget(volume_id), - ExistingTarget("[fd55:1122:3344:101::1]:111".parse().unwrap()), - ReplacementTarget( - "[fd00:1122:3344:104::1]:400".parse().unwrap(), - ), + ExistingTarget(replacement_region_addr), + ReplacementTarget(address_1.parse().unwrap()), VolumeToDelete(volume_to_delete_id), ) .await @@ -3342,9 +4660,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3366,9 +4684,7 @@ mod tests { id: rop_id, target: vec![ // back to what it was - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1, address_2, address_3, ], lossy: false, flush_timeout: None, @@ -3408,7 +4724,7 @@ mod tests { id: volume_to_delete_id, target: vec![ // replacement stashed here - String::from("[fd55:1122:3344:101::1]:111"), + replacement_region_addr.to_string(), ], lossy: false, flush_timeout: None, @@ -3424,6 +4740,36 @@ mod tests { }, ); + // Validate the post-post-replacement volume resource usage records + + for region_snapshot in ®ion_snapshots { + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + } + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_to_delete_id); + db.terminate().await; logctx.cleanup_successful(); } @@ -3438,6 +4784,41 @@ mod tests { let volume_id = Uuid::new_v4(); + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + let address_1 = String::from("[fd00:1122:3344:104::1]:400"); + let address_2 = String::from("[fd00:1122:3344:105::1]:401"); + let address_3 = String::from("[fd00:1122:3344:106::1]:402"); + + datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_1.clone(), + )) + .await + .unwrap(); + datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_2.clone(), + )) + .await + .unwrap(); + datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_3.clone(), + )) + .await + .unwrap(); + // case where the needle is found datastore @@ -3456,9 +4837,9 @@ mod 
tests { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1.clone(), + address_2, + address_3, ], lossy: false, flush_timeout: None, @@ -3480,7 +4861,7 @@ mod tests { let volumes = datastore .find_volumes_referencing_socket_addr( &opctx, - "[fd00:1122:3344:104::1]:400".parse().unwrap(), + address_1.parse().unwrap(), ) .await .unwrap(); @@ -4008,4 +5389,52 @@ mod tests { } ); } + + /// Assert that there are no "deleted" r/w regions found when the associated + /// volume hasn't been created yet. + #[tokio::test] + async fn test_no_find_deleted_region_for_no_volume() { + let logctx = + dev::test_setup_log("test_no_find_deleted_region_for_no_volume"); + let log = logctx.log.new(o!()); + let db = TestDatabase::new_with_datastore(&log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let _test_datasets = TestDatasets::create( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; + + let volume_id = Uuid::new_v4(); + + // Assert that allocating regions without creating the volume does not + // cause them to be returned as "deleted" regions, as this can cause + // sagas that allocate regions to race with the volume delete saga and + // cause premature region deletion. + + let _datasets_and_regions = datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); + + let deleted_regions = datastore + .find_deleted_volume_regions() + .await + .expect("find_deleted_volume_regions"); + + assert!(deleted_regions.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/queries/mod.rs b/nexus/db-queries/src/db/queries/mod.rs index 02800e3a3c..5f34c7cfb3 100644 --- a/nexus/db-queries/src/db/queries/mod.rs +++ b/nexus/db-queries/src/db/queries/mod.rs @@ -14,7 +14,6 @@ pub mod network_interface; pub mod oximeter; pub mod region_allocation; pub mod virtual_provisioning_collection_update; -pub mod volume; pub mod vpc; pub mod vpc_subnet; diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index efdc9a21fb..a9130d87f7 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -255,7 +255,8 @@ pub fn allocation_query( ").param().sql(" AS blocks_per_extent, ").param().sql(" AS extent_count, NULL AS port, - ").param().sql(" AS read_only + ").param().sql(" AS read_only, + FALSE as deleting FROM shuffled_candidate_datasets") // Only select the *additional* number of candidate regions for the required // redundancy level @@ -368,7 +369,7 @@ pub fn allocation_query( .sql(" inserted_regions AS ( INSERT INTO region - (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only) + (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting) SELECT ").sql(AllColumnsOfRegion::with_prefix("candidate_regions")).sql(" FROM candidate_regions WHERE diff --git a/nexus/db-queries/src/db/queries/volume.rs b/nexus/db-queries/src/db/queries/volume.rs deleted file mode 100644 index e7fe832a82..0000000000 --- 
a/nexus/db-queries/src/db/queries/volume.rs +++ /dev/null @@ -1,114 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Helper queries for working with volumes. - -use crate::db; -use crate::db::pool::DbConnection; -use diesel::expression::is_aggregate; -use diesel::expression::ValidGrouping; -use diesel::pg::Pg; -use diesel::query_builder::AstPass; -use diesel::query_builder::Query; -use diesel::query_builder::QueryFragment; -use diesel::query_builder::QueryId; -use diesel::sql_types; -use diesel::Column; -use diesel::Expression; -use diesel::QueryResult; -use diesel::RunQueryDsl; - -/// Produces a query fragment that will conditionally reduce the volume -/// references for region_snapshot rows whose snapshot_addr column is part of a -/// list. -/// -/// The output should look like: -/// -/// ```sql -/// update region_snapshot set -/// volume_references = volume_references - 1, -/// deleting = case when volume_references = 1 -/// then true -/// else false -/// end -/// where -/// snapshot_addr in ('a1', 'a2', 'a3') and -/// volume_references >= 1 and -/// deleting = false -/// returning * -/// ``` -#[must_use = "Queries must be executed"] -pub struct ConditionallyDecreaseReferences { - snapshot_addrs: Vec, -} - -impl ConditionallyDecreaseReferences { - pub fn new(snapshot_addrs: Vec) -> Self { - Self { snapshot_addrs } - } -} - -impl QueryId for ConditionallyDecreaseReferences { - type QueryId = (); - const HAS_STATIC_QUERY_ID: bool = false; -} - -impl QueryFragment for ConditionallyDecreaseReferences { - fn walk_ast<'a>(&'a self, mut out: AstPass<'_, 'a, Pg>) -> QueryResult<()> { - use db::schema::region_snapshot::dsl; - - out.push_sql("UPDATE "); - dsl::region_snapshot.walk_ast(out.reborrow())?; - out.push_sql(" SET "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" = "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" - 1, "); - out.push_identifier(dsl::deleting::NAME)?; - out.push_sql(" = CASE WHEN "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" = 1 THEN TRUE ELSE FALSE END WHERE "); - out.push_identifier(dsl::snapshot_addr::NAME)?; - out.push_sql(" IN ("); - - // If self.snapshot_addrs is empty, this query fragment will - // intentionally not update any region_snapshot rows. 
- for (i, snapshot_addr) in self.snapshot_addrs.iter().enumerate() { - out.push_bind_param::(snapshot_addr)?; - if i == self.snapshot_addrs.len() - 1 { - out.push_sql(" "); - } else { - out.push_sql(", "); - } - } - - out.push_sql(") AND "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" >= 1 AND "); - out.push_identifier(dsl::deleting::NAME)?; - out.push_sql(" = false RETURNING *"); - - Ok(()) - } -} - -impl Expression for ConditionallyDecreaseReferences { - type SqlType = sql_types::Array; -} - -impl ValidGrouping - for ConditionallyDecreaseReferences -{ - type IsAggregate = is_aggregate::Never; -} - -impl RunQueryDsl for ConditionallyDecreaseReferences {} - -type SelectableSql = < - >::SelectExpression as diesel::Expression ->::SqlType; - -impl Query for ConditionallyDecreaseReferences { - type SqlType = SelectableSql; -} diff --git a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql index 9ee71b403f..f7b5354b42 100644 --- a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -101,7 +102,8 @@ WITH $9 AS blocks_per_extent, $10 AS extent_count, NULL AS port, - $11 AS read_only + $11 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -211,7 +213,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -223,7 +226,8 @@ WITH candidate_regions.blocks_per_extent, candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -238,7 +242,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -301,7 +306,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -331,7 +337,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql index 369410c68c..9083967714 100644 --- a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -99,7 +100,8 @@ WITH $8 AS blocks_per_extent, $9 AS extent_count, NULL AS port, - $10 AS read_only + $10 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -209,7 +211,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -221,7 +224,8 @@ WITH candidate_regions.blocks_per_extent, candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + 
candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -236,7 +240,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -299,7 +304,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -329,7 +335,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql index 9251139c4e..589d6976a8 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -112,7 +113,8 @@ WITH $10 AS blocks_per_extent, $11 AS extent_count, NULL AS port, - $12 AS read_only + $12 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -222,7 +224,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -234,7 +237,8 @@ WITH candidate_regions.blocks_per_extent, candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -249,7 +253,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -312,7 +317,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -342,7 +348,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql index c8aa8adf2e..3e1b047773 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -110,7 +111,8 @@ WITH $9 AS blocks_per_extent, $10 AS extent_count, NULL AS port, - $11 AS read_only + $11 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -220,7 +222,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -232,7 +235,8 @@ WITH candidate_regions.blocks_per_extent, 
candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -247,7 +251,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -310,7 +315,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -340,7 +346,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 963f0bdcac..ad39777054 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -778,8 +778,7 @@ impl BackgroundTasksInitializer { "detect if region snapshots need replacement and begin the \ process", period: config.region_snapshot_replacement_start.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new(RegionSnapshotReplacementDetector::disabled( + task_impl: Box::new(RegionSnapshotReplacementDetector::new( datastore.clone(), sagas.clone(), )), @@ -795,13 +794,10 @@ impl BackgroundTasksInitializer { period: config .region_snapshot_replacement_garbage_collection .period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementGarbageCollect::disabled( - datastore.clone(), - sagas.clone(), - ), - ), + task_impl: Box::new(RegionSnapshotReplacementGarbageCollect::new( + datastore.clone(), + sagas.clone(), + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_garbage_collection, @@ -813,13 +809,10 @@ impl BackgroundTasksInitializer { "detect what volumes were affected by a region snapshot \ replacement, and run the step saga for them", period: config.region_snapshot_replacement_step.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementFindAffected::disabled( - datastore.clone(), - sagas.clone(), - ), - ), + task_impl: Box::new(RegionSnapshotReplacementFindAffected::new( + datastore.clone(), + sagas.clone(), + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_step, @@ -831,10 +824,9 @@ impl BackgroundTasksInitializer { "complete a region snapshot replacement if all the steps are \ done", period: config.region_snapshot_replacement_finish.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementFinishDetector::disabled(datastore), - ), + task_impl: Box::new(RegionSnapshotReplacementFinishDetector::new( + datastore, + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_finish, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index 83078cb978..caa2fa7bed 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs 
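
The init.rs hunk above swaps the four region snapshot replacement background tasks from their `disabled` constructors back to `new`, and the hunks that follow remove the `disabled` escape hatch from the task types themselves. For orientation, the pattern being deleted looks roughly like the sketch below; it is a simplified stand-in (a made-up `Status` struct and a plain `activate` method), not the real nexus `BackgroundTask` trait, which returns a JSON status value.

// Simplified model of the "disabled task" pattern removed in the following
// hunks. `Status` and `FinishDetector` are illustrative stand-ins, not the
// real nexus types.
#[derive(Default)]
struct Status {
    requests_completed: usize,
}

struct FinishDetector {
    disabled: bool,
}

impl FinishDetector {
    fn activate(&self) -> Status {
        let mut status = Status::default();
        if self.disabled {
            // Short-circuit: report an empty status and do no work.
            return status;
        }
        // ... the real task would transition replacement requests here ...
        status.requests_completed += 1;
        status
    }
}

fn main() {
    // A disabled task short-circuits and reports nothing; an enabled one
    // actually does its work. Removing the flag re-enables every activation.
    assert_eq!(FinishDetector { disabled: true }.activate().requests_completed, 0);
    assert_eq!(FinishDetector { disabled: false }.activate().requests_completed, 1);
}
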
@@ -19,17 +19,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFinishDetector { datastore: Arc, - disabled: bool, } impl RegionSnapshotReplacementFinishDetector { - #[allow(dead_code)] pub fn new(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore, disabled: false } - } - - pub fn disabled(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore, disabled: true } + RegionSnapshotReplacementFinishDetector { datastore } } async fn transition_requests_to_done( @@ -159,10 +153,6 @@ impl BackgroundTask for RegionSnapshotReplacementFinishDetector { async move { let mut status = RegionSnapshotReplacementFinishStatus::default(); - if self.disabled { - return json!(status); - } - self.transition_requests_to_done(opctx, &mut status).await; json!(status) diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index eb171fda12..f3b1b68198 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -22,28 +22,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementGarbageCollect { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementGarbageCollect { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementGarbageCollect { - datastore, - sagas, - disabled: false, - } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementGarbageCollect { - datastore, - sagas, - disabled: true, - } + RegionSnapshotReplacementGarbageCollect { datastore, sagas } } async fn send_garbage_collect_request( @@ -152,10 +135,6 @@ impl BackgroundTask for RegionSnapshotReplacementGarbageCollect { let mut status = RegionSnapshotReplacementGarbageCollectStatus::default(); - if self.disabled { - return json!(status); - } - self.clean_up_region_snapshot_replacement_volumes( opctx, &mut status, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index 8fd1e55975..bc739ecf27 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -29,20 +29,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementDetector { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementDetector { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementDetector { datastore, sagas, disabled: false } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementDetector { datastore, sagas, disabled: true } + RegionSnapshotReplacementDetector { datastore, sagas } } async fn send_start_request( @@ -246,10 +237,6 @@ impl BackgroundTask for RegionSnapshotReplacementDetector { async { let mut status = RegionSnapshotReplacementStartStatus::default(); - if self.disabled { - return json!(status); - } - self.create_requests_for_region_snapshots_on_expunged_disks( opctx, &mut status, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index da05500a58..29878364e6 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ 
b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -42,28 +42,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFindAffected { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementFindAffected { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementFindAffected { - datastore, - sagas, - disabled: false, - } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementFindAffected { - datastore, - sagas, - disabled: true, - } + RegionSnapshotReplacementFindAffected { datastore, sagas } } async fn send_start_request( @@ -233,14 +216,16 @@ impl RegionSnapshotReplacementFindAffected { Ok(Some(region_snapshot)) => region_snapshot, Ok(None) => { + // If the associated region snapshot was deleted, then there + // are no more volumes that reference it. This is not an + // error! Continue processing the other requests. let s = format!( - "region snapshot {} {} {} not found!", + "region snapshot {} {} {} not found", request.old_dataset_id, request.old_region_id, request.old_snapshot_id, ); - error!(&log, "{s}"); - status.errors.push(s); + info!(&log, "{s}"); continue; } @@ -452,10 +437,6 @@ impl BackgroundTask for RegionSnapshotReplacementFindAffected { async move { let mut status = RegionSnapshotReplacementStepStatus::default(); - if self.disabled { - return json!(status); - } - // Importantly, clean old steps up before finding affected volumes! // Otherwise, will continue to find the snapshot in volumes to // delete, and will continue to see conflicts in next function. @@ -500,6 +481,19 @@ mod test { ) -> Uuid { let new_volume_id = Uuid::new_v4(); + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + snapshot_addr.clone(), + )) + .await + .unwrap(); + let volume_construction_request = VolumeConstructionRequest::Volume { id: new_volume_id, block_size: 0, diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index d2e3053668..7efe70da77 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -934,6 +934,34 @@ pub(crate) mod test { .is_none() } + async fn no_volume_resource_usage_records_exist( + datastore: &DataStore, + ) -> bool { + use nexus_db_queries::db::schema::volume_resource_usage::dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + let rows = datastore + .transaction_retry_wrapper("no_volume_resource_usage_records_exist") + .transaction(&conn, |conn| async move { + conn.batch_execute_async( + nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, + ) + .await + .unwrap(); + + Ok(dsl::volume_resource_usage + .count() + .get_result_async::(&conn) + .await + .unwrap()) + }) + .await + .unwrap(); + + rows == 0 + } + async fn no_virtual_provisioning_resource_records_exist( datastore: &DataStore, ) -> bool { @@ -1030,6 +1058,7 @@ pub(crate) mod test { .await; assert!(no_disk_records_exist(datastore).await); assert!(no_volume_records_exist(datastore).await); + assert!(no_volume_resource_usage_records_exist(datastore).await); assert!( no_virtual_provisioning_resource_records_exist(datastore).await ); diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 6d56732388..4e6c3e1e16 100644 --- 
a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -58,7 +58,6 @@ use crate::app::db::datastore::ReplacementTarget; use crate::app::db::datastore::VolumeReplaceResult; use crate::app::db::datastore::VolumeToDelete; use crate::app::db::datastore::VolumeWithTarget; -use crate::app::db::lookup::LookupPath; use crate::app::sagas::common_storage::find_only_new_region; use crate::app::sagas::declare_saga_actions; use crate::app::RegionAllocationStrategy; @@ -242,12 +241,21 @@ async fn rsrss_get_alloc_region_params( ); // Look up the existing snapshot - let (.., db_snapshot) = LookupPath::new(&opctx, &osagactx.datastore()) - .snapshot_id(params.request.old_snapshot_id) - .fetch() + let maybe_db_snapshot = osagactx + .datastore() + .snapshot_get(&opctx, params.request.old_snapshot_id) .await .map_err(ActionError::action_failed)?; + let Some(db_snapshot) = maybe_db_snapshot else { + return Err(ActionError::action_failed(Error::internal_error( + &format!( + "snapshot {} was hard deleted!", + params.request.old_snapshot_id + ), + ))); + }; + // Find the region to replace let db_region = osagactx .datastore() @@ -480,12 +488,22 @@ async fn rsrss_get_old_snapshot_volume_id( ¶ms.serialized_authn, ); - let (.., db_snapshot) = LookupPath::new(&opctx, &osagactx.datastore()) - .snapshot_id(params.request.old_snapshot_id) - .fetch() + // Look up the existing snapshot + let maybe_db_snapshot = osagactx + .datastore() + .snapshot_get(&opctx, params.request.old_snapshot_id) .await .map_err(ActionError::action_failed)?; + let Some(db_snapshot) = maybe_db_snapshot else { + return Err(ActionError::action_failed(Error::internal_error( + &format!( + "snapshot {} was hard deleted!", + params.request.old_snapshot_id + ), + ))); + }; + Ok(db_snapshot.volume_id) } @@ -510,6 +528,13 @@ async fn rsrss_create_fake_volume( gen: 0, opts: CrucibleOpts { id: new_volume_id, + // Do not put the new region ID here: it will be deleted during + // the associated garbage collection saga if + // `volume_replace_snapshot` does not perform the swap (which + // happens when the snapshot volume is deleted) and we still + // want it to exist when performing other replacement steps. If + // the replacement does occur, then the old address will swapped + // in here. target: vec![], lossy: false, flush_timeout: None, @@ -673,14 +698,15 @@ async fn rsrss_replace_snapshot_in_volume( } VolumeReplaceResult::ExistingVolumeDeleted => { - // Unwind the saga here to clean up the resources allocated during - // this saga. The associated background task will transition this - // request's state to Completed. - - Err(ActionError::action_failed(format!( - "existing volume {} deleted", - replacement_params.old_volume_id - ))) + // If the snapshot volume was deleted, we still want to proceed with + // replacing the rest of the uses of the region snapshot. Note this + // also covers the case where this saga node runs (performing the + // replacement), the executor crashes before it can record that + // success, and then before this node is rerun the snapshot is + // deleted. If this saga unwound here, that would violate the + // property of idempotency. 
+ + Ok(()) } } } @@ -801,6 +827,7 @@ pub(crate) mod test { const SNAPSHOT_NAME: &str = "my-snap"; const PROJECT_NAME: &str = "springfield-squidport"; + /// Create four zpools, a disk, and a snapshot of that disk async fn prepare_for_test( cptestctx: &ControlPlaneTestContext, ) -> PrepareResult { diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index c5e1b49f61..12d48cb367 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -29,7 +29,7 @@ use super::NexusSaga; use crate::app::sagas::declare_saga_actions; use nexus_db_model::Dataset; use nexus_db_model::Region; -use nexus_db_model::RegionSnapshot; +use nexus_db_model::Volume; use nexus_db_queries::authn; use nexus_db_queries::db::datastore::CrucibleResources; use nexus_types::identity::Asset; @@ -330,8 +330,7 @@ async fn svd_delete_crucible_snapshot_records( Ok(()) } -type FreedCrucibleRegions = - Vec<(Dataset, Region, Option, Uuid)>; +type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; /// Deleting region snapshots in a previous saga node may have freed up regions /// that were deleted in the DB but couldn't be deleted by the Crucible Agent @@ -394,7 +393,7 @@ type FreedCrucibleRegions = /// The disk's volume has no read only resources, while the snapshot's volume /// does. The disk volume's targets are all regions (backed by downstairs that /// are read/write) while the snapshot volume's targets are all snapshots -/// (backed by volumes that are read-only). The two volumes are linked in the +/// (backed by downstairs that are read-only). The two volumes are linked in the /// sense that the snapshots from the second are contained *within* the regions /// of the first, reflecting the resource nesting from ZFS. This is also /// reflected in the REST endpoint that the Crucible agent uses: @@ -436,7 +435,7 @@ async fn svd_find_freed_crucible_regions( // Don't serialize the whole Volume, as the data field contains key material! Ok(freed_datasets_regions_and_volumes .into_iter() - .map(|x| (x.0, x.1, x.2, x.3.id())) + .map(|x| (x.0, x.1, x.2.map(|v: Volume| v.id()))) .collect()) } @@ -451,23 +450,7 @@ async fn svd_delete_freed_crucible_regions( let freed_datasets_regions_and_volumes = sagactx.lookup::("freed_crucible_regions")?; - for (dataset, region, region_snapshot, volume_id) in - freed_datasets_regions_and_volumes - { - if region_snapshot.is_some() { - // We cannot delete this region yet, the snapshot has not been - // deleted. This can occur when multiple volume delete sagas run - // concurrently: one will decrement the crucible resources (but - // hasn't made the appropriate DELETE calls to remove the running - // snapshots and snapshots yet), and the other will be here trying - // to delete the region. This race results in the crucible agent - // returning "must delete snapshots first" and causing saga unwinds. - // - // Another volume delete (probably the one racing with this one!) - // will pick up this region and remove it. 
- continue; - } - + for (dataset, region, volume_id) in freed_datasets_regions_and_volumes { // Send DELETE calls to the corresponding Crucible agents osagactx .nexus() @@ -496,14 +479,16 @@ async fn svd_delete_freed_crucible_regions( })?; // Remove volume DB record - osagactx.datastore().volume_hard_delete(volume_id).await.map_err( - |e| { - ActionError::action_failed(format!( - "failed to volume_hard_delete {}: {:?}", - volume_id, e, - )) - }, - )?; + if let Some(volume_id) = volume_id { + osagactx.datastore().volume_hard_delete(volume_id).await.map_err( + |e| { + ActionError::action_failed(format!( + "failed to volume_hard_delete {}: {:?}", + volume_id, e, + )) + }, + )?; + } } Ok(()) diff --git a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index 859796f556..b0474037a8 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -332,7 +332,7 @@ pub async fn run_replacement_tasks_to_completion( } }, &Duration::from_secs(1), - &Duration::from_secs(10), + &Duration::from_secs(20), ) .await .unwrap(); diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 4463267b42..939fd61249 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -2442,83 +2442,6 @@ async fn test_single_region_allocate_for_replace_not_enough_zpools( assert_eq!(datasets_and_regions.len(), REGION_REDUNDANCY_THRESHOLD); } -// Confirm that a region set can start at N, a region can be deleted, and the -// allocation CTE can bring the redundancy back to N. -#[nexus_test] -async fn test_region_allocation_after_delete( - cptestctx: &ControlPlaneTestContext, -) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = - OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - - // Create three 10 GiB zpools, each with one dataset. 
- let _disk_test = DiskTest::new(&cptestctx).await; - - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - - // Create a disk - let client = &cptestctx.external_client; - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, DISK_NAME).await; - - // Assert disk has three allocated regions - let disk_id = disk.identity.id; - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk_id) - .fetch() - .await - .unwrap_or_else(|_| panic!("test disk {:?} should exist", disk_id)); - - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD); - - // Delete one of the regions - let region_to_delete: &nexus_db_model::Region = &allocated_regions[0].1; - datastore - .regions_hard_delete(&opctx.log, vec![region_to_delete.id()]) - .await - .unwrap(); - - // Assert disk's volume has one less allocated region - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD - 1); - - let region_total_size: ByteCount = ByteCount::try_from( - region_to_delete.block_size().to_bytes() - * region_to_delete.blocks_per_extent() - * region_to_delete.extent_count(), - ) - .unwrap(); - - // Rerun disk region allocation - datastore - .disk_region_allocate( - &opctx, - db_disk.volume_id, - ¶ms::DiskSource::Blank { - block_size: params::BlockSize::try_from( - region_to_delete.block_size().to_bytes() as u32, - ) - .unwrap(), - }, - region_total_size, - &RegionAllocationStrategy::Random { seed: None }, - ) - .await - .unwrap(); - - // Assert redundancy was restored - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD); -} - #[nexus_test] async fn test_no_halt_disk_delete_one_region_on_expunged_agent( cptestctx: &ControlPlaneTestContext, diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 8765218d33..887afff20f 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -5,31 +5,58 @@ //! Tests that Nexus properly manages and cleans up Crucible resources //! 
associated with Volumes +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; +use diesel::ExpressionMethods; use dropshot::test_util::ClientTestContext; use http::method::Method; use http::StatusCode; +use nexus_config::RegionAllocationStrategy; +use nexus_db_model::RegionSnapshotReplacement; +use nexus_db_model::RegionSnapshotReplacementState; +use nexus_db_model::Volume; +use nexus_db_model::VolumeResourceUsage; +use nexus_db_model::VolumeResourceUsageRecord; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_db_queries::db::datastore::CrucibleResources; +use nexus_db_queries::db::datastore::ExistingTarget; +use nexus_db_queries::db::datastore::RegionAllocationFor; +use nexus_db_queries::db::datastore::RegionAllocationParameters; +use nexus_db_queries::db::datastore::ReplacementTarget; +use nexus_db_queries::db::datastore::VolumeReplaceResult; +use nexus_db_queries::db::datastore::VolumeToDelete; +use nexus_db_queries::db::datastore::VolumeWithTarget; +use nexus_db_queries::db::datastore::SQL_BATCH_SIZE; use nexus_db_queries::db::lookup::LookupPath; +use nexus_db_queries::db::pagination::paginated; +use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; +use nexus_test_utils::background::run_replacement_tasks_to_completion; use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::http_testing::RequestBuilder; use nexus_test_utils::resource_helpers::create_default_ip_pool; +use nexus_test_utils::resource_helpers::create_disk; +use nexus_test_utils::resource_helpers::create_disk_from_snapshot; use nexus_test_utils::resource_helpers::create_project; +use nexus_test_utils::resource_helpers::create_snapshot; use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::params; use nexus_types::external_api::views; use nexus_types::identity::Asset; +use nexus_types::identity::Resource; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Disk; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Name; use omicron_common::api::internal; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_test_utils::dev::poll::CondCheckError; use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; @@ -37,6 +64,8 @@ use omicron_uuid_kinds::UpstairsSessionKind; use rand::prelude::SliceRandom; use rand::{rngs::StdRng, SeedableRng}; use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; +use std::collections::HashSet; +use std::net::SocketAddrV6; use std::sync::Arc; use uuid::Uuid; @@ -2182,38 +2211,38 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { zpool0.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:101:7]:19016"), + String::from("[fd00:1122:3344:101::7]:19016"), ), ( zpool1.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:102:7]:19016"), + String::from("[fd00:1122:3344:102::7]:19016"), ), ( zpool2.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:103:7]:19016"), + String::from("[fd00:1122:3344:103::7]:19016"), ), // second snapshot-create ( zpool0.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - 
String::from("[fd00:1122:3344:101:7]:19016"), // duplicate! + String::from("[fd00:1122:3344:101::7]:19016"), // duplicate! ), ( zpool3.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:104:7]:19016"), + String::from("[fd00:1122:3344:104::7]:19016"), ), ( zpool2.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:103:7]:19017"), + String::from("[fd00:1122:3344:103::7]:19017"), ), ]; @@ -2286,36 +2315,47 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { .unwrap(); assert_eq!(crucible_targets.read_only_targets.len(), 3); - // Also validate the volume's region_snapshots got incremented by - // volume_create + // Also validate the volume's region_snapshots had volume resource usage + // records created by volume_create for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 1); - assert_eq!(region_snapshot.deleting, false); + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); } - // Soft delete the volume, and validate that only three region_snapshot - // records are returned. + // Soft delete the volume, and validate that the volume's region_snapshots + // had their volume resource usage records deleted let cr = datastore.soft_delete_volume(volume_id).await.unwrap(); for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } let datasets_and_regions = datastore.regions_to_delete(&cr).await.unwrap(); @@ -2396,30 +2436,42 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { .unwrap(); assert_eq!(crucible_targets.read_only_targets.len(), 3); - // Also validate only the volume's region_snapshots got incremented by - // volume_create. + // Also validate only the new volume's region_snapshots had usage records + // created. 
for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } + for i in 3..6 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 1); - assert_eq!(region_snapshot.deleting, false); + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); } // Soft delete the volume, and validate that only three region_snapshot @@ -2427,18 +2479,23 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { let cr = datastore.soft_delete_volume(volume_id).await.unwrap(); - // Make sure every region_snapshot is now 0, and deleting + // Make sure every region_snapshot has no usage records now for i in 0..6 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } let datasets_and_regions = datastore.regions_to_delete(&cr).await.unwrap(); @@ -3503,3 +3560,2761 @@ async fn test_cte_returns_regions(cptestctx: &ControlPlaneTestContext) { .collect::>(), ); } + +struct TestReadOnlyRegionReferenceUsage { + datastore: Arc, + + region: db::model::Region, + region_address: SocketAddrV6, + + first_volume_id: Uuid, + second_volume_id: Uuid, + + last_resources_to_delete: Option, +} + +impl TestReadOnlyRegionReferenceUsage { + pub async fn new(cptestctx: &ControlPlaneTestContext) -> Self { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let first_volume_id = Uuid::new_v4(); + let second_volume_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let datasets_and_regions = datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: first_volume_id, + snapshot_id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + 1, + ) + .await + .unwrap(); + + assert_eq!(datasets_and_regions.len(), 1); + + let (_, region) = &datasets_and_regions[0]; + + assert_eq!(region.volume_id(), first_volume_id); + 
assert!(region.read_only()); + + // We're not sending the allocation request to any simulated crucible agent, + // so fill in a random port here. + datastore.region_set_port(region.id(), 12345).await.unwrap(); + + let region_address = + datastore.region_addr(region.id()).await.unwrap().unwrap(); + + let region = datastore.get_region(region.id()).await.unwrap(); + + TestReadOnlyRegionReferenceUsage { + datastore: datastore.clone(), + + region, + region_address, + + first_volume_id, + second_volume_id, + + last_resources_to_delete: None, + } + } + + pub async fn create_first_volume(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.first_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn validate_only_first_volume_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, self.first_volume_id); + } + + pub async fn validate_only_second_volume_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, self.second_volume_id); + } + + pub async fn delete_first_volume(&mut self) { + let resources_to_delete = self + .datastore + .soft_delete_volume(self.first_volume_id) + .await + .unwrap(); + + self.last_resources_to_delete = Some(resources_to_delete); + } + + pub async fn delete_second_volume(&mut self) { + let resources_to_delete = self + .datastore + .soft_delete_volume(self.second_volume_id) + .await + .unwrap(); + + self.last_resources_to_delete = Some(resources_to_delete); + } + + pub async fn validate_no_usage_records(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + } + + pub async fn validate_region_returned_for_cleanup(&self) { + assert!(self + .datastore + .regions_to_delete(&self.last_resources_to_delete.as_ref().unwrap()) + .await + .unwrap() + .into_iter() + .any(|(_, r)| r.id() == self.region.id())); + } + + pub async fn validate_region_not_returned_for_cleanup(&self) { + assert!(!self + .datastore + .regions_to_delete(&self.last_resources_to_delete.as_ref().unwrap()) + .await + .unwrap() + .into_iter() + .any(|(_, r)| r.id() == self.region.id())); + } + + // read-only regions should never be returned by find_deleted_volume_regions + pub async fn region_not_returned_by_find_deleted_volume_regions(&self) { + let deleted_volume_regions = + self.datastore.find_deleted_volume_regions().await.unwrap(); + + assert!(!deleted_volume_regions + .into_iter() + .any(|(_, r, _)| r.id() == self.region.id())); + } + + pub async 
fn create_first_volume_region_in_rop(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.first_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn create_second_volume(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.second_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn create_second_volume_region_in_rop(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.second_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn validate_both_volumes_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == self.first_volume_id)); + assert!(usage.iter().any(|r| r.volume_id == self.second_volume_id)); + } +} + +/// Assert that creating a volume with a read-only region in a subvolume creates +/// an appropriate usage record, and that deleting that volume removes it +#[nexus_test] +async fn test_read_only_region_reference_usage_sanity( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create one volume referencing the harness' read-only region + + harness.create_first_volume().await; + + // Validate only one volume resource usage record was created + + harness.validate_only_first_volume_referenced().await; + + // Now, soft-delete the volume, and make sure that the associated volume + // resource usage record is gone too. 
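+ // Soft-deleting the only referencing volume should remove its usage record for the region.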
+ + harness.delete_first_volume().await; + + harness.validate_no_usage_records().await; + + // If the read-only volume references for a read-only region are gone, then + // it should be returned for cleanup. + + harness.validate_region_returned_for_cleanup().await; + + // It should not be returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating a volume with a read-only region in the ROP creates an +/// appropriate reference, and that deleting that volume removes it +#[nexus_test] +async fn test_read_only_region_reference_sanity_rop( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create one volume referencing the harness' read-only region + + harness.create_first_volume_region_in_rop().await; + + // Validate that the appropriate volume resource usage record was created + + harness.validate_only_first_volume_referenced().await; + + // It should be _not_ returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Now, soft-delete the volume, and make sure that read-only volume + // reference is gone too. + + harness.delete_first_volume().await; + + harness.validate_no_usage_records().await; + + // If the read-only volume references for a read-only region are gone, then + // it should be returned for cleanup. + + harness.validate_region_returned_for_cleanup().await; + + // It should not be returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating multiple volumes with a read-only region creates the +/// appropriate references, and that deleting only one of those volumes does not +/// mean the read-only region gets cleaned up +#[nexus_test] +async fn test_read_only_region_reference_sanity_multi( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create two volumes this time + + harness.create_first_volume().await; + harness.create_second_volume().await; + + // Validate that the appropriate volume resource usage records were created + + harness.validate_both_volumes_referenced().await; + + // Now, soft-delete the first volume, and make sure that only one read-only + // volume reference is gone. + + harness.delete_first_volume().await; + + harness.validate_only_second_volume_referenced().await; + + // If any read-only volume reference remains, then the region should not be + // returned for deletion, and it still should not be returned by + // `find_deleted_volume_regions`. + + harness.validate_region_not_returned_for_cleanup().await; + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Deleting the second volume should free up the read-only region for + // deletion + + harness.delete_second_volume().await; + + harness.validate_no_usage_records().await; + + harness.validate_region_returned_for_cleanup().await; + + // It should not be returned by find_deleted_volume_regions. 
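+ // (read-only region cleanup is driven by the usage records and regions_to_delete, not by that query)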
+ + harness.region_not_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating multiple volumes with a read-only region in the ROP +/// creates the appropriate references, and that deleting only one of those +/// volumes does not mean the read-only region gets cleaned up +#[nexus_test] +async fn test_read_only_region_reference_sanity_rop_multi( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create two volumes this time + + harness.create_first_volume_region_in_rop().await; + harness.create_second_volume_region_in_rop().await; + + // Validate that the appropriate volume resource usage records were created + + harness.validate_both_volumes_referenced().await; + + // Now, soft-delete the volume, and make sure that only one read-only volume + // reference is gone. + + harness.delete_first_volume().await; + + harness.validate_only_second_volume_referenced().await; + + // If any read-only volume reference remains, then the region should not be + // returned for deletion, and it still should not be returned by + // `find_deleted_volume_regions`. + + harness.validate_region_not_returned_for_cleanup().await; + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Deleting the second volume should free up the read-only region for + // deletion + + harness.delete_second_volume().await; + + harness.validate_no_usage_records().await; + + harness.validate_region_returned_for_cleanup().await; + + // It should not be returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that a read-only region is properly reference counted and not +/// prematurely deleted: +/// +/// 1) create a disk, then a snapshot of that disk, then a disk from that +/// snapshot +/// +/// 2) issue a region snapshot replacement request for one of the region +/// snapshots in that snapshot, then run that process to completion +/// +/// 3) delete the snapshot +/// +/// 4) expect that the reference to the read-only region in the disk created +/// from the snapshot means that read-only region will not be cleaned up by +/// Nexus +/// +/// 5) clean up all the objects, and expect the crucible resources are properly +/// cleaned up +#[nexus_test] +async fn test_read_only_region_reference_counting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's regions, + // causing a read-only region to be created. 
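+ // (insert a replacement request for one of the disk's regions, then drive the background tasks until the region snapshot has been replaced by a newly allocated read-only region)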
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + // The snapshot's allocated regions should have the one read-only region + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 1); + let (_, read_only_region) = &allocated_regions[0]; + assert!(read_only_region.read_only()); + + let db_read_only_dataset = + datastore.dataset_get(read_only_region.dataset_id()).await.unwrap(); + + // The disk-from-snap VCR should also reference that read-only region + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let read_only_region_address: SocketAddrV6 = + nexus.region_addr(&opctx.log, read_only_region.id()).await.unwrap(); + + assert!(datastore + .find_volumes_referencing_socket_addr( + &opctx, + read_only_region_address.into() + ) + .await + .unwrap() + .iter() + .any(|volume| volume.id() == db_disk_from_snapshot.volume_id)); + + // Expect that there are two read-only region references now: one in the + // snapshot volume, and one in the disk-from-snap volume. + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + + // Deleting the snapshot should _not_ cause the region to get deleted from + // CRDB + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + let post_delete_region = + datastore.get_region(read_only_region.id()).await.unwrap(); + assert_eq!(post_delete_region, *read_only_region); + + // or cause Nexus to send delete commands to the appropriate Crucible + // agent. 
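+ // (the simulated Crucible agent should still report the read-only region as Created)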
+ + assert_eq!( + cptestctx + .sled_agent + .sled_agent + .get_crucible_dataset( + TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), + db_read_only_dataset.id(), + ) + .await + .get(crucible_agent_client::types::RegionId( + read_only_region.id().to_string() + )) + .await + .unwrap() + .state, + crucible_agent_client::types::State::Created + ); + + // Expect that there is one read-only region reference now, and that's from + // disk-from-snap + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_disk_from_snapshot.volume_id); + + // Delete the disk, and expect that does not alter the volume usage records + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_disk_from_snapshot.volume_id); + + // Delete the disk from snapshot, verify everything is cleaned up + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + assert_eq!( + cptestctx + .sled_agent + .sled_agent + .get_crucible_dataset( + TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), + db_read_only_dataset.id(), + ) + .await + .get(crucible_agent_client::types::RegionId( + read_only_region.id().to_string() + )) + .await + .unwrap() + .state, + crucible_agent_client::types::State::Destroyed + ); + + // Assert everything was cleaned up + assert!(disk_test.crucible_resources_deleted().await); +} + +/// Assert that a snapshot of a volume with a read-only region is properly +/// reference counted. +#[nexus_test] +async fn test_read_only_region_reference_counting_layers( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's regions, + // causing a read-only region to be created. 
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + // Grab the read-only region in the snapshot volume + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 1); + let (_, read_only_region) = &allocated_regions[0]; + assert!(read_only_region.read_only()); + + // The disk-from-snap VCR should also reference that read-only region + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let read_only_region_address: SocketAddrV6 = + nexus.region_addr(&opctx.log, read_only_region.id()).await.unwrap(); + + assert!(datastore + .find_volumes_referencing_socket_addr( + &opctx, + read_only_region_address.into() + ) + .await + .unwrap() + .iter() + .any(|volume| volume.id() == db_disk_from_snapshot.volume_id)); + + // Expect that there are two read-only region references now: one in the + // snapshot volume, and one in the disk-from-snap volume. 
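+ // (usage records are keyed by the read-only region, so both referencing volume ids should appear)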
+ + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + + // Take a snapshot of the disk-from-snapshot disk + + let double_snapshot = create_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + "double-snapshot", + ) + .await; + + // Assert correct volume usage records + + let (.., db_double_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(double_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "double_snapshot {:?} should exist", + double_snapshot.identity.id + ) + }); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + // Delete resources, assert volume resource usage records along the way + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + NexusRequest::object_delete(client, &get_snapshot_url("double-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // Assert everything was cleaned up + assert!(disk_test.crucible_resources_deleted().await); +} + +#[nexus_test] +async fn test_volume_replace_snapshot_respects_accounting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + 
DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + // There won't be any regions for the snapshot volume, only region snapshots + + assert!(allocated_regions.is_empty()); + + // Get another region to use with volume_replace_snapshot + + datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: db_snapshot.volume_id, + snapshot_id: db_snapshot.id(), + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + allocated_regions.len() + 1, + ) + .await + .unwrap(); + + // Get the newly allocated region + + let mut new_allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(new_allocated_regions.len(), 1); + + let (_, new_region) = + new_allocated_regions.pop().expect("we just checked the length!"); + + // We're not sending the allocation request to any simulated crucible agent, + // so fill in a random port here. 
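+ // (the port is needed so that region_addr below can produce the address used as the replacement target)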
+ datastore.region_set_port(new_region.id(), 12345).await.unwrap(); + + // Create a blank region to use as the "volume to delete" + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + // Assert the correct volume resource usage record before the replacement: + // nothing should reference the newly allocated region yet + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + // Perform replacement of the first region in the snapshot + + let existing = datastore + .region_snapshot_get( + disk_allocated_regions[0].1.dataset_id(), + disk_allocated_regions[0].1.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let replacement = datastore + .region_addr(new_region.id()) + .await + .expect("new region has address!") + .unwrap(); + + let replacement_result = datastore + .volume_replace_snapshot( + VolumeWithTarget(db_snapshot.volume_id), + ExistingTarget(existing.snapshot_addr.parse().unwrap()), + ReplacementTarget(replacement), + VolumeToDelete(volume_to_delete_id), + ) + .await + .unwrap(); + + match replacement_result { + VolumeReplaceResult::Done => { + // ok! + } + + _ => { + panic!("replacement result was {replacement_result:?}"); + } + } + + // Assert the volume resource usage record after volume_replace_snapshot: + // the new region should have a usage for the snapshot's volume + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_snapshot.volume_id); + + // Now, reverse the replacement + + let replacement_result = datastore + .volume_replace_snapshot( + VolumeWithTarget(db_snapshot.volume_id), + ExistingTarget(replacement), // swapped! + ReplacementTarget(existing.snapshot_addr.parse().unwrap()), // swapped! + VolumeToDelete(volume_to_delete_id), + ) + .await + .unwrap(); + + match replacement_result { + VolumeReplaceResult::Done => { + // ok! 
+ } + + _ => { + panic!("replacement result was {replacement_result:?}"); + } + } + + // Assert the new region's volume resource usage record now references the + // volume to delete + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_to_delete_id); +} + +/// Test that the `volume_remove_rop` function correctly updates volume resource +/// usage records +#[nexus_test] +async fn test_volume_remove_rop_respects_accounting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk, then a disk from that + // snapshot. + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + // Assert the correct volume resource usage records before the removal: + // both the snapshot volume and disk_from_snapshot volume should have usage + // records for the three region snapshots. 
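+ // (each region snapshot is identified by its (dataset_id, region_id, snapshot_id) triple, so look up each one's usage records in turn)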
+ + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + } + + // Remove the ROP from disk-from-snapshot + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + let result = datastore + .volume_remove_rop(db_disk_from_snapshot.volume_id, volume_to_delete_id) + .await + .unwrap(); + + // Assert that there was a removal + + assert!(result); + + // Assert the correct volume resource usage records after the removal: + // the snapshot volume should still have usage records for the three region + // snapshots, and now so should the volume to delete + + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == volume_to_delete_id)); + } +} + +/// Test that the `volume_remove_rop` function only updates volume resource +/// usage records for the volume being operated on +#[nexus_test] +async fn test_volume_remove_rop_respects_accounting_no_modify_others( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk, then a disk from that + // snapshot. 
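+ // Also create a second disk from the same snapshot: an unrelated volume whose usage records must not change.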
+ + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let another_disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "another-disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_another_disk_from_snapshot) = + LookupPath::new(&opctx, &datastore) + .disk_id(another_disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "another_disk_from_snapshot {:?} should exist", + another_disk_from_snapshot.identity.id + ) + }); + + // Assert the correct volume resource usage records before the removal: the + // snapshot volume, disk_from_snapshot volume, and + // another_disk_from_snapshot volume should have usage records for the three + // region snapshots. + + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_another_disk_from_snapshot.volume_id)); + } + + // Remove the ROP from disk-from-snapshot + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + let result = datastore + .volume_remove_rop(db_disk_from_snapshot.volume_id, volume_to_delete_id) + .await + .unwrap(); + + // Assert that there was a removal + + assert!(result); + + // Assert the correct volume resource usage records after the removal: the + // snapshot volume and another_disk_from_snapshot volume should still have + // usage records for the three region snapshots, and now so should the + // volume to delete. 
+ + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == volume_to_delete_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_another_disk_from_snapshot.volume_id)); + } +} + +async fn delete_all_volume_resource_usage_records(datastore: &DataStore) { + use db::schema::volume_resource_usage::dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + diesel::delete(dsl::volume_resource_usage) + .filter(dsl::usage_id.ne(Uuid::new_v4())) + .execute_async(&*conn) + .await + .unwrap(); +} + +async fn perform_migration(datastore: &DataStore) { + const MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL: &str = include_str!( + "../../../schema/crdb/crucible-ref-count-records/up08.sql" + ); + + assert!(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL + .contains("INSERT INTO volume_resource_usage")); + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + // To make sure that the migration is idempotent, perform it twice + diesel::sql_query(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL) + .execute_async(&*conn) + .await + .unwrap(); + + diesel::sql_query(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL) + .execute_async(&*conn) + .await + .unwrap(); +} + +async fn get_volume_resource_usage_records( + datastore: &DataStore, +) -> HashSet<VolumeResourceUsageRecord> { + use db::schema::volume_resource_usage::dsl; + + let mut records: Vec<VolumeResourceUsageRecord> = Vec::new(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + while let Some(p) = paginator.next() { + let page = paginated( + dsl::volume_resource_usage, + dsl::usage_id, + &p.current_pagparams(), + ) + .get_results_async::<VolumeResourceUsageRecord>(&*conn) + .await + .unwrap(); + + paginator = p.found_batch(&page, &|r| r.usage_id); + + for record in page { + records.push(record); + } + } + + records + .into_iter() + .map(|mut record: VolumeResourceUsageRecord| { + // Zero out usage_id for comparison + record.usage_id = Uuid::nil(); + record + }) + .collect() +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk + + create_disk(&client, PROJECT_NAME, "disk").await; + + // Test migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Create a snapshot + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + // Test migration +
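+ // (same check as above: wipe the usage records, re-run the up08.sql backfill, and expect the records it recreates to match what Nexus wrote)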
+ let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Create a disk from that snapshot + + create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Test migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Delete the snapshot + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + // Test the migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Delete the disk from snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + // Test the migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records_soft_delete_volume( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot from that disk, then an image based on + // that snapshot + + create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let params = params::ImageCreate { + identity: IdentityMetadataCreateParams { + name: "windows99".parse().unwrap(), + description: String::from("as soon as we get CSM support!"), + }, + source: params::ImageSource::Snapshot { id: snapshot.identity.id }, + os: "windows98".to_string(), + version: "se".to_string(), + }; + + let images_url = format!("/v1/images?project={}", PROJECT_NAME); + NexusRequest::objects_post(client, &images_url, ¶ms) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap(); + + // Soft-delete the snapshot's volume + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let resources = + datastore.soft_delete_volume(db_snapshot.volume_id).await.unwrap(); + + // Assert that the 
region snapshots did not have deleted set to true + + assert!(datastore + .snapshots_to_delete(&resources) + .await + .unwrap() + .is_empty()); + + // This means that the snapshot volume is soft-deleted, make sure the + // migration does not make usage records for it! + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( + cptestctx: &ControlPlaneTestContext, +) { + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let mut iter = disk_test.zpools(); + let zpool0 = iter.next().expect("Expected four zpools"); + let zpool1 = iter.next().expect("Expected four zpools"); + let zpool2 = iter.next().expect("Expected four zpools"); + let zpool3 = iter.next().expect("Expected four zpools"); + + // (dataset_id, region_id, snapshot_id, snapshot_addr) + let region_snapshots = vec![ + ( + zpool0.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:101::7]:19016"), + ), + ( + zpool1.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:102::7]:19016"), + ), + ( + zpool2.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:103::7]:19016"), + ), + ( + zpool3.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:104::7]:19016"), + ), + ]; + + for i in 0..4 { + let (dataset_id, region_id, snapshot_id, snapshot_addr) = + ®ion_snapshots[i]; + + datastore + .region_snapshot_create(nexus_db_model::RegionSnapshot { + dataset_id: *dataset_id, + region_id: *region_id, + snapshot_id: *snapshot_id, + snapshot_addr: snapshot_addr.clone(), + volume_references: 0, + deleting: false, + }) + .await + .unwrap(); + } + + // Create two volumes, one with the first three region snapshots, one with + // the last three region snapshots + + let first_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: first_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[0].3.clone(), + region_snapshots[1].3.clone(), + region_snapshots[2].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + let second_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: second_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[1].3.clone(), + 
region_snapshots[2].3.clone(), + region_snapshots[3].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Deleting the first volume should only return region_snapshot[0] for + // deletion. + + let resources = + datastore.soft_delete_volume(first_volume_id).await.unwrap(); + + let snapshots_to_delete = + datastore.snapshots_to_delete(&resources).await.unwrap(); + + assert_eq!(snapshots_to_delete.len(), 1); + + let region_snapshot_to_delete = &snapshots_to_delete[0].1; + + assert_eq!(region_snapshot_to_delete.dataset_id, region_snapshots[0].0); + assert_eq!(region_snapshot_to_delete.region_id, region_snapshots[0].1); + assert_eq!(region_snapshot_to_delete.snapshot_id, region_snapshots[0].2); + assert_eq!(region_snapshot_to_delete.snapshot_addr, region_snapshots[0].3); + assert_eq!(region_snapshot_to_delete.volume_references, 0); + assert_eq!(region_snapshot_to_delete.deleting, true); + + // Test the migration does not incorrectly think a region snapshot with + // deleting = true is used by any volume + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} + +#[nexus_test] +async fn test_double_layer_with_read_only_region_delete( + cptestctx: &ControlPlaneTestContext, +) { + // 1) Create disk, snapshot, then two disks from those snapshots + // 2) Replace one of the region snapshots in that snapshot volume with a + // read-only region + // 3) Delete in the following order: disk, snapshot, then two disks from the + // snapshot - after each delete, verify that crucible resources were not + // prematurely deleted + // 6) At the end, assert that all Crucible resources were cleaned up + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let _disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let _another_disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "another-disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's targets, + // causing a read-only region to be created. 
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete the disk and snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete disk-from-snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Finally, delete another-disk-from-snapshot + + NexusRequest::object_delete( + client, + &get_disk_url("another-disk-from-snapshot"), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete another-disk-from-snapshot"); + + assert!(disk_test.crucible_resources_deleted().await); +} + +#[nexus_test] +async fn test_double_layer_snapshot_with_read_only_region_delete_2( + cptestctx: &ControlPlaneTestContext, +) { + // 1) Create disk, then a snapshot + // 2) Replace two of the region snapshots in that snapshot volume with + // read-only regions + // 3) Create a disk from the snapshot + // 4) Replace the last of the region snapshots in that snapshot volume with + // a read-only region + // 5) Delete in the following order: disk, snapshot, then the disk from the + // snapshot - after each delete, verify that crucible resources were not + // prematurely deleted + // 6) At the end, assert that all Crucible resources were cleaned up + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + // Perform region snapshot replacement for two of the snapshot's targets, + // causing two read-only regions to be created. 
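+ // (the two replacement requests are issued one at a time, waiting for the first to reach Complete before inserting the second)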
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + let request_id = request.id; + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + wait_for_condition( + || { + let datastore = datastore.clone(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let request = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, request_id, + ) + .await + .unwrap(); + + let state = request.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + let (dataset, region) = &allocated_regions[1]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Create a disk from the snapshot + + let _disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Replace the last of the region snapshot targets + + let (dataset, region) = &allocated_regions[2]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete the disk and snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + assert!(!disk_test.crucible_resources_deleted().await); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete disk-from-snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + // Assert everything was cleaned up + + assert!(disk_test.crucible_resources_deleted().await); +} + +#[nexus_test] +async fn test_no_zombie_region_snapshots(cptestctx: &ControlPlaneTestContext) { + // 1) Create disk, then a snapshot + // 2) Delete the disk + // 3) Create a volume that uses the snapshot volume as a read-only parent + // 4) Test that a race of the following steps does not cause a region + // snapshot to have volume usage records even though it is marked for + // deletion: + // + // a) delete the snapshot 
volume + // b) delete the volume created in step 3 + // c) create another volume that uses the snapshot volume as a read-only + // parent + + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create disk, then a snapshot + + let _disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + // Delete the disk + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // Create a volume that uses the snapshot volume as a read-only parent + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let snapshot_volume: Volume = datastore + .volume_get(db_snapshot.volume_id) + .await + .expect("volume_get without error") + .expect("volume exists"); + + let snapshot_vcr: VolumeConstructionRequest = + serde_json::from_str(snapshot_volume.data()).unwrap(); + + let step_3_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_3_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: step_3_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(snapshot_vcr.clone())), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Soft-delete the snapshot volume + + let cr = datastore.soft_delete_volume(db_snapshot.volume_id).await.unwrap(); + + // Assert that no resources are returned for clean-up + + assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Soft-delete the volume created as part of step 3 + + let cr = datastore.soft_delete_volume(step_3_volume_id).await.unwrap(); + + // Assert that region snapshots _are_ returned for clean-up + + assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(!datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Pretend that there's a racing call to volume_create that has a volume + // that uses the snapshot volume as a read-only parent. This call should + // fail! 
+
+    let racing_volume_id = Uuid::new_v4();
+    let result = datastore
+        .volume_create(nexus_db_model::Volume::new(
+            racing_volume_id,
+            serde_json::to_string(&VolumeConstructionRequest::Volume {
+                id: racing_volume_id,
+                block_size: 512,
+                sub_volumes: vec![],
+                read_only_parent: Some(Box::new(snapshot_vcr.clone())),
+            })
+            .unwrap(),
+        ))
+        .await;
+
+    assert!(result.is_err());
+}
+
+#[nexus_test]
+async fn test_no_zombie_read_only_regions(cptestctx: &ControlPlaneTestContext) {
+    // 1) Create a volume with three read-only regions
+    // 2) Create another volume that uses the first step's volume as a read-only
+    //    parent
+    // 3) Test that a race of the following steps does not cause a region to
+    //    have volume usage records:
+    //
+    //    a) delete the step 1 volume
+    //    b) delete the step 2 volume
+    //    c) create another volume that uses the step 1 volume as a read-only
+    //       parent
+
+    let apictx = &cptestctx.server.server_context();
+    let nexus = &apictx.nexus;
+    let datastore = nexus.datastore();
+    let opctx =
+        OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone());
+
+    DiskTestBuilder::new(&cptestctx)
+        .on_specific_sled(cptestctx.first_sled())
+        .with_zpool_count(4)
+        .build()
+        .await;
+
+    // Create a volume with three read-only regions
+
+    let step_1_volume_id = Uuid::new_v4();
+    let snapshot_id = Uuid::new_v4();
+
+    let datasets_and_regions = datastore
+        .arbitrary_region_allocate(
+            &opctx,
+            RegionAllocationFor::SnapshotVolume {
+                volume_id: step_1_volume_id,
+                snapshot_id,
+            },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(512).unwrap(),
+                },
+                size: ByteCount::from_gibibytes_u32(1),
+            },
+            &RegionAllocationStrategy::Random { seed: None },
+            3,
+        )
+        .await
+        .unwrap();
+
+    // We're not sending allocation requests to any simulated crucible agent, so
+    // fill in a random port here.
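+    // (The ports are only needed so that region_addr below can build a
+    // distinct target address for each region; nothing ever connects to
+    // them in this test.)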
+    for (i, (_, region)) in datasets_and_regions.iter().enumerate() {
+        datastore.region_set_port(region.id(), 20000 + i as u16).await.unwrap();
+    }
+
+    let region_addrs: Vec<SocketAddrV6> = vec![
+        datastore
+            .region_addr(datasets_and_regions[0].1.id())
+            .await
+            .unwrap()
+            .unwrap(),
+        datastore
+            .region_addr(datasets_and_regions[1].1.id())
+            .await
+            .unwrap()
+            .unwrap(),
+        datastore
+            .region_addr(datasets_and_regions[2].1.id())
+            .await
+            .unwrap()
+            .unwrap(),
+    ];
+
+    // Assert they're all unique
+
+    assert_ne!(region_addrs[0], region_addrs[1]);
+    assert_ne!(region_addrs[0], region_addrs[2]);
+    assert_ne!(region_addrs[1], region_addrs[2]);
+
+    // Mimic what a snapshot volume has: three read-only regions in a volume's
+    // subvolume, not the read-only parent
+
+    datastore
+        .volume_create(nexus_db_model::Volume::new(
+            step_1_volume_id,
+            serde_json::to_string(&VolumeConstructionRequest::Volume {
+                id: step_1_volume_id,
+                block_size: 512,
+                sub_volumes: vec![VolumeConstructionRequest::Region {
+                    block_size: 512,
+                    blocks_per_extent: 1,
+                    extent_count: 1,
+                    gen: 1,
+                    opts: CrucibleOpts {
+                        id: Uuid::new_v4(),
+                        target: vec![
+                            region_addrs[0].to_string(),
+                            region_addrs[1].to_string(),
+                            region_addrs[2].to_string(),
+                        ],
+                        lossy: false,
+                        flush_timeout: None,
+                        key: None,
+                        cert_pem: None,
+                        key_pem: None,
+                        root_cert_pem: None,
+                        control: None,
+                        read_only: true,
+                    },
+                }],
+                read_only_parent: None,
+            })
+            .unwrap(),
+        ))
+        .await
+        .unwrap();
+
+    // Create another volume that uses the first step's volume as a read-only
+    // parent
+
+    let step_1_volume: Volume = datastore
+        .volume_get(step_1_volume_id)
+        .await
+        .expect("volume_get without error")
+        .expect("volume exists");
+
+    let step_1_vcr: VolumeConstructionRequest =
+        serde_json::from_str(step_1_volume.data()).unwrap();
+
+    let step_2_volume_id = Uuid::new_v4();
+
+    datastore
+        .volume_create(nexus_db_model::Volume::new(
+            step_2_volume_id,
+            serde_json::to_string(&VolumeConstructionRequest::Volume {
+                id: step_2_volume_id,
+                block_size: 512,
+                sub_volumes: vec![],
+                read_only_parent: Some(Box::new(step_1_vcr.clone())),
+            })
+            .unwrap(),
+        ))
+        .await
+        .unwrap();
+
+    // Soft-delete the step 1 volume
+
+    let cr = datastore.soft_delete_volume(step_1_volume_id).await.unwrap();
+
+    // Assert that no resources are returned for clean-up
+
+    assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty());
+    assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty());
+
+    // Soft-delete the step 2 volume
+
+    let cr = datastore.soft_delete_volume(step_2_volume_id).await.unwrap();
+
+    // Assert that the read-only regions _are_ returned for clean-up
+
+    assert!(!datastore.regions_to_delete(&cr).await.unwrap().is_empty());
+    assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty());
+
+    // Pretend that there's a racing call to volume_create that has a volume
+    // that uses the step 1 volume as a read-only parent. This call should
+    // fail!
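+    //
+    // (Read-only regions are tracked through the region_id column of the new
+    // volume_resource_usage table rather than the region-snapshot triple, but
+    // the expectation is the same: the regions were just returned for
+    // clean-up, so volume_create should reject a new usage record for them.)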
+
+    let racing_volume_id = Uuid::new_v4();
+    let result = datastore
+        .volume_create(nexus_db_model::Volume::new(
+            racing_volume_id,
+            serde_json::to_string(&VolumeConstructionRequest::Volume {
+                id: racing_volume_id,
+                block_size: 512,
+                sub_volumes: vec![],
+                read_only_parent: Some(Box::new(step_1_vcr)),
+            })
+            .unwrap(),
+        ))
+        .await;
+
+    assert!(result.is_err());
+}
+
+#[nexus_test]
+async fn test_no_zombie_read_write_regions(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    // 1) Create a volume with three read-write regions
+    // 2) Create another volume that uses the first step's volume as a read-only
+    //    parent
+    // 3) Test that a race of the following steps does not cause a region to
+    //    have volume usage records:
+    //
+    //    a) delete the step 1 volume
+    //    b) delete the step 2 volume
+    //    c) create another volume that uses the step 1 volume as a read-only
+    //       parent
+
+    let apictx = &cptestctx.server.server_context();
+    let nexus = &apictx.nexus;
+    let datastore = nexus.datastore();
+    let opctx =
+        OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone());
+
+    DiskTestBuilder::new(&cptestctx)
+        .on_specific_sled(cptestctx.first_sled())
+        .with_zpool_count(4)
+        .build()
+        .await;
+
+    // Create a volume with three read-only regions
+
+    let step_1_volume_id = Uuid::new_v4();
+    let snapshot_id = Uuid::new_v4();
+
+    let datasets_and_regions = datastore
+        .arbitrary_region_allocate(
+            &opctx,
+            RegionAllocationFor::SnapshotVolume {
+                volume_id: step_1_volume_id,
+                snapshot_id,
+            },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(512).unwrap(),
+                },
+                size: ByteCount::from_gibibytes_u32(1),
+            },
+            &RegionAllocationStrategy::Random { seed: None },
+            3,
+        )
+        .await
+        .unwrap();
+
+    // We're not sending allocation requests to any simulated crucible agent, so
+    // fill in a random port here.
+    for (i, (_, region)) in datasets_and_regions.iter().enumerate() {
+        datastore.region_set_port(region.id(), 20000 + i as u16).await.unwrap();
+    }
+
+    let region_addrs: Vec<SocketAddrV6> = vec![
+        datastore
+            .region_addr(datasets_and_regions[0].1.id())
+            .await
+            .unwrap()
+            .unwrap(),
+        datastore
+            .region_addr(datasets_and_regions[1].1.id())
+            .await
+            .unwrap()
+            .unwrap(),
+        datastore
+            .region_addr(datasets_and_regions[2].1.id())
+            .await
+            .unwrap()
+            .unwrap(),
+    ];
+
+    // Assert they're all unique
+
+    assert_ne!(region_addrs[0], region_addrs[1]);
+    assert_ne!(region_addrs[0], region_addrs[2]);
+    assert_ne!(region_addrs[1], region_addrs[2]);
+
+    // Mimic what a snapshot volume has: three read-only regions in a volume's
+    // subvolume, not the read-only parent
+
+    datastore
+        .volume_create(nexus_db_model::Volume::new(
+            step_1_volume_id,
+            serde_json::to_string(&VolumeConstructionRequest::Volume {
+                id: step_1_volume_id,
+                block_size: 512,
+                sub_volumes: vec![VolumeConstructionRequest::Region {
+                    block_size: 512,
+                    blocks_per_extent: 1,
+                    extent_count: 1,
+                    gen: 1,
+                    opts: CrucibleOpts {
+                        id: Uuid::new_v4(),
+                        target: vec![
+                            region_addrs[0].to_string(),
+                            region_addrs[1].to_string(),
+                            region_addrs[2].to_string(),
+                        ],
+                        lossy: false,
+                        flush_timeout: None,
+                        key: None,
+                        cert_pem: None,
+                        key_pem: None,
+                        root_cert_pem: None,
+                        control: None,
+                        read_only: true,
+                    },
+                }],
+                read_only_parent: None,
+            })
+            .unwrap(),
+        ))
+        .await
+        .unwrap();
+
+    // Create another volume that uses the first step's volume as a read-only
+    // parent
+
+    let step_1_volume: Volume = datastore
+        .volume_get(step_1_volume_id)
+        .await
+        .expect("volume_get without error")
+        .expect("volume exists");
+
+    let step_1_vcr: VolumeConstructionRequest =
+        serde_json::from_str(step_1_volume.data()).unwrap();
+
+    let step_2_volume_id = Uuid::new_v4();
+
+    datastore
+        .volume_create(nexus_db_model::Volume::new(
+            step_2_volume_id,
+            serde_json::to_string(&VolumeConstructionRequest::Volume {
+                id: step_2_volume_id,
+                block_size: 512,
+                sub_volumes: vec![],
+                read_only_parent: Some(Box::new(step_1_vcr.clone())),
+            })
+            .unwrap(),
+        ))
+        .await
+        .unwrap();
+
+    // Soft-delete the step 1 volume
+
+    let cr = datastore.soft_delete_volume(step_1_volume_id).await.unwrap();
+
+    // Assert that no resources are returned for clean-up
+
+    assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty());
+    assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty());
+
+    // Soft-delete the step 2 volume
+
+    let cr = datastore.soft_delete_volume(step_2_volume_id).await.unwrap();
+
+    // Assert that the read-only regions _are_ returned for clean-up
+
+    assert!(!datastore.regions_to_delete(&cr).await.unwrap().is_empty());
+    assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty());
+
+    // Pretend that there's a racing call to volume_create that has a volume
+    // that uses the step 1 volume as a read-only parent. This call should
+    // fail!
+ + let racing_volume_id = Uuid::new_v4(); + let result = datastore + .volume_create(nexus_db_model::Volume::new( + racing_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: racing_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(step_1_vcr)), + }) + .unwrap(), + )) + .await; + + assert!(result.is_err()); +} diff --git a/schema/crdb/crucible-ref-count-records/up01.sql b/schema/crdb/crucible-ref-count-records/up01.sql new file mode 100644 index 0000000000..b28e61135b --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up01.sql @@ -0,0 +1 @@ +CREATE INDEX IF NOT EXISTS lookup_dataset_by_ip on omicron.public.dataset (ip); diff --git a/schema/crdb/crucible-ref-count-records/up02.sql b/schema/crdb/crucible-ref-count-records/up02.sql new file mode 100644 index 0000000000..31fa35c3b8 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up02.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_deleting on omicron.public.region_snapshot ( + deleting +); diff --git a/schema/crdb/crucible-ref-count-records/up03.sql b/schema/crdb/crucible-ref-count-records/up03.sql new file mode 100644 index 0000000000..03940aabb0 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up03.sql @@ -0,0 +1,4 @@ +CREATE TYPE IF NOT EXISTS omicron.public.volume_resource_usage_type AS ENUM ( + 'read_only_region', + 'region_snapshot' +); diff --git a/schema/crdb/crucible-ref-count-records/up04.sql b/schema/crdb/crucible-ref-count-records/up04.sql new file mode 100644 index 0000000000..c020499418 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up04.sql @@ -0,0 +1,37 @@ +CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( + usage_id UUID NOT NULL, + + volume_id UUID NOT NULL, + + usage_type omicron.public.volume_resource_usage_type NOT NULL, + + region_id UUID, + + region_snapshot_dataset_id UUID, + region_snapshot_region_id UUID, + region_snapshot_snapshot_id UUID, + + PRIMARY KEY (usage_id), + + CONSTRAINT exactly_one_usage_source CHECK ( + ( + (usage_type = 'read_only_region') AND + (region_id IS NOT NULL) AND + ( + region_snapshot_dataset_id IS NULL AND + region_snapshot_region_id IS NULL AND + region_snapshot_snapshot_id IS NULL + ) + ) + OR + ( + (usage_type = 'region_snapshot') AND + (region_id IS NULL) AND + ( + region_snapshot_dataset_id IS NOT NULL AND + region_snapshot_region_id IS NOT NULL AND + region_snapshot_snapshot_id IS NOT NULL + ) + ) + ) +); diff --git a/schema/crdb/crucible-ref-count-records/up05.sql b/schema/crdb/crucible-ref-count-records/up05.sql new file mode 100644 index 0000000000..a88a3fc6fd --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up05.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_region on omicron.public.volume_resource_usage ( + region_id +); diff --git a/schema/crdb/crucible-ref-count-records/up06.sql b/schema/crdb/crucible-ref-count-records/up06.sql new file mode 100644 index 0000000000..3262b194be --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up06.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_snapshot on omicron.public.volume_resource_usage ( + region_snapshot_dataset_id, region_snapshot_region_id, region_snapshot_snapshot_id +); diff --git a/schema/crdb/crucible-ref-count-records/up07.sql b/schema/crdb/crucible-ref-count-records/up07.sql new file mode 100644 index 0000000000..91c788be1b --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up07.sql @@ -0,0 +1,8 
@@ +CREATE UNIQUE INDEX IF NOT EXISTS one_record_per_volume_resource_usage on omicron.public.volume_resource_usage ( + volume_id, + usage_type, + region_id, + region_snapshot_dataset_id, + region_snapshot_region_id, + region_snapshot_snapshot_id +); diff --git a/schema/crdb/crucible-ref-count-records/up08.sql b/schema/crdb/crucible-ref-count-records/up08.sql new file mode 100644 index 0000000000..49374301ec --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up08.sql @@ -0,0 +1,31 @@ +/* Construct a UUIDv4 deterministically from the existing data in the region + snapshot records to populate the volume resource usage records */ +INSERT INTO volume_resource_usage ( + SELECT + gen_random_uuid() AS usage_id, + volume.id AS volume_id, + 'region_snapshot' AS usage_type, + NULL AS region_id, + region_snapshot.dataset_id AS region_snapshot_dataset_id, + region_snapshot.region_id AS region_snapshot_region_id, + region_snapshot.snapshot_id AS region_snapshot_snapshot_id + FROM + volume + JOIN + region_snapshot + ON + volume.data like ('%' || region_snapshot.snapshot_addr || '%') + LEFT JOIN + volume_resource_usage + ON + volume_resource_usage.volume_id = volume.id AND + volume_resource_usage.usage_type = 'region_snapshot' AND + volume_resource_usage.region_id IS NULL AND + volume_resource_usage.region_snapshot_region_id = region_snapshot.region_id AND + volume_resource_usage.region_snapshot_dataset_id = region_snapshot.dataset_id AND + volume_resource_usage.region_snapshot_snapshot_id = region_snapshot.snapshot_id + WHERE + volume.time_deleted is NULL AND + region_snapshot.deleting = false AND + volume_resource_usage.usage_id IS NULL +); diff --git a/schema/crdb/crucible-ref-count-records/up09.sql b/schema/crdb/crucible-ref-count-records/up09.sql new file mode 100644 index 0000000000..be56ed401c --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up09.sql @@ -0,0 +1,2 @@ +CREATE INDEX IF NOT EXISTS lookup_regions_by_read_only + on omicron.public.region (read_only); diff --git a/schema/crdb/crucible-ref-count-records/up10.sql b/schema/crdb/crucible-ref-count-records/up10.sql new file mode 100644 index 0000000000..5d7390e124 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up10.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region ADD COLUMN IF NOT EXISTS deleting BOOLEAN NOT NULL DEFAULT false; diff --git a/schema/crdb/crucible-ref-count-records/up11.sql b/schema/crdb/crucible-ref-count-records/up11.sql new file mode 100644 index 0000000000..ef7845c8d0 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up11.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region ALTER COLUMN deleting DROP DEFAULT; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index b095c4c89a..f689c7e9f7 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -619,6 +619,8 @@ CREATE INDEX IF NOT EXISTS lookup_dataset_by_zpool on omicron.public.dataset ( id ) WHERE pool_id IS NOT NULL AND time_deleted IS NULL; +CREATE INDEX IF NOT EXISTS lookup_dataset_by_ip on omicron.public.dataset (ip); + /* * A region of space allocated to Crucible Downstairs, within a dataset. 
*/ @@ -641,7 +643,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.region ( port INT4, - read_only BOOL NOT NULL + read_only BOOL NOT NULL, + + deleting BOOL NOT NULL ); /* @@ -664,6 +668,9 @@ CREATE INDEX IF NOT EXISTS lookup_regions_missing_ports on omicron.public.region (id) WHERE port IS NULL; +CREATE INDEX IF NOT EXISTS lookup_regions_by_read_only + on omicron.public.region (read_only); + /* * A snapshot of a region, within a dataset. */ @@ -698,6 +705,10 @@ CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_region_id on omicron.public region_id ); +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_deleting on omicron.public.region_snapshot ( + deleting +); + /* * Index on volume_references and snapshot_addr for crucible * resource accounting lookup @@ -4589,6 +4600,78 @@ CREATE INDEX IF NOT EXISTS lookup_bgp_config_by_bgp_announce_set_id ON omicron.p ) WHERE time_deleted IS NULL; +CREATE TYPE IF NOT EXISTS omicron.public.volume_resource_usage_type AS ENUM ( + 'read_only_region', + 'region_snapshot' +); + +/* + * This table records when a Volume makes use of a read-only resource. When + * there are no more entries for a particular read-only resource, then that + * resource can be garbage collected. + */ +CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( + usage_id UUID NOT NULL, + + volume_id UUID NOT NULL, + + usage_type omicron.public.volume_resource_usage_type NOT NULL, + + /* + * This column contains a non-NULL value when the usage type is read_only + * region + */ + region_id UUID, + + /* + * These columns contain non-NULL values when the usage type is region + * snapshot + */ + region_snapshot_dataset_id UUID, + region_snapshot_region_id UUID, + region_snapshot_snapshot_id UUID, + + PRIMARY KEY (usage_id), + + CONSTRAINT exactly_one_usage_source CHECK ( + ( + (usage_type = 'read_only_region') AND + (region_id IS NOT NULL) AND + ( + region_snapshot_dataset_id IS NULL AND + region_snapshot_region_id IS NULL AND + region_snapshot_snapshot_id IS NULL + ) + ) + OR + ( + (usage_type = 'region_snapshot') AND + (region_id IS NULL) AND + ( + region_snapshot_dataset_id IS NOT NULL AND + region_snapshot_region_id IS NOT NULL AND + region_snapshot_snapshot_id IS NOT NULL + ) + ) + ) +); + +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_region on omicron.public.volume_resource_usage ( + region_id +); + +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_snapshot on omicron.public.volume_resource_usage ( + region_snapshot_dataset_id, region_snapshot_region_id, region_snapshot_snapshot_id +); + +CREATE UNIQUE INDEX IF NOT EXISTS one_record_per_volume_resource_usage on omicron.public.volume_resource_usage ( + volume_id, + usage_type, + region_id, + region_snapshot_dataset_id, + region_snapshot_region_id, + region_snapshot_snapshot_id +); /* * Keep this at the end of file so that the database does not contain a version @@ -4601,7 +4684,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '113.0.0', NULL) + (TRUE, NOW(), NOW(), '114.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 589ba87700..8369499658 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -124,7 +124,7 @@ impl CrucibleDataInner { key_pem: None, root_pem: None, source: None, - read_only: false, + read_only: params.source.is_some(), }; let old = self.regions.insert(id, region.clone());
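
Illustrative follow-up to the volume_resource_usage table added above (a
sketch against the columns created in up04.sql, not a query shipped in this
patch): a read-only resource is eligible for garbage collection once no usage
records reference it. For a region snapshot identified by dataset, region, and
snapshot ids bound as $1, $2, and $3:

    SELECT COUNT(*)
    FROM omicron.public.volume_resource_usage
    WHERE usage_type = 'region_snapshot'
      AND region_snapshot_dataset_id = $1
      AND region_snapshot_region_id = $2
      AND region_snapshot_snapshot_id = $3;

A count of zero means no volume uses the snapshot any longer; read-only
regions are checked the same way through the region_id column, served by the
lookup_volume_resource_usage_by_region index.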