diff --git a/control-plane/agents/src/bin/core/controller/reconciler/pool/mod.rs b/control-plane/agents/src/bin/core/controller/reconciler/pool/mod.rs index 73a6bbfe2..b2572e3d4 100644 --- a/control-plane/agents/src/bin/core/controller/reconciler/pool/mod.rs +++ b/control-plane/agents/src/bin/core/controller/reconciler/pool/mod.rs @@ -53,6 +53,7 @@ impl TaskPoller for PoolReconciler { results.push(Self::squash_results(vec![ pool.garbage_collect(context).await, pool.recreate_state(context).await, + retry_create_pool_reconciler(&mut pool, context).await, ])) } capacity::remove_larger_replicas(context.registry()).await; @@ -214,3 +215,12 @@ async fn deleting_pool_spec_reconciler( )) .await } + +#[tracing::instrument(skip(pool, context), level = "trace", fields(pool.id = %pool.id(), request.reconcile = true))] +async fn retry_create_pool_reconciler( + pool: &mut OperationGuardArc, + context: &PollContext, +) -> PollResult { + pool.retry_creating(context.registry()).await?; + Ok(PollerState::Idle) +} diff --git a/control-plane/agents/src/bin/core/controller/registry.rs b/control-plane/agents/src/bin/core/controller/registry.rs index 83dea07ea..41e3db591 100644 --- a/control-plane/agents/src/bin/core/controller/registry.rs +++ b/control-plane/agents/src/bin/core/controller/registry.rs @@ -86,6 +86,11 @@ pub(crate) struct RegistryInner { /// The duration for which the reconciler waits for the replica to /// to be healthy again before attempting to online the faulted child. faulted_child_wait_period: Option, + /// When the pool creation gRPC times out, the actual call in the io-engine + /// may still progress. + /// We wait up to this period before considering the operation a failure and + /// GC'ing the pool. + pool_async_creat_tmo: std::time::Duration, /// Disable partial rebuild for volume targets. disable_partial_rebuild: bool, /// Disable nvmf target access control gates. 
@@ -122,6 +127,7 @@ impl Registry { reconcile_period: std::time::Duration, reconcile_idle_period: std::time::Duration, faulted_child_wait_period: Option, + pool_async_creat_tmo: std::time::Duration, disable_partial_rebuild: bool, disable_target_acc: bool, max_rebuilds: Option, @@ -165,6 +171,7 @@ impl Registry { reconcile_period, reconcile_idle_period, faulted_child_wait_period, + pool_async_creat_tmo, disable_partial_rebuild, disable_target_acc, reconciler: ReconcilerControl::new(), @@ -298,6 +305,10 @@ impl Registry { pub(crate) fn faulted_child_wait_period(&self) -> Option { self.faulted_child_wait_period } + /// Allow for this given time before assuming failure and allowing the pool to get deleted. + pub(crate) fn pool_async_creat_tmo(&self) -> std::time::Duration { + self.pool_async_creat_tmo + } /// The maximum number of concurrent create volume requests. pub(crate) fn create_volume_limit(&self) -> usize { self.create_volume_limit diff --git a/control-plane/agents/src/bin/core/controller/resources/operations_helper.rs b/control-plane/agents/src/bin/core/controller/resources/operations_helper.rs index 3bf259913..ef855a812 100644 --- a/control-plane/agents/src/bin/core/controller/resources/operations_helper.rs +++ b/control-plane/agents/src/bin/core/controller/resources/operations_helper.rs @@ -51,9 +51,9 @@ enum SpecError { } /// What to do when creation fails. +#[derive(Debug)] pub(crate) enum OnCreateFail { /// Leave object as `Creating`, could allow for frontend retries. - #[allow(unused)] LeaveAsIs, /// When frontend retries don't make sense, set it to deleting so we can clean-up. 
SetDeleting, @@ -68,7 +68,28 @@ impl OnCreateFail { pub(crate) fn eeinval_delete(result: &Result) -> Self { match result { Err(error) if error.tonic_code() == tonic::Code::InvalidArgument => Self::Delete, - Err(error) if error.tonic_code() == tonic::Code::NotFound => Self::Delete, + _ => Self::SetDeleting, + } + } + /// Map errors into `Self` for pool creation requests, specifically. + pub(crate) fn on_pool_create_err(result: &Result) -> Self { + let Err(ref error) = result else { + // nonsensical but that's how the api is today... + return Self::SetDeleting; + }; + match error.tonic_code() { + // 1. the disk is open by another pool or bdev + // 2. the disk contains a pool with another name + tonic::Code::InvalidArgument => Self::Delete, + // 1. the pool disk is not available (ie not found or broken somehow) + tonic::Code::NotFound => Self::Delete, + // In this case, it's the pool operator's job to attempt re-creation of the pool. + // 1. pre-2.6 dataplane, contention on the pool service + // 2. pool disk is very slow or extremely large + // 3. dataplane core is shared with other processes + // TODO: use higher timeout on larger pool sizes or potentially make this + // an async operation. + tonic::Code::Cancelled => Self::LeaveAsIs, _ => Self::SetDeleting, } } diff --git a/control-plane/agents/src/bin/core/main.rs b/control-plane/agents/src/bin/core/main.rs index e0b584dff..745d52f4f 100644 --- a/control-plane/agents/src/bin/core/main.rs +++ b/control-plane/agents/src/bin/core/main.rs @@ -47,6 +47,13 @@ pub(crate) struct CliArgs { #[clap(long)] pub(crate) faulted_child_wait_period: Option, + /// When the pool creation gRPC times out, the actual call in the io-engine + /// may still progress. + /// We wait up to this period before considering the operation a failure and + /// GC'ing the pool. + #[clap(long, default_value = "15m")] + pub(crate) pool_async_creat_tmo: humantime::Duration, + /// Disable partial rebuild for volume targets. 
#[clap(long, env = "DISABLE_PARTIAL_REBUILD")] pub(crate) disable_partial_rebuild: bool, @@ -194,6 +201,7 @@ async fn server(cli_args: CliArgs) -> anyhow::Result<()> { cli_args.reconcile_period.into(), cli_args.reconcile_idle_period.into(), cli_args.faulted_child_wait_period.map(|t| t.into()), + cli_args.pool_async_creat_tmo.into(), cli_args.disable_partial_rebuild, cli_args.disable_target_acc, cli_args.max_rebuilds, diff --git a/control-plane/agents/src/bin/core/pool/operations_helper.rs b/control-plane/agents/src/bin/core/pool/operations_helper.rs index ce02ba3ae..0ed7152bf 100644 --- a/control-plane/agents/src/bin/core/pool/operations_helper.rs +++ b/control-plane/agents/src/bin/core/pool/operations_helper.rs @@ -1,14 +1,77 @@ -use crate::controller::{ - registry::Registry, - resources::{operations::ResourceLifecycle, OperationGuardArc}, +use crate::{ + controller::{ + io_engine::PoolApi, + registry::Registry, + resources::{ + operations::ResourceLifecycle, + operations_helper::{GuardedOperationsHelper, OnCreateFail, SpecOperationsHelper}, + OperationGuardArc, + }, + }, + node::wrapper::GetterOps, }; use agents::{errors, errors::SvcError}; use snafu::OptionExt; +use std::ops::Deref; use stor_port::types::v0::{ - store::replica::{PoolRef, ReplicaSpec}, - transport::{DestroyReplica, NodeId, ReplicaOwners}, + store::{ + pool::PoolSpec, + replica::{PoolRef, ReplicaSpec}, + }, + transport::{CreatePool, DestroyReplica, NodeId, ReplicaOwners}, }; +impl OperationGuardArc { + /// Retries the creation of the pool which is being done in the background by the io-engine. + /// This may happen if the pool create gRPC times out, for very large pools. + /// We could increase the timeout but as things stand today that would block all gRPC + /// access to the node. + /// TODO: Since the data-plane now allows concurrent gRPC we should also modify the + /// control-plane to allow this, which would allow us to set large timeouts for some gRPCs. 
+ pub(crate) async fn retry_creating(&mut self, registry: &Registry) -> Result<(), SvcError> { + let request = { + let spec = self.lock(); + if on_create_fail(&spec, registry).is_some() { + return Ok(()); + } + CreatePool::from(spec.deref()) + }; + + let node = registry.node_wrapper(&request.node).await?; + if node.pool(&request.id).await.is_none() { + return Ok(()); + } + + let _ = self.start_create(registry, &request).await?; + let result = node.create_pool(&request).await; + let _state = self + .complete_create(result, registry, OnCreateFail::LeaveAsIs) + .await?; + + Ok(()) + } + + /// Get the `OnCreateFail` policy. + /// For more information see [`Self::retry_creating`]. + pub(crate) fn on_create_fail(&self, registry: &Registry) -> OnCreateFail { + let spec = self.lock(); + on_create_fail(&spec, registry).unwrap_or(OnCreateFail::LeaveAsIs) + } +} + +fn on_create_fail(pool: &PoolSpec, registry: &Registry) -> Option { + if !pool.status().creating() { + return Some(OnCreateFail::LeaveAsIs); + } + let Some(last_mod_elapsed) = pool.creat_tsc.and_then(|t| t.elapsed().ok()) else { + return Some(OnCreateFail::SetDeleting); + }; + if last_mod_elapsed > registry.pool_async_creat_tmo() { + return Some(OnCreateFail::SetDeleting); + } + None +} + impl OperationGuardArc { /// Destroy the replica from its volume pub(crate) async fn destroy_volume_replica( diff --git a/control-plane/agents/src/bin/core/pool/pool_operations.rs b/control-plane/agents/src/bin/core/pool/pool_operations.rs index da3ac6730..34d27abb0 100644 --- a/control-plane/agents/src/bin/core/pool/pool_operations.rs +++ b/control-plane/agents/src/bin/core/pool/pool_operations.rs @@ -65,8 +65,7 @@ impl ResourceLifecycle for OperationGuardArc { let _ = pool.start_create(registry, request).await?; let result = node.create_pool(request).await; - let on_fail = OnCreateFail::eeinval_delete(&result); - + let on_fail = OnCreateFail::on_pool_create_err(&result); let state = pool.complete_create(result, registry, 
on_fail).await?; let spec = pool.lock().clone(); Ok(Pool::new(spec, Some(CtrlPoolState::new(state)))) @@ -93,14 +92,11 @@ impl ResourceLifecycle for OperationGuardArc { Err(SvcError::PoolNotFound { .. }) => { match node.import_pool(&self.as_ref().into()).await { Ok(_) => node.destroy_pool(request).await, - Err(error) - if allow_not_found - && error.tonic_code() == tonic::Code::InvalidArgument => - { - Ok(()) - } - Err(error) if error.tonic_code() == tonic::Code::InvalidArgument => Ok(()), - Err(error) => Err(error), + Err(error) => match error.tonic_code() { + tonic::Code::NotFound if allow_not_found => Ok(()), + tonic::Code::InvalidArgument => Ok(()), + _other => Err(error), + }, } } Err(error) => Err(error), diff --git a/control-plane/agents/src/bin/core/pool/specs.rs b/control-plane/agents/src/bin/core/pool/specs.rs index 2ce080600..bcd070671 100644 --- a/control-plane/agents/src/bin/core/pool/specs.rs +++ b/control-plane/agents/src/bin/core/pool/specs.rs @@ -425,7 +425,9 @@ impl ResourceSpecsLocked { let pools = self.pools_rsc(); for pool in pools { if let Ok(mut guard) = pool.operation_guard() { - if !guard.handle_incomplete_ops(registry).await { + let on_fail = guard.on_create_fail(registry); + + if !guard.handle_incomplete_ops_ext(registry, on_fail).await { // Not all pending operations could be handled. 
pending_ops = true; } diff --git a/control-plane/agents/src/bin/core/tests/deserializer.rs b/control-plane/agents/src/bin/core/tests/deserializer.rs index b56ebbfb6..3b8378043 100644 --- a/control-plane/agents/src/bin/core/tests/deserializer.rs +++ b/control-plane/agents/src/bin/core/tests/deserializer.rs @@ -152,6 +152,7 @@ fn test_deserialization_v1_to_v2() { }, sequencer: Default::default(), operation: None, + creat_tsc: None, }), }, TestEntry { diff --git a/control-plane/agents/src/bin/core/tests/pool/mod.rs b/control-plane/agents/src/bin/core/tests/pool/mod.rs index 649246558..de6b1c10a 100644 --- a/control-plane/agents/src/bin/core/tests/pool/mod.rs +++ b/control-plane/agents/src/bin/core/tests/pool/mod.rs @@ -21,7 +21,10 @@ use stor_port::{ VolumePolicy, }, }, - store::replica::{ReplicaSpec, ReplicaSpecKey}, + store::{ + pool::PoolLabel, + replica::{ReplicaSpec, ReplicaSpecKey}, + }, transport::{ CreatePool, CreateReplica, DestroyPool, DestroyReplica, Filter, GetSpecs, NexusId, NodeId, Protocol, Replica, ReplicaId, ReplicaName, ReplicaOwners, ReplicaShareProtocol, @@ -1027,3 +1030,86 @@ async fn destroy_after_restart() { assert_eq!(pool.state().unwrap().id, create.id); } + +#[tokio::test] +async fn slow_create() { + const POOL_SIZE_BYTES: u64 = 128 * 1024 * 1024; + + let vg = deployer_cluster::lvm::VolGroup::new("slow-pool-1", POOL_SIZE_BYTES).unwrap(); + let lvol = vg.create_lvol("lvol0", POOL_SIZE_BYTES / 2).unwrap(); + lvol.suspend().unwrap(); + { + let cluster = ClusterBuilder::builder() + .with_io_engines(1) + .with_reconcile_period(Duration::from_secs(2), Duration::from_secs(2)) + .with_cache_period("1s") + .with_options(|o| o.with_io_engine_devices(vec![lvol.path()])) + // .with_req_timeouts(Duration::from_secs(2), Duration::from_secs(2)) + .compose_build(|b| b.with_clean(true)) + .await + .unwrap(); + + let client = cluster.grpc_client(); + + let create = CreatePool { + node: cluster.node(0), + id: "bob".into(), + disks: vec![lvol.path().into()], + 
labels: Some(PoolLabel::from([("a".into(), "b".into())])), + }; + + let error = client + .pool() + .create(&create, None) + .await + .expect_err("device suspended"); + // TODO: check if the errors are being mapped correctly! + assert_eq!(error.kind, ReplyErrorKind::Cancelled); + + lvol.resume().unwrap(); + + let start = std::time::Instant::now(); + let timeout = std::time::Duration::from_secs(30); + loop { + if std::time::Instant::now() > (start + timeout) { + panic!("Timeout waiting for the pool"); + } + tokio::time::sleep(Duration::from_millis(100)).await; + + let pools = client + .pool() + .get(Filter::Pool(create.id.clone()), None) + .await + .unwrap(); + + let Some(pool) = pools.0.first() else { + continue; + }; + let Some(pool_spec) = pool.spec() else { + continue; + }; + if !pool_spec.status.created() { + continue; + } + break; + } + let destroy = DestroyPool::from(create.clone()); + client.pool().destroy(&destroy, None).await.unwrap(); + + // Now we try to recreate using an API call, rather than using the reconciler + lvol.suspend().unwrap(); + + let error = client + .pool() + .create(&create, None) + .await + .expect_err("device suspended"); + // TODO: check if the errors are being mapped correctly! 
+ assert_eq!(error.kind, ReplyErrorKind::Cancelled); + + lvol.resume().unwrap(); + + let pool = client.pool().create(&create, None).await.unwrap(); + assert!(pool.spec().unwrap().status.created()); + } +} diff --git a/control-plane/agents/src/common/errors.rs b/control-plane/agents/src/common/errors.rs index 7d5001cc1..89d31c0b0 100644 --- a/control-plane/agents/src/common/errors.rs +++ b/control-plane/agents/src/common/errors.rs @@ -1124,7 +1124,7 @@ fn grpc_to_reply_error(error: SvcError) -> ReplyError { } => { let kind = match source.code() { Code::Ok => ReplyErrorKind::Internal, - Code::Cancelled => ReplyErrorKind::Internal, + Code::Cancelled => ReplyErrorKind::Cancelled, Code::Unknown => ReplyErrorKind::Internal, Code::InvalidArgument => ReplyErrorKind::InvalidArgument, Code::DeadlineExceeded => ReplyErrorKind::DeadlineExceeded, diff --git a/control-plane/grpc/proto/v1/misc/common.proto b/control-plane/grpc/proto/v1/misc/common.proto index 403f3805e..c4569eaa1 100644 --- a/control-plane/grpc/proto/v1/misc/common.proto +++ b/control-plane/grpc/proto/v1/misc/common.proto @@ -69,6 +69,7 @@ enum ReplyErrorKind { InUse = 29; CapacityLimitExceeded = 30; NotAcceptable = 31; + Cancelled = 32; } // ResourceKind for the resource which has undergone this error diff --git a/control-plane/grpc/src/misc/traits.rs b/control-plane/grpc/src/misc/traits.rs index 5ff69c1b8..8f6cc1480 100644 --- a/control-plane/grpc/src/misc/traits.rs +++ b/control-plane/grpc/src/misc/traits.rs @@ -102,6 +102,7 @@ impl From for common::ReplyErrorKind { ReplyErrorKind::InUse => Self::InUse, ReplyErrorKind::CapacityLimitExceeded => Self::CapacityLimitExceeded, ReplyErrorKind::NotAcceptable => Self::NotAcceptable, + ReplyErrorKind::Cancelled => Self::Cancelled, } } } @@ -141,6 +142,7 @@ impl From for ReplyErrorKind { common::ReplyErrorKind::InUse => Self::InUse, common::ReplyErrorKind::CapacityLimitExceeded => Self::CapacityLimitExceeded, common::ReplyErrorKind::NotAcceptable => Self::NotAcceptable, 
+ common::ReplyErrorKind::Cancelled => Self::Cancelled, } } } diff --git a/control-plane/grpc/src/operations/pool/traits.rs b/control-plane/grpc/src/operations/pool/traits.rs index 1c0c653c3..ff2258f25 100644 --- a/control-plane/grpc/src/operations/pool/traits.rs +++ b/control-plane/grpc/src/operations/pool/traits.rs @@ -97,6 +97,7 @@ impl TryFrom for PoolSpec { }, sequencer: Default::default(), operation: None, + creat_tsc: None, }) } } diff --git a/control-plane/rest/openapi-specs/v0_api_spec.yaml b/control-plane/rest/openapi-specs/v0_api_spec.yaml index a0a516470..c961df596 100644 --- a/control-plane/rest/openapi-specs/v0_api_spec.yaml +++ b/control-plane/rest/openapi-specs/v0_api_spec.yaml @@ -3268,6 +3268,7 @@ components: - InUse - CapacityLimitExceeded - NotAcceptable + - Cancelled required: - details - kind diff --git a/control-plane/stor-port/src/transport_api/mod.rs b/control-plane/stor-port/src/transport_api/mod.rs index d87df6c2f..c59a66cb8 100644 --- a/control-plane/stor-port/src/transport_api/mod.rs +++ b/control-plane/stor-port/src/transport_api/mod.rs @@ -444,11 +444,14 @@ pub enum ReplyErrorKind { InUse, CapacityLimitExceeded, NotAcceptable, + Cancelled, } impl From for ReplyErrorKind { fn from(code: tonic::Code) -> Self { match code { + Code::Ok => Self::Internal, + Code::Unknown => Self::Internal, Code::InvalidArgument => Self::InvalidArgument, Code::DeadlineExceeded => Self::DeadlineExceeded, Code::NotFound => Self::NotFound, @@ -463,7 +466,7 @@ impl From for ReplyErrorKind { Code::Unavailable => Self::Unavailable, Code::DataLoss => Self::FailedPersist, Code::Unauthenticated => Self::Unauthenticated, - _ => Self::Aborted, + Code::Cancelled => Self::Cancelled, } } } diff --git a/control-plane/stor-port/src/types/mod.rs b/control-plane/stor-port/src/types/mod.rs index 56cb8a917..1b2c347de 100644 --- a/control-plane/stor-port/src/types/mod.rs +++ b/control-plane/stor-port/src/types/mod.rs @@ -142,6 +142,10 @@ impl From for RestError { let error = 
RestJsonError::new(details, message, Kind::NotAcceptable); (StatusCode::NOT_ACCEPTABLE, error) } + ReplyErrorKind::Cancelled => { + let error = RestJsonError::new(details, message, Kind::Cancelled); + (StatusCode::GATEWAY_TIMEOUT, error) + } }; RestError::new(status, error) diff --git a/control-plane/stor-port/src/types/v0/store/pool.rs b/control-plane/stor-port/src/types/v0/store/pool.rs index f6a344051..b4fc9a3f9 100644 --- a/control-plane/stor-port/src/types/v0/store/pool.rs +++ b/control-plane/stor-port/src/types/v0/store/pool.rs @@ -53,6 +53,17 @@ impl From<&CreatePool> for PoolSpec { labels: request.labels.clone(), sequencer: OperationSequence::new(), operation: None, + creat_tsc: None, + } + } +} +impl From<&PoolSpec> for CreatePool { + fn from(pool: &PoolSpec) -> Self { + Self { + node: pool.node.clone(), + id: pool.id.clone(), + disks: pool.disks.clone(), + labels: pool.labels.clone(), } } } @@ -61,6 +72,7 @@ impl PartialEq for PoolSpec { let mut other = PoolSpec::from(other); other.status = self.status.clone(); other.sequencer = self.sequencer.clone(); + other.creat_tsc = self.creat_tsc; &other == self } } @@ -83,6 +95,9 @@ pub struct PoolSpec { pub sequencer: OperationSequence, /// Record of the operation in progress pub operation: Option, + /// Last modification timestamp. 
+ #[serde(skip)] + pub creat_tsc: Option, } impl PoolSpec { @@ -206,6 +221,9 @@ impl SpecTransaction for PoolSpec { } fn start_op(&mut self, operation: PoolOperation) { + if matches!(operation, PoolOperation::Create) && self.creat_tsc.is_none() { + self.creat_tsc = Some(std::time::SystemTime::now()); + } self.operation = Some(PoolOperationState { operation, result: None, diff --git a/control-plane/stor-port/src/types/v0/transport/pool.rs b/control-plane/stor-port/src/types/v0/transport/pool.rs index be5d211c9..3567cc653 100644 --- a/control-plane/stor-port/src/types/v0/transport/pool.rs +++ b/control-plane/stor-port/src/types/v0/transport/pool.rs @@ -346,6 +346,11 @@ impl DestroyPool { Self { node, id } } } +impl From for DestroyPool { + fn from(value: CreatePool) -> Self { + Self::new(value.node, value.id) + } +} /// Label Pool Request. #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] diff --git a/terraform/cluster/mod/k8s/repo.sh b/terraform/cluster/mod/k8s/repo.sh index a561ec7d5..12044c52c 100644 --- a/terraform/cluster/mod/k8s/repo.sh +++ b/terraform/cluster/mod/k8s/repo.sh @@ -45,10 +45,13 @@ fi sudo mkdir -p /etc/apt/keyrings/ KUBE_APT_V=$(echo "${kube_version}" | awk -F. '{ sub(/-.*/, ""); print "v" $1 "." $2 }') -curl -fsSL https://pkgs.k8s.io/core:/stable:/$KUBE_APT_V/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg -cat < &str { self.inner.uri() } + /// Disk path on the host. + pub fn path(&self) -> &str { + &self.inner.path + } /// Get the inner disk if there are no other references to it. pub fn into_inner(self) -> Result> { @@ -652,7 +657,10 @@ impl TmpDiskFileInner { } } fn make_path(name: &str) -> String { - format!("/tmp/io-engine-disk-{name}") + // todo: use known path to facilitate cleanup. 
+ // let root = std::env::var("WORKSPACE_ROOT").as_deref().unwrap_or("/tmp"); + let root = "/tmp"; + format!("{root}/io-engine-disk-{name}") } fn uri(&self) -> &str { &self.uri diff --git a/utils/deployer-cluster/src/lvm.rs b/utils/deployer-cluster/src/lvm.rs new file mode 100644 index 000000000..fb75c2bd9 --- /dev/null +++ b/utils/deployer-cluster/src/lvm.rs @@ -0,0 +1,100 @@ +//! LVM helper methods which are useful for setting up test block devices. + +use crate::TmpDiskFile; + +/// An LVM Logical Volume. +pub struct Lvol { + name: String, + path: String, +} +impl Lvol { + /// Get the host path for the lvol. + pub fn path(&self) -> &str { + &self.path + } + /// Suspends the device for IO. + pub fn suspend(&self) -> anyhow::Result<()> { + let _ = VolGroup::command(&["dmsetup", "suspend", self.path.as_str()])?; + Ok(()) + } + /// Resumes the device for IO. + pub fn resume(&self) -> anyhow::Result<()> { + let _ = VolGroup::command(&["dmsetup", "resume", self.path.as_str()])?; + Ok(()) + } +} +impl Drop for Lvol { + fn drop(&mut self) { + println!("Dropping Lvol {}", self.name); + self.resume().ok(); + } +} + +/// An LVM Volume Group. +pub struct VolGroup { + backing_file: TmpDiskFile, + dev_loop: String, + name: String, +} + +impl VolGroup { + /// Creates a new LVM Volume Group. + pub fn new(name: &str, size: u64) -> Result { + let backing_file = TmpDiskFile::new(name, size); + + let dev_loop = Self::command(&["losetup", "--show", "-f", backing_file.path()])?; + let dev_loop = dev_loop.trim_end().to_string(); + let _ = Self::command(&["pvcreate", dev_loop.as_str()])?; + let _ = Self::command(&["vgcreate", name, dev_loop.as_str()])?; + + Ok(Self { + backing_file, + dev_loop, + name: name.to_string(), + }) + } + /// Creates a new Lvol for the LVM Volume Group. 
+ pub fn create_lvol(&self, name: &str, size: u64) -> Result { + let size = format!("{size}B"); + + let vg_name = self.name.as_str(); + let _ = Self::command(&["lvcreate", "-L", size.as_str(), "-n", name, vg_name])?; + + Ok(Lvol { + name: name.to_string(), + path: format!("/dev/{vg_name}/{name}"), + }) + } + /// Run a command with sudo, and the given args. + /// The output string is returned. + fn command(args: &[&str]) -> Result { + let cmd = args.first().unwrap(); + let output = std::process::Command::new("sudo") + .arg("-E") + .args(args) + .output()?; + if !output.status.success() { + return Err(anyhow::anyhow!( + "{cmd} Exit Code: {}\nstdout: {}, stderr: {}", + output.status, + String::from_utf8(output.stdout).unwrap_or_default(), + String::from_utf8(output.stderr).unwrap_or_default() + )); + } + let output = String::from_utf8(output.stdout)?; + Ok(output) + } +} + +impl Drop for VolGroup { + fn drop(&mut self) { + println!( + "Dropping VolGroup {} <== {}", + self.name, + self.backing_file.path() + ); + + let _ = Self::command(&["vgremove", self.name.as_str(), "-y"]); + let _ = Self::command(&["losetup", "-d", self.dev_loop.as_str()]); + } +}