From 557416f9edf488d85375232b60cc5d9258e68b95 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 16 Mar 2026 13:33:18 -0700 Subject: [PATCH 1/5] support NVMe Deallocate --- .../src/lib/stats/virtual_disk.rs | 3 +- lib/propolis/src/block/crucible.rs | 2 +- lib/propolis/src/block/file.rs | 12 +- lib/propolis/src/block/in_memory.rs | 2 +- lib/propolis/src/block/mem_async.rs | 2 +- lib/propolis/src/block/minder.rs | 6 +- lib/propolis/src/block/mod.rs | 26 +++-- lib/propolis/src/hw/nvme/bits.rs | 35 ++++++ lib/propolis/src/hw/nvme/cmds.rs | 107 +++++++++++++++++- lib/propolis/src/hw/nvme/mod.rs | 2 + lib/propolis/src/hw/nvme/requests.rs | 38 ++++++- lib/propolis/src/hw/virtio/block.rs | 4 +- 12 files changed, 212 insertions(+), 27 deletions(-) diff --git a/bin/propolis-server/src/lib/stats/virtual_disk.rs b/bin/propolis-server/src/lib/stats/virtual_disk.rs index 4a39cfde0..3049ef403 100644 --- a/bin/propolis-server/src/lib/stats/virtual_disk.rs +++ b/bin/propolis-server/src/lib/stats/virtual_disk.rs @@ -77,9 +77,10 @@ impl VirtualDiskStats { self.on_write_completion(result, len, duration) } Operation::Flush => self.on_flush_completion(result, duration), - Operation::Discard(..) => { + Operation::Discard => { // Discard is not wired up in backends we care about for now, so // it can safely be ignored. + // XXX no longer true } } } diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index 5ba04a14b..1ddc68171 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -145,7 +145,7 @@ impl WorkerState { let _ = block.flush(None).await?; } } - block::Operation::Discard(..) 
=> { + block::Operation::Discard => { // Crucible does not support discard operations for now return Err(Error::Unsupported); } diff --git a/lib/propolis/src/block/file.rs b/lib/propolis/src/block/file.rs index 0d95aa43f..5fac93ba0 100644 --- a/lib/propolis/src/block/file.rs +++ b/lib/propolis/src/block/file.rs @@ -113,12 +113,16 @@ impl SharedState { self.fp.sync_data().map_err(|_| "io error")?; } } - block::Operation::Discard(off, len) => { + block::Operation::Discard => { if let Some(mech) = self.discard_mech { - dkioc::do_discard(&self.fp, mech, off as u64, len as u64) + for &(off, len) in &req.ranges { + dkioc::do_discard( + &self.fp, mech, off as u64, len as u64, + ) .map_err(|_| { - "io error while attempting to free block(s)" - })?; + "io error while attempting to free block(s)" + })?; + } } else { unreachable!("handled above in processing_loop()"); } diff --git a/lib/propolis/src/block/in_memory.rs b/lib/propolis/src/block/in_memory.rs index 949412a45..9d45c1fa1 100644 --- a/lib/propolis/src/block/in_memory.rs +++ b/lib/propolis/src/block/in_memory.rs @@ -86,7 +86,7 @@ impl SharedState { block::Operation::Flush => { // nothing to do } - block::Operation::Discard(..) => { + block::Operation::Discard => { unreachable!("handled in processing_loop()"); } } diff --git a/lib/propolis/src/block/mem_async.rs b/lib/propolis/src/block/mem_async.rs index d834bd388..d376189dc 100644 --- a/lib/propolis/src/block/mem_async.rs +++ b/lib/propolis/src/block/mem_async.rs @@ -89,7 +89,7 @@ impl SharedState { block::Operation::Flush => { // nothing to do } - block::Operation::Discard(..) 
=> { + block::Operation::Discard => { unreachable!("handled in processing_loop()") } } diff --git a/lib/propolis/src/block/minder.rs b/lib/propolis/src/block/minder.rs index c9c62f576..e9ff8fa54 100644 --- a/lib/propolis/src/block/minder.rs +++ b/lib/propolis/src/block/minder.rs @@ -283,9 +283,9 @@ impl QueueMinder { Operation::Flush => { probes::block_begin_flush!(|| { (devqid, id) }); } - Operation::Discard(off, len) => { + Operation::Discard => { probes::block_begin_discard!(|| { - (devqid, id, off as u64, len as u64) + (devqid, id, req.ranges.len() as u64) }); } } @@ -355,7 +355,7 @@ impl QueueMinder { (devqid, id, rescode, ns_processed, ns_queued) }); } - Operation::Discard(..) => { + Operation::Discard => { probes::block_complete_discard!(|| { (devqid, id, rescode, ns_processed, ns_queued) }); diff --git a/lib/propolis/src/block/mod.rs b/lib/propolis/src/block/mod.rs index 674f2a985..380acfb34 100644 --- a/lib/propolis/src/block/mod.rs +++ b/lib/propolis/src/block/mod.rs @@ -50,7 +50,7 @@ mod probes { fn block_begin_read(devq_id: u64, req_id: u64, offset: u64, len: u64) {} fn block_begin_write(devq_id: u64, req_id: u64, offset: u64, len: u64) {} fn block_begin_flush(devq_id: u64, req_id: u64) {} - fn block_begin_discard(devq_id: u64, req_id: u64, offset: u64, len: u64) {} + fn block_begin_discard(devq_id: u64, req_id: u64, nr: u64) {} fn block_complete_read( devq_id: u64, @@ -106,8 +106,8 @@ pub enum Operation { Write(ByteOffset, ByteLen), /// Flush buffer(s) Flush, - /// Discard/UNMAP/deallocate region - Discard(ByteOffset, ByteLen), + /// Discard/UNMAP/deallocate some ranges, which are specified in Request::ranges + Discard, } impl Operation { pub const fn is_read(&self) -> bool { @@ -120,7 +120,7 @@ impl Operation { matches!(self, Operation::Flush) } pub const fn is_discard(&self) -> bool { - matches!(self, Operation::Discard(..)) + matches!(self, Operation::Discard) } } @@ -203,6 +203,10 @@ pub struct Request { /// A list of regions of guest memory to 
read/write into as part of the I/O /// request pub regions: Vec, + + /// A list of byte ranges to discard as part of the I/O request. This is only + /// relevant for discard operations, and is expected to be empty otherwise. + pub ranges: Vec<(ByteOffset, ByteLen)>, } impl Request { pub fn new_read( @@ -210,7 +214,7 @@ impl Request { len: ByteLen, regions: Vec, ) -> Self { - Self { op: Operation::Read(off, len), regions } + Self { op: Operation::Read(off, len), regions, ranges: Vec::new() } } pub fn new_write( @@ -218,17 +222,17 @@ impl Request { len: ByteLen, regions: Vec, ) -> Self { - Self { op: Operation::Write(off, len), regions } + Self { op: Operation::Write(off, len), regions, ranges: Vec::new() } } pub fn new_flush() -> Self { let op = Operation::Flush; - Self { op, regions: Vec::new() } + Self { op, regions: Vec::new(), ranges: Vec::new() } } - pub fn new_discard(off: ByteOffset, len: ByteLen) -> Self { - let op = Operation::Discard(off, len); - Self { op, regions: Vec::new() } + pub fn new_discard(ranges: Vec<(ByteOffset, ByteLen)>) -> Self { + let op = Operation::Discard; + Self { op, regions: Vec::new(), ranges } } pub fn mappings<'a>(&self, mem: &'a MemCtx) -> Option>> { @@ -239,7 +243,7 @@ impl Request { Operation::Write(..) => { self.regions.iter().map(|r| mem.readable_region(r)).collect() } - Operation::Flush | Operation::Discard(..) => None, + Operation::Flush | Operation::Discard => None, } } } diff --git a/lib/propolis/src/hw/nvme/bits.rs b/lib/propolis/src/hw/nvme/bits.rs index c227ae1f4..dab6d3757 100644 --- a/lib/propolis/src/hw/nvme/bits.rs +++ b/lib/propolis/src/hw/nvme/bits.rs @@ -4,6 +4,7 @@ #![allow(dead_code)] +use crate::block::{ByteLen, ByteOffset}; use bitstruct::bitstruct; use zerocopy::FromBytes; @@ -176,6 +177,38 @@ impl CompletionQueueEntry { } } +/// A Dataset Management Range Definition as represented in memory. 
+/// +/// See NVMe 1.0e Section 6.6 Figure 114: Dataset Management – Range Definition +#[derive(Debug, Default, Copy, Clone, FromBytes)] +#[repr(C, packed(1))] +pub struct DatasetManagementRangeDefinition { + /// The context attributes specified for each range provides information about how the range + /// is intended to be used by host software. The use of this information is optional and the + /// controller is not required to perform any specific action. + pub context_attributes: u32, + + pub number_logical_blocks: u32, + + pub starting_lba: u64, +} +impl DatasetManagementRangeDefinition { + pub fn new( + context_attributes: u32, + number_logical_blocks: u32, + starting_lba: u64, + ) -> Self { + Self { context_attributes, number_logical_blocks, starting_lba } + } + + pub fn offset_len(&self, lba_data_size: u64) -> (ByteOffset, ByteLen) { + ( + (self.starting_lba * lba_data_size) as ByteOffset, + (self.number_logical_blocks as u64 * lba_data_size) as ByteLen, + ) + } +} + // Register bits bitstruct! { @@ -539,6 +572,8 @@ pub const NVM_OPC_FLUSH: u8 = 0x00; pub const NVM_OPC_WRITE: u8 = 0x01; /// Read Command Opcode pub const NVM_OPC_READ: u8 = 0x02; +/// Dataset Mangement Command Opcode +pub const NVM_OPC_DATASET_MANAGEMENT: u8 = 0x09; // Generic Command Status values // See NVMe 1.0e Section 4.5.1.2.1, Figure 17 Status Code - Generic Command Status Values diff --git a/lib/propolis/src/hw/nvme/cmds.rs b/lib/propolis/src/hw/nvme/cmds.rs index b11234aea..5187d1ff5 100644 --- a/lib/propolis/src/hw/nvme/cmds.rs +++ b/lib/propolis/src/hw/nvme/cmds.rs @@ -2,7 +2,10 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-use super::bits::{self, StatusCodeType, SubmissionQueueEntry}; +use super::bits::{ + self, DatasetManagementRangeDefinition, StatusCodeType, + SubmissionQueueEntry, +}; use super::queue::{QueueCreateErr, QueueId}; use crate::block; use crate::common::*; @@ -678,6 +681,8 @@ pub enum NvmCmd { Write(WriteCmd), /// Read data and metadata Read(ReadCmd), + /// Dataset Management Command + DatasetManagement(DatasetManagementCmd), /// An unknown NVM command Unknown(GuestData), } @@ -709,6 +714,17 @@ impl NvmCmd { prp1: raw.prp1, prp2: raw.prp2, }), + bits::NVM_OPC_DATASET_MANAGEMENT => { + NvmCmd::DatasetManagement(DatasetManagementCmd { + prp1: raw.prp1, + prp2: raw.prp2, + // Convert from 0's based value + nr: (raw.cdw10 & 0xFF) as u16 + 1, + ad: raw.cdw11 & (1 << 2) != 0, + idw: raw.cdw11 & (1 << 1) != 0, + idr: raw.cdw11 & (1 << 0) != 0, + }) + } _ => NvmCmd::Unknown(raw), }; Ok(cmd) @@ -779,6 +795,95 @@ impl ReadCmd { } } +/// Dataset Management Command Parameters +#[derive(Debug)] +#[allow(dead_code)] +pub struct DatasetManagementCmd { + /// PRP Entry 1 (PRP1) + /// + /// Indicates a data buffer that contains the LBA range information. + prp1: u64, + + /// PRP Entry 2 (PRP2) + /// + /// This field contains the second PRP entry that specifies the location where data should be + /// transferred from (if there is a physical discontinuity). + prp2: u64, + + /// Number of Ranges (NR) + /// + /// Indicates the number of 16 byte range sets that are specified in the command. This is a + /// 0’s based value. + pub nr: u16, + + /// Attribute – Deallocate (AD) + /// + /// If set to ‘1’ then the NVM subsystem may deallocate all provided ranges. If a read occurs + /// to a deallocated range, the NVM Express subsystem shall return all zeros, all ones, or + /// the last data written to the associated LBA. 
+ /// + /// Note: The operation of the Deallocate function is similar to the ATA DATA SET MANAGEMENT + /// with Trim feature described in ACS-2 and SCSI UNMAP command described in SBC-3. + ad: bool, + + /// Attribute – Integral Dataset for Write (IDW) + /// + /// If set to ‘1’ then the dataset should be optimized for write access as an integral unit. + /// The host expects to perform operations on all ranges provided as an integral unit for + /// writes, indicating that if a portion of the dataset is written it is expected that all of + /// the ranges in the dataset are going to be written. + idw: bool, + + /// Attribute – Integral Dataset for Read (IDR) + /// + /// If set to ‘1’ then the dataset should be optimized for read access as an integral unit. + /// The host expects to perform operations on all ranges provided as an integral unit for + /// reads, indicating that if a portion of the dataset is read it is expected that all of the + /// ranges in the dataset are going to be read. + idr: bool, +} + +impl DatasetManagementCmd { + /// Returns an Iterator that yields [`GuestRegion`]'s which contain the array of LBA ranges. + pub fn data<'a>(&self, mem: &'a MemCtx) -> PrpIter<'a> { + PrpIter::new( + u64::from(self.nr) + * size_of::() as u64, + self.prp1, + self.prp2, + mem, + ) + } + + /// Returns an Iterator that yields the LBA ranges specified in this command. Note that if + /// some of the ranges couldn't be read from guest memory, this will yield fewer than + /// `Self.nr` ranges. 
+ pub fn ranges<'a>( + &self, + mem: &'a MemCtx, + ) -> impl Iterator + 'a { + self.data(mem).flat_map(|region| { + let mut ranges = Vec::new(); + if let Some(mapping) = mem.readable_region(®ion) { + ranges.resize_with( + mapping.len() + / size_of::(), + Default::default, + ); + if mapping.read_many(&mut ranges).is_err() { + ranges.clear(); + } + }; + + ranges.into_iter() + }) + } + + pub fn is_deallocate(&self) -> bool { + self.ad + } +} + /// Indicates the possible states of a [`PrpIter`]. #[derive(Clone, Copy, Eq, PartialEq, Debug)] enum PrpNext { diff --git a/lib/propolis/src/hw/nvme/mod.rs b/lib/propolis/src/hw/nvme/mod.rs index 9a3bf657d..7f8f639b1 100644 --- a/lib/propolis/src/hw/nvme/mod.rs +++ b/lib/propolis/src/hw/nvme/mod.rs @@ -846,6 +846,8 @@ impl PciNvme { // Supporting multiple namespaces complicates I/O dispatching, // so for now we limit the device to a single namespace. nn: 1, + // bit 2 indicates support for the Dataset Management command + oncs: (1 << 2), // bit 0 indicates volatile write cache is present vwc: 1, // bit 8 indicates Doorbell Buffer support diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs index 4821b47f0..f4ec661ee 100644 --- a/lib/propolis/src/hw/nvme/requests.rs +++ b/lib/propolis/src/hw/nvme/requests.rs @@ -38,6 +38,9 @@ mod probes { } fn nvme_write_complete(devsq_id: u64, cid: u16, res: u8) {} + fn nvme_discard_enqueue(devsq_id: u64, idx: u16, cid: u16, nr: u16) {} + fn nvme_discard_complete(devsq_id: u64, cid: u16, res: u8) {} + fn nvme_flush_enqueue(devsq_id: u64, idx: u16, cid: u16) {} fn nvme_flush_complete(devsq_id: u64, cid: u16, res: u8) {} @@ -142,6 +145,37 @@ impl block::DeviceQueue for NvmeBlockQueue { Request::new_read(off as usize, size as usize, bufs); return Some((req, permit, None)); } + Ok(NvmCmd::DatasetManagement(cmd)) => { + // We only support the "deallocate" (discard/trim) operation of Dataset + // Management. XXX should we return success? 
+ if !cmd.is_deallocate() { + permit.complete( + Completion::generic_err(bits::STS_INVAL_FIELD) + .dnr(), + ); + continue; + } + probes::nvme_discard_enqueue!(|| ( + sq.devq_id(), + idx, + cid, + cmd.nr, + )); + let ranges: Vec<_> = cmd + .ranges(&mem) + .map(|r| r.offset_len(params.lba_data_size)) + .collect(); + if ranges.len() != cmd.nr as usize { + // If we couldn't read all the ranges, fail the command + permit.complete( + Completion::generic_err(bits::STS_DATA_XFER_ERR) + .dnr(), + ); + continue; + } + let req = Request::new_discard(ranges); + return Some((req, permit, None)); + } Ok(NvmCmd::Flush) => { probes::nvme_flush_enqueue!(|| (sq.devq_id(), idx, cid)); let req = Request::new_flush(); @@ -179,8 +213,8 @@ impl block::DeviceQueue for NvmeBlockQueue { Operation::Flush => { probes::nvme_flush_complete!(|| (devsq_id, cid, resnum)); } - Operation::Discard(..) => { - unreachable!("discard not supported in NVMe for now"); + Operation::Discard => { + probes::nvme_discard_complete!(|| (devsq_id, cid, resnum)); } } diff --git a/lib/propolis/src/hw/virtio/block.rs b/lib/propolis/src/hw/virtio/block.rs index 84c662019..75659e46d 100644 --- a/lib/propolis/src/hw/virtio/block.rs +++ b/lib/propolis/src/hw/virtio/block.rs @@ -198,7 +198,7 @@ impl block::DeviceQueue for BlockVq { rid, off as u64, sz as u64, )); Ok(( - block::Request::new_discard(off, sz), + block::Request::new_discard(vec![(off, sz)]), CompletionToken { rid, chain }, None, )) @@ -245,7 +245,7 @@ impl block::DeviceQueue for BlockVq { block::Operation::Flush => { probes::vioblk_flush_complete!(|| (rid, resnum)); } - block::Operation::Discard(..) 
=> { + block::Operation::Discard => { probes::vioblk_discard_complete!(|| (rid, resnum)); } } From 6c6c6d3886f6a462888252bca0005cc630e421c6 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 13 Apr 2026 12:06:09 -0700 Subject: [PATCH 2/5] ixi code review --- Cargo.lock | 1 + .../src/lib/stats/virtual_disk.rs | 5 +- lib/propolis/Cargo.toml | 1 + lib/propolis/src/block/crucible.rs | 6 +- lib/propolis/src/hw/nvme/cmds.rs | 59 +++++++++++-------- lib/propolis/src/hw/nvme/requests.rs | 23 +++++--- lib/propolis/src/vmm/mem.rs | 22 +++++++ 7 files changed, 79 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c6320d360..482e96ece 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6434,6 +6434,7 @@ dependencies = [ "futures", "iddqd", "ispf", + "itertools 0.13.0", "lazy_static", "libc", "libloading 0.7.4", diff --git a/bin/propolis-server/src/lib/stats/virtual_disk.rs b/bin/propolis-server/src/lib/stats/virtual_disk.rs index 3049ef403..8ee0a4c85 100644 --- a/bin/propolis-server/src/lib/stats/virtual_disk.rs +++ b/bin/propolis-server/src/lib/stats/virtual_disk.rs @@ -78,9 +78,8 @@ impl VirtualDiskStats { } Operation::Flush => self.on_flush_completion(result, duration), Operation::Discard => { - // Discard is not wired up in backends we care about for now, so - // it can safely be ignored. - // XXX no longer true + // Discard is only partially wired up in backends we care about, and it's not + // clear what stats (if any) to report yet. 
} } } diff --git a/lib/propolis/Cargo.toml b/lib/propolis/Cargo.toml index d092121f6..464a293db 100644 --- a/lib/propolis/Cargo.toml +++ b/lib/propolis/Cargo.toml @@ -44,6 +44,7 @@ async-trait.workspace = true iddqd.workspace = true nix.workspace = true vm-attest.workspace = true +itertools.workspace = true # falcon libloading = { workspace = true, optional = true } diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index 1ddc68171..bdcf6ab60 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -146,8 +146,10 @@ impl WorkerState { } } block::Operation::Discard => { - // Crucible does not support discard operations for now - return Err(Error::Unsupported); + // Crucible does not support discard operations for now, so we implement this as + // a no-op (which technically is a valid implementation of discard, just one that + // doesn't actually free any space). + return Ok(()); } } Ok(()) diff --git a/lib/propolis/src/hw/nvme/cmds.rs b/lib/propolis/src/hw/nvme/cmds.rs index 5187d1ff5..0754811c4 100644 --- a/lib/propolis/src/hw/nvme/cmds.rs +++ b/lib/propolis/src/hw/nvme/cmds.rs @@ -31,6 +31,10 @@ pub enum ParseErr { /// An invalid value was specified in the FUSE bits of `CDW0`. #[error("reserved FUSE value specified")] ReservedFuse, + + /// A reserved field was set to a non-zero value. + #[error("reserved field value specified")] + Reserved, } /// A parsed Admin Command @@ -715,14 +719,19 @@ impl NvmCmd { prp2: raw.prp2, }), bits::NVM_OPC_DATASET_MANAGEMENT => { + if (raw.cdw11 & !0b111) != 0 { + // Only the lowest 3 bits of CDW11 are used for Dataset + // Management, so reject if any other bits are set. 
+ return Err(ParseErr::Reserved); + } NvmCmd::DatasetManagement(DatasetManagementCmd { prp1: raw.prp1, prp2: raw.prp2, // Convert from 0's based value nr: (raw.cdw10 & 0xFF) as u16 + 1, ad: raw.cdw11 & (1 << 2) != 0, - idw: raw.cdw11 & (1 << 1) != 0, - idr: raw.cdw11 & (1 << 0) != 0, + _idw: raw.cdw11 & (1 << 1) != 0, + _idr: raw.cdw11 & (1 << 0) != 0, }) } _ => NvmCmd::Unknown(raw), @@ -797,7 +806,6 @@ impl ReadCmd { /// Dataset Management Command Parameters #[derive(Debug)] -#[allow(dead_code)] pub struct DatasetManagementCmd { /// PRP Entry 1 (PRP1) /// @@ -806,14 +814,13 @@ pub struct DatasetManagementCmd { /// PRP Entry 2 (PRP2) /// - /// This field contains the second PRP entry that specifies the location where data should be - /// transferred from (if there is a physical discontinuity). + /// Indicates a second data buffer that contains LBA range information. It may not be a PRP + /// List. prp2: u64, /// Number of Ranges (NR) /// - /// Indicates the number of 16 byte range sets that are specified in the command. This is a - /// 0’s based value. + /// Indicates the number of 16 byte range sets that are specified in the command. pub nr: u16, /// Attribute – Deallocate (AD) @@ -832,7 +839,9 @@ pub struct DatasetManagementCmd { /// The host expects to perform operations on all ranges provided as an integral unit for /// writes, indicating that if a portion of the dataset is written it is expected that all of /// the ranges in the dataset are going to be written. - idw: bool, + /// + /// Note: this field is advisory, and we ignore it. + _idw: bool, /// Attribute – Integral Dataset for Read (IDR) /// @@ -840,7 +849,9 @@ pub struct DatasetManagementCmd { /// The host expects to perform operations on all ranges provided as an integral unit for /// reads, indicating that if a portion of the dataset is read it is expected that all of the /// ranges in the dataset are going to be read. - idr: bool, + /// + /// Note: this field is advisory, and we ignore it. 
+ _idr: bool, } impl DatasetManagementCmd { @@ -855,27 +866,23 @@ impl DatasetManagementCmd { ) } - /// Returns an Iterator that yields the LBA ranges specified in this command. Note that if - /// some of the ranges couldn't be read from guest memory, this will yield fewer than - /// `Self.nr` ranges. + /// Returns an Iterator that yields the LBA ranges specified in this command. If any of the + /// ranges cannot be read from guest memory, yields an error for that range instead. pub fn ranges<'a>( &self, mem: &'a MemCtx, - ) -> impl Iterator + 'a { + ) -> impl Iterator< + Item = Result, + > + 'a { self.data(mem).flat_map(|region| { - let mut ranges = Vec::new(); - if let Some(mapping) = mem.readable_region(®ion) { - ranges.resize_with( - mapping.len() - / size_of::(), - Default::default, - ); - if mapping.read_many(&mut ranges).is_err() { - ranges.clear(); - } - }; - - ranges.into_iter() + if let Some(Ok(defs)) = mem + .readable_region(®ion) + .map(|mapping| mapping.read_many_owned()) + { + defs.into_iter().map(Ok).collect::>().into_iter() + } else { + vec![Err("Failed to read LBA range")].into_iter() + } }) } diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs index f4ec661ee..e00158525 100644 --- a/lib/propolis/src/hw/nvme/requests.rs +++ b/lib/propolis/src/hw/nvme/requests.rs @@ -2,12 +2,14 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+use itertools::Itertools; use std::sync::Arc; use std::time::Instant; use super::{cmds::NvmCmd, queue::Permit, PciNvme}; use crate::accessors::MemAccessor; use crate::block::{self, Operation, Request}; +use crate::hw::nvme::cmds::ParseErr; use crate::hw::nvme::{bits, cmds::Completion, queue::SubQueue}; #[usdt::provider(provider = "propolis")] @@ -147,7 +149,7 @@ impl block::DeviceQueue for NvmeBlockQueue { } Ok(NvmCmd::DatasetManagement(cmd)) => { // We only support the "deallocate" (discard/trim) operation of Dataset - // Management. XXX should we return success? + // Management. if !cmd.is_deallocate() { permit.complete( Completion::generic_err(bits::STS_INVAL_FIELD) @@ -161,18 +163,18 @@ impl block::DeviceQueue for NvmeBlockQueue { cid, cmd.nr, )); - let ranges: Vec<_> = cmd + let Ok(ranges): Result, _> = cmd .ranges(&mem) - .map(|r| r.offset_len(params.lba_data_size)) - .collect(); - if ranges.len() != cmd.nr as usize { - // If we couldn't read all the ranges, fail the command + .map_ok(|r| r.offset_len(params.lba_data_size)) + .try_collect() + else { + // If we couldn't read the ranges, fail the command permit.complete( Completion::generic_err(bits::STS_DATA_XFER_ERR) .dnr(), ); continue; - } + }; let req = Request::new_discard(ranges); return Some((req, permit, None)); } @@ -181,6 +183,13 @@ impl block::DeviceQueue for NvmeBlockQueue { let req = Request::new_flush(); return Some((req, permit, None)); } + Err(ParseErr::ReservedFuse) | Err(ParseErr::Reserved) => { + // For commands that fail parsing due to reserved fields being set, + // complete with an invalid field error + let comp = + Completion::generic_err(bits::STS_INVAL_FIELD).dnr(); + permit.complete(comp); + } Ok(NvmCmd::Unknown(_)) | Err(_) => { // For any other unrecognized or malformed command, // just immediately complete it with an error diff --git a/lib/propolis/src/vmm/mem.rs b/lib/propolis/src/vmm/mem.rs index fbce4b3da..3a7a2482a 100644 --- a/lib/propolis/src/vmm/mem.rs +++ 
b/lib/propolis/src/vmm/mem.rs @@ -557,6 +557,28 @@ impl SubMapping<'_> { Ok(unsafe { typed.read_unaligned() }) } + /// Read the entire mapping as an array of `T` objects. + /// The size of the mapping must be aligned to `size_of::()`. + pub fn read_many_owned(&self) -> Result> { + self.check_read_access()?; + if !self.len.is_multiple_of(size_of::()) { + return Err(Error::new( + ErrorKind::InvalidInput, + "Mapping size not aligned to value type", + )); + } + let count = self.len / size_of::(); + let mut vec = Vec::with_capacity(count); + + self.read_many(&mut vec.spare_capacity_mut()[..count])?; + // Safety: read_many() was successful and just initialized the first `count` elements of + // the vector. + unsafe { + vec.set_len(count); + } + Ok(vec) + } + /// Read `values` from the mapping. pub fn read_many( &self, From 4ef3f73344abe723efa70ec6d40b64adcb17b20c Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Wed, 15 Apr 2026 10:18:30 -0700 Subject: [PATCH 3/5] tests --- lib/propolis/src/hw/nvme/bits.rs | 3 +- lib/propolis/src/hw/nvme/cmds.rs | 125 +++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 1 deletion(-) diff --git a/lib/propolis/src/hw/nvme/bits.rs b/lib/propolis/src/hw/nvme/bits.rs index dab6d3757..9cd86889e 100644 --- a/lib/propolis/src/hw/nvme/bits.rs +++ b/lib/propolis/src/hw/nvme/bits.rs @@ -180,7 +180,7 @@ impl CompletionQueueEntry { /// A Dataset Management Range Definition as represented in memory. 
/// /// See NVMe 1.0e Section 6.6 Figure 114: Dataset Management – Range Definition -#[derive(Debug, Default, Copy, Clone, FromBytes)] +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, FromBytes)] #[repr(C, packed(1))] pub struct DatasetManagementRangeDefinition { /// The context attributes specified for each range provides information about how the range @@ -1230,5 +1230,6 @@ mod test { assert_eq!(size_of::(), 4096); assert_eq!(size_of::(), 4); assert_eq!(size_of::(), 4096); + assert_eq!(size_of::(), 16); } } diff --git a/lib/propolis/src/hw/nvme/cmds.rs b/lib/propolis/src/hw/nvme/cmds.rs index 0754811c4..8c51a8e99 100644 --- a/lib/propolis/src/hw/nvme/cmds.rs +++ b/lib/propolis/src/hw/nvme/cmds.rs @@ -1206,6 +1206,8 @@ impl From for Completion { mod test { use crate::accessors::MemAccessor; use crate::common::*; + use crate::hw::nvme::bits::DatasetManagementRangeDefinition; + use crate::hw::nvme::cmds::DatasetManagementCmd; use crate::vmm::mem::PhysMap; use super::PrpIter; @@ -1410,4 +1412,127 @@ mod test { } assert_eq!(iter.next(), None); } + + static RANGES: [DatasetManagementRangeDefinition; 3] = [ + DatasetManagementRangeDefinition { + context_attributes: 0, + starting_lba: 0x1000, + number_logical_blocks: 0x10, + }, + DatasetManagementRangeDefinition { + context_attributes: 0, + starting_lba: 0x2000, + number_logical_blocks: 0x20, + }, + DatasetManagementRangeDefinition { + context_attributes: 0, + starting_lba: 0x3000, + number_logical_blocks: 0x30, + }, + ]; + + #[test] + fn test_dsmgmt_ranges() { + let (_pmap, acc_mem) = setup(); + let memctx = acc_mem.access().unwrap(); + + let listaddr = 0x80000u64; + memctx.write_many(GuestAddr(listaddr), &RANGES); + + let cmd = DatasetManagementCmd { + prp1: listaddr, + prp2: 0, + nr: RANGES.len() as u16, + ad: true, + _idw: false, + _idr: false, + }; + + let mut iter = cmd.ranges(&memctx); + for expected in &RANGES { + assert_eq!( + iter.next(), + Some(Ok(*expected)), + "bad range definition" + ); + } + 
assert_eq!(iter.next(), None); + } + + #[test] + fn test_dsmgmt_ranges_dual() { + let (_pmap, acc_mem) = setup(); + let memctx = acc_mem.access().unwrap(); + + let listaddr1 = 0x80FF0u64; + let listaddr2 = 0x90000u64; + memctx.write_many(GuestAddr(listaddr1), &RANGES[0..1]); + memctx.write_many(GuestAddr(listaddr2), &RANGES[1..]); + + let cmd = DatasetManagementCmd { + prp1: listaddr1, + prp2: listaddr2, + nr: RANGES.len() as u16, + ad: true, + _idw: false, + _idr: false, + }; + + let mut iter = cmd.ranges(&memctx); + for expected in &RANGES { + assert_eq!( + iter.next(), + Some(Ok(*expected)), + "bad range definition" + ); + } + assert_eq!(iter.next(), None); + } + + #[test] + fn test_dsmgmt_ranges_bad_dual() { + let (_pmap, acc_mem) = setup(); + let memctx = acc_mem.access().unwrap(); + + let listaddr1 = 0x80FF8u64; + let listaddr2 = 0x90000u64; + memctx.write_many(GuestAddr(listaddr1), &RANGES[0..1]); + memctx.write_many(GuestAddr(listaddr2), &RANGES[1..]); + + let cmd = DatasetManagementCmd { + prp1: listaddr1, + prp2: listaddr2, + nr: RANGES.len() as u16, + ad: true, + _idw: false, + _idr: false, + }; + + let mut iter = cmd.ranges(&memctx); + match iter.next() { + Some(Err(_)) => {} + other => panic!("expected alignment error, got {other:?}"), + } + } + + #[test] + fn test_dsmgmt_ranges_bad_address() { + let (_pmap, acc_mem) = setup(); + let memctx = acc_mem.access().unwrap(); + + let cmd = DatasetManagementCmd { + prp1: VM_SIZE as u64, // out of bounds + prp2: 0, + nr: 1, + ad: true, + _idw: false, + _idr: false, + }; + + let mut iter = cmd.ranges(&memctx); + match iter.next() { + Some(Err(_)) => {} + other => panic!("expected alignment error, got {other:?}"), + } + } } From 190c095be956fed399888890554518e119880198 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Wed, 15 Apr 2026 21:33:30 -0700 Subject: [PATCH 4/5] overflow check --- .../src/lib/stats/virtual_disk.rs | 4 ++-- lib/propolis/src/block/file.rs | 4 ++++ lib/propolis/src/hw/nvme/bits.rs | 22 
++++++++++++++----- lib/propolis/src/hw/nvme/cmds.rs | 1 + lib/propolis/src/hw/nvme/requests.rs | 19 ++++++++++++---- 5 files changed, 39 insertions(+), 11 deletions(-) diff --git a/bin/propolis-server/src/lib/stats/virtual_disk.rs b/bin/propolis-server/src/lib/stats/virtual_disk.rs index 8ee0a4c85..bba317e0a 100644 --- a/bin/propolis-server/src/lib/stats/virtual_disk.rs +++ b/bin/propolis-server/src/lib/stats/virtual_disk.rs @@ -78,8 +78,8 @@ impl VirtualDiskStats { } Operation::Flush => self.on_flush_completion(result, duration), Operation::Discard => { - // Discard is only partially wired up in backends we care about, and it's not - // clear what stats (if any) to report yet. + // Discard is now wired up for local disks. We need to add support for it to the + // schema in Omicron before we can report stats for it. For now, just ignore it. } } } diff --git a/lib/propolis/src/block/file.rs b/lib/propolis/src/block/file.rs index 5fac93ba0..5c0c2bc25 100644 --- a/lib/propolis/src/block/file.rs +++ b/lib/propolis/src/block/file.rs @@ -116,6 +116,10 @@ impl SharedState { block::Operation::Discard => { if let Some(mech) = self.discard_mech { for &(off, len) in &req.ranges { + // There might be some performance benefits to combining the ranges into + // one DKIOCFREE call, but ZFS will only issue one range to the + // underlying disk at a time, so we expect the benefit to be minimal in + // practice. 
dkioc::do_discard( &self.fp, mech, off as u64, len as u64, ) diff --git a/lib/propolis/src/hw/nvme/bits.rs b/lib/propolis/src/hw/nvme/bits.rs index 9cd86889e..75ce638b5 100644 --- a/lib/propolis/src/hw/nvme/bits.rs +++ b/lib/propolis/src/hw/nvme/bits.rs @@ -201,11 +201,23 @@ impl DatasetManagementRangeDefinition { Self { context_attributes, number_logical_blocks, starting_lba } } - pub fn offset_len(&self, lba_data_size: u64) -> (ByteOffset, ByteLen) { - ( - (self.starting_lba * lba_data_size) as ByteOffset, - (self.number_logical_blocks as u64 * lba_data_size) as ByteLen, - ) + pub fn offset_len( + &self, + lba_data_size: u64, + ) -> Result<(ByteOffset, ByteLen), &'static str> { + // Check for overflow in the byte offset calculation + let byte_offset = self.starting_lba.checked_mul(lba_data_size).ok_or( + "Starting LBA and LBA data size multiplication overflowed", + )?; + // Check for overflow in the byte length calculation + let byte_len = (u64::from(self.number_logical_blocks)) + .checked_mul(lba_data_size) + .ok_or("Number of logical blocks and LBA data size multiplication overflowed")?; + // Check for overflow of offset + length + byte_offset + .checked_add(byte_len) + .ok_or("Byte offset and byte length addition overflowed")?; + Ok((byte_offset as ByteOffset, byte_len as ByteLen)) } } diff --git a/lib/propolis/src/hw/nvme/cmds.rs b/lib/propolis/src/hw/nvme/cmds.rs index 8c51a8e99..0120ac00b 100644 --- a/lib/propolis/src/hw/nvme/cmds.rs +++ b/lib/propolis/src/hw/nvme/cmds.rs @@ -858,6 +858,7 @@ impl DatasetManagementCmd { /// Returns an Iterator that yields [`GuestRegion`]'s which contain the array of LBA ranges. 
pub fn data<'a>(&self, mem: &'a MemCtx) -> PrpIter<'a> { PrpIter::new( + // given that self.nr is at most 256, the multiplication here cannot overflow a u64 u64::from(self.nr) * size_of::<DatasetManagementRangeDefinition>() as u64, self.prp1, diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs index e00158525..8427a2618 100644 --- a/lib/propolis/src/hw/nvme/requests.rs +++ b/lib/propolis/src/hw/nvme/requests.rs @@ -163,10 +163,8 @@ impl block::DeviceQueue for NvmeBlockQueue { cid, cmd.nr, )); - let Ok(ranges): Result<Vec<_>, _> = cmd - .ranges(&mem) - .map_ok(|r| r.offset_len(params.lba_data_size)) - .try_collect() + let Ok(ranges): Result<Vec<_>, _> = + cmd.ranges(&mem).try_collect() else { // If we couldn't read the ranges, fail the command permit.complete( @@ -175,6 +173,19 @@ impl block::DeviceQueue for NvmeBlockQueue { ); continue; }; + let Ok(ranges) = ranges + .into_iter() + .map(|r| r.offset_len(params.lba_data_size)) + .try_collect() + else { + // If the ranges were invalid (e.g. arithmetic overflow), fail the command + permit.complete( + Completion::generic_err(bits::STS_INVAL_FIELD) + .dnr(), + ); + continue; + }; + let req = Request::new_discard(ranges); return Some((req, permit, None)); } From efd72e04fc9e3a64d8f10c7a916f7f0c4a537049 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 20 Apr 2026 11:33:30 -0700 Subject: [PATCH 5/5] handle old ZFS --- bin/propolis-server/src/lib/initializer.rs | 1 + bin/propolis-standalone/src/config.rs | 3 ++- lib/propolis/src/block/file.rs | 25 +++++++++++++++++----- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index 89658c840..f64a826cd 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -651,6 +651,7 @@ impl MachineInitializer<'_> { ..Default::default() }, nworkers, + self.log.clone(), ) .with_context(|| { format!( diff --git
a/bin/propolis-standalone/src/config.rs b/bin/propolis-standalone/src/config.rs index 3cf1157c0..1269c74e1 100644 --- a/bin/propolis-standalone/src/config.rs +++ b/bin/propolis-standalone/src/config.rs @@ -248,7 +248,8 @@ pub fn block_backend( } None => NonZeroUsize::new(DEFAULT_WORKER_COUNT).unwrap(), }; - block::FileBackend::create(&parsed.path, opts, workers).unwrap() + block::FileBackend::create(&parsed.path, opts, workers, log.clone()) + .unwrap() } "crucible" => create_crucible_backend(be, opts, log), "crucible-mem" => create_crucible_mem_backend(be, opts, log), diff --git a/lib/propolis/src/block/file.rs b/lib/propolis/src/block/file.rs index 5c0c2bc25..03d623361 100644 --- a/lib/propolis/src/block/file.rs +++ b/lib/propolis/src/block/file.rs @@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex}; use crate::block::{self, SyncWorkerCtx, WorkerId}; use crate::tasks::ThreadGroup; use crate::vmm::{MappingExt, MemCtx}; +use slog::warn; use anyhow::Context; @@ -31,6 +32,7 @@ struct SharedState { info: block::DeviceInfo, skip_flush: bool, + log: slog::Logger, } struct WceState { initial: bool, @@ -43,6 +45,7 @@ impl SharedState { skip_flush: bool, wce_state: Option<WceState>, discard_mech: Option<dkioc::DiscardMech>, + log: slog::Logger, ) -> Arc<Self> { let state = SharedState { fp, @@ -50,6 +53,7 @@ impl SharedState { discard_mech, skip_flush, info, + log, }; // Attempt to enable write caching if underlying resource supports it @@ -120,12 +124,21 @@ impl SharedState { // one DKIOCFREE call, but ZFS will only issue one range to the // underlying disk at a time, so we expect the benefit to be minimal in // practice. - dkioc::do_discard( + if let Err(e) = dkioc::do_discard( &self.fp, mech, off as u64, len as u64, - ) - .map_err(|_| { - "io error while attempting to free block(s)" - })?; + ) { + if e.kind() == ErrorKind::Unsupported { + // If the discard mechanism is unsupported, we should not have + // advertised support for discard in the first place.
However, if + // this happens, it likely means we're running on older ZFS bits that + // don't support DKIOCFREE on raw zvols. Since this is not a supported + // configuration, but developer machines might be in this state, we + // swallow errors from the ioctl rather than failing the command. + warn!(self.log, "discard at offset {off} length {len} is unsupported; check ZFS version"); + } else { + return Err("io error while attempting to free block(s)"); + } + } } } else { unreachable!("handled above in processing_loop()"); @@ -167,6 +180,7 @@ impl FileBackend { path: impl AsRef<Path>, opts: block::BackendOpts, worker_count: NonZeroUsize, + log: slog::Logger, ) -> Result<Arc<Self>> { let p: &Path = path.as_ref(); @@ -210,6 +224,7 @@ impl FileBackend { skip_flush, wce_state, disk_info.discard_mech, + log, ), block_attach, worker_count,