From f4bf878524ea1a98a2669f38aae1bd9f0c4c8247 Mon Sep 17 00:00:00 2001
From: Wei Lu <wei@siftstack.com>
Date: Mon, 18 May 2026 18:11:13 -0700
Subject: [PATCH 01/15] rust(feat): ping sub-command (#572)

---
 rust/crates/sift_cli/src/cli/mod.rs  |  3 +++
 rust/crates/sift_cli/src/cmd/mod.rs  |  1 +
 rust/crates/sift_cli/src/cmd/ping.rs | 24 ++++++++++++++++++++++++
 rust/crates/sift_cli/src/main.rs     |  1 +
 4 files changed, 29 insertions(+)
 create mode 100644 rust/crates/sift_cli/src/cmd/ping.rs
diff --git a/rust/crates/sift_cli/src/cli/mod.rs b/rust/crates/sift_cli/src/cli/mod.rs
index 2fa3bd42e..ef00512e4 100644
--- a/rust/crates/sift_cli/src/cli/mod.rs
+++ b/rust/crates/sift_cli/src/cli/mod.rs
@@ -52,6 +52,9 @@ pub enum Cmd {
     /// Import time series files into Sift
     #[command(subcommand)]
     Import(ImportCmd),
+
+    /// Ping the Sift API to verify credentials and connectivity
+    Ping,
 }
 
 #[derive(Subcommand)]
diff --git a/rust/crates/sift_cli/src/cmd/mod.rs b/rust/crates/sift_cli/src/cmd/mod.rs
index 4ee74da15..fc2a2ba24 100644
--- a/rust/crates/sift_cli/src/cmd/mod.rs
+++ b/rust/crates/sift_cli/src/cmd/mod.rs
@@ -8,6 +8,7 @@ pub mod completions;
 pub mod config;
 pub mod export;
 pub mod import;
+pub mod ping;
 
 pub struct Context {
     pub grpc_uri: String,
diff --git a/rust/crates/sift_cli/src/cmd/ping.rs b/rust/crates/sift_cli/src/cmd/ping.rs
new file mode 100644
index 000000000..3ce5ff462
--- /dev/null
+++ b/rust/crates/sift_cli/src/cmd/ping.rs
@@ -0,0 +1,24 @@
+use std::process::ExitCode;
+
+use anyhow::{Context as AnyhowContext, Result};
+use sift_rs::ping::v1::{PingRequest, ping_service_client::PingServiceClient};
+
+use crate::util::{api::create_grpc_channel, tty::Output};
+
+use super::Context;
+
+pub async fn run(ctx: Context) -> Result<ExitCode> {
+    let grpc_channel = create_grpc_channel(&ctx)?;
+    let mut ping_service = PingServiceClient::new(grpc_channel);
+
+    let response = ping_service
+        .ping(PingRequest::default())
+        .await
+        .context("failed to ping Sift")?
+        .into_inner()
+        .response;
+
+    Output::new().line(response).print();
+
+    Ok(ExitCode::SUCCESS)
+}
diff --git a/rust/crates/sift_cli/src/main.rs b/rust/crates/sift_cli/src/main.rs
index 8ad9e6f88..037ce388b 100644
--- a/rust/crates/sift_cli/src/main.rs
+++ b/rust/crates/sift_cli/src/main.rs
@@ -90,6 +90,7 @@ fn run(clargs: cli::Args) -> Result<ExitCode> {
             cli::ExportCmd::Run(args) => run_future(cmd::export::run(ctx, args)),
             cli::ExportCmd::Asset(args) => run_future(cmd::export::asset(ctx, args)),
         },
+        Cmd::Ping => run_future(cmd::ping::run(ctx)),
         _ => Ok(ExitCode::SUCCESS),
     }
 }

From ab47756b53515578a0b93e21a0d54924cdf997d5 Mon Sep 17 00:00:00 2001
From: Brandon Shippy <84881014+Brandon-Shippy@users.noreply.github.com>
Date: Tue, 19 May 2026 10:38:07 -0700
Subject: [PATCH 02/15] rust(feat): Added cli args for parquet export (#573)

Co-authored-by: Brandon Shippy <brandon.shippy@siftstack.com>
---
 rust/crates/sift_cli/src/cli/export.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rust/crates/sift_cli/src/cli/export.rs b/rust/crates/sift_cli/src/cli/export.rs
index f82cd495a..eb589f3c2 100644
--- a/rust/crates/sift_cli/src/cli/export.rs
+++ b/rust/crates/sift_cli/src/cli/export.rs
@@ -5,6 +5,7 @@ use sift_rs::exports::v1::ExportOutputFormat;
 pub enum Format {
     Csv,
     Sun,
+    Parquet,
 }
 
 impl From<Format> for ExportOutputFormat {
@@ -12,6 +13,7 @@ impl From<Format> for ExportOutputFormat {
         match val {
             Format::Csv => ExportOutputFormat::Csv,
             Format::Sun => ExportOutputFormat::Sun,
+            Format::Parquet => ExportOutputFormat::Parquet,
         }
     }
 }

From 5406f229e23e4d0db5216fdc8bf2873a74378ea3 Mon Sep 17 00:00:00 2001
From: ian-sift <ian@siftstack.com>
Date: Tue, 19 May 2026 13:59:14 -0700
Subject: [PATCH 03/15] =?UTF-8?q?bugfix(python):=20Use=20a=20cross=20platf?=
 =?UTF-8?q?orm=20file=20locking=20package=20instead=20of=20=E2=80=A6=20(#5?=
 =?UTF-8?q?74)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/python_ci.yaml              |  7 ++
 .../low_level_wrappers/_test_results_log.py   | 73 ++++++++++---------
 python/pyproject.toml                         |  1 +
 3 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/python_ci.yaml b/.github/workflows/python_ci.yaml
index 6f4f533db..1ec71fa55 100644
--- a/.github/workflows/python_ci.yaml
+++ b/.github/workflows/python_ci.yaml
@@ -72,6 +72,13 @@ jobs:
         run: |
           mypy lib
 
+      # Re-run mypy with --platform=win32 so typeshed evaluates platform-gated
+      # stubs (e.g. fcntl) as if we were on Windows. Catches imports that
+      # would only fail at runtime on Windows.
+      - name: MyPy (Windows platform)
+        run: |
+          mypy --platform=win32 lib
+
       - name: Pyright
         run: |
           pyright lib
diff --git a/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py b/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py
index 4f41d7989..24e0534d7 100644
--- a/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py
+++ b/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py
@@ -1,11 +1,15 @@
 """Internal log-format primitives for test-result simulation logs.
 
-Two files per run:
+Three files per run:
 
 * **Log file** (e.g. ``foo.jsonl``) - append-only record of each logged API call,
   one line per call. Written by :func:`log_request_to_file` in the test process
   and read by :func:`iter_log_data_lines` / the replay subprocess. Has no header:
   every line is a data line.
+* **Lock sidecar** (``foo.jsonl.lock``) - empty file used by :class:`filelock.FileLock`
+  to coordinate appends and snapshot reads across processes. Created on demand;
+  ``filelock`` unlinks it on release on Unix, and on Windows it may linger but is
+  harmless to leave or delete after the run.
 * **Tracking sidecar** (``foo.jsonl.tracking``) - small JSON file holding the
   incremental replay cursor (``lastUploadedLine``) and the simulated-to-real ID
   map. Written only by the replay subprocess via :meth:`LogTracking.save` using
@@ -15,26 +19,26 @@
 
 # Concurrency
 
-With tracking moved out of the main log, the log file becomes strictly
-append-only and has exactly one in-place mutator (the writer) and one scanner
-(the replay subprocess). POSIX guarantees that an ``O_APPEND`` write atomically
-bumps the EOF, so parallel writers can't lose data. To keep a concurrent reader
-from observing a mid-append partial final line we still take ``LOCK_EX`` on the
-writer's single append and ``LOCK_SH`` on the reader's ``readlines()``; there
-is never any exclusive-vs-exclusive contention because nothing rewrites the
-file any more.
-
-The sidecar has a single writer (the replay subprocess) and no live reader, so
-it needs no locking. Atomic rename is still used to keep the on-disk contents
-valid across crashes.
-
-``flock`` is advisory, so this contract only holds for processes that use these
-helpers; ad-hoc writers are not protected.
+With tracking moved out of the main log, the log file is strictly append-only
+and has exactly one in-place mutator (the writer) and one scanner (the replay
+subprocess). Both serialize through a cross-platform exclusive ``FileLock`` on
+the lock sidecar: the writer holds it across a single append, the reader holds
+it across the snapshot ``readlines()``. That keeps a concurrent reader from
+observing a mid-append partial final line on any OS, including Windows where
+POSIX advisory ``flock`` is unavailable. The exclusive-only lock means a hot
+reader briefly blocks the writer (and vice versa), which is acceptable because
+writes are tiny and we only have one reader.
+
+The tracking sidecar has a single writer (the replay subprocess) and no live
+reader, so it needs no locking. Atomic rename is still used to keep the on-disk
+contents valid across crashes.
+
+The lock is advisory: it only protects callers that go through these helpers.
+Ad-hoc writers to the same path are not protected.
 """
 
 from __future__ import annotations
 
-import fcntl
 import json
 import os
 import re
@@ -42,6 +46,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator
 
+from filelock import FileLock
 from google.protobuf import json_format
 
 if TYPE_CHECKING:
@@ -153,9 +158,10 @@ def log_request_to_file(
 ) -> None:
     """Append a request as a JSON-encoded line to ``log_file``.
 
-    Takes ``LOCK_EX`` across the append so a concurrent reader holding
-    ``LOCK_SH`` in :func:`iter_log_data_lines` can't see a mid-write partial
-    final line. See the module docstring for the full concurrency model.
+    Holds an exclusive :class:`filelock.FileLock` on the sidecar across the
+    append so a concurrent reader in :func:`iter_log_data_lines` can't see a
+    mid-write partial final line. See the module docstring for the full
+    concurrency model.
 
     Args:
         log_file: Path to the log file.
@@ -171,15 +177,15 @@ def log_request_to_file(
     request_dict = json_format.MessageToDict(request)
     request_json = json.dumps(request_dict, separators=(",", ":"))
     line = f"[{tag}] {request_json}\n"
-    with open(log_path, "a") as f:
-        fcntl.flock(f, fcntl.LOCK_EX)
-        # Closing the file flushes and releases the flock atomically; no
-        # explicit unlock needed here.
-        f.write(line)
+    with FileLock(str(log_path.with_name(log_path.name + ".lock"))):
+        with open(log_path, "a") as f:
+            # The inner ``with`` flushes and closes the file before the
+            # FileLock is released, so no reader sees a partial line.
+            f.write(line)
 
 
 def iter_log_data_lines(
-    log_path: Path,
+    log_path: str | Path,
     start_line: int = 0,
 ) -> Generator[tuple[str, str | None, str], None, None]:
     """Parse data lines from a log file.
@@ -191,15 +197,16 @@ def iter_log_data_lines(
     iterator skips the first ``start_line`` lines and yields the rest. Pass 0
     to read all data lines.
 
-    Acquires ``LOCK_SH`` only while snapshotting the file into memory, then
-    releases before yielding. Lines appended by a concurrent
-    :func:`log_request_to_file` after the snapshot are not visible this call --
-    they will be picked up on the next invocation.
+    Holds the sidecar :class:`filelock.FileLock` only while snapshotting the
+    file into memory, then releases before yielding. Lines appended by a
+    concurrent :func:`log_request_to_file` after the snapshot are not visible
+    this call -- they will be picked up on the next invocation.
     """
     line_pattern = re.compile(r"^\[(\w+)(?::([^\]]+))?\]\s*(.+)$")
-    with open(log_path) as f:
-        fcntl.flock(f, fcntl.LOCK_SH)
-        raw_lines = f.readlines()
+    log_path = Path(log_path)
+    with FileLock(str(log_path.with_name(log_path.name + ".lock"))):
+        with open(log_path) as f:
+            raw_lines = f.readlines()
 
     data_line_count = 0
     for raw_line in raw_lines:
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 0fc34e914..b0b1d3cc2 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -44,6 +44,7 @@ dependencies = [
     "types-requests~=2.25",
     "googleapis-common-protos>=1.60",
     "protoc-gen-openapiv2>=0.0.1",
+    "filelock~=3.13",
 ]
 
 [project.urls]

From 1d5e2df7c4f205070b3e89e699cc6d4b2dc2edec Mon Sep 17 00:00:00 2001
From: Wei Lu <wei@siftstack.com>
Date: Tue, 19 May 2026 14:31:17 -0700
Subject: [PATCH 04/15] python(chore): v0.16.1 prep (#577)

---
 python/CHANGELOG.md   | 5 +++++
 python/pyproject.toml | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md
index 246bf7416..e60084db8 100644
--- a/python/CHANGELOG.md
+++ b/python/CHANGELOG.md
@@ -3,6 +3,11 @@ All notable changes to this project will be documented in this file.
 
 This project adheres to [Semantic Versioning](http://semver.org/).
 
+## [v0.16.1] - May 19, 2026
+
+### Bugfixes
+- Replace the POSIX-only `fcntl.flock` call in the test-results log with a cross-platform `filelock.FileLock` so importing `sift_client` no longer fails on Windows. ([#574](https://github.com/sift-stack/sift/pull/574))
+
 ## [v0.16.0] - May 14, 2026
 
 ### What's New
diff --git a/python/pyproject.toml b/python/pyproject.toml
index b0b1d3cc2..629ef6174 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sift_stack_py"
-version = "0.16.0"
+version = "0.16.1"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }

From d1e6e70d131506fbb042f986abe79efdfe88acda Mon Sep 17 00:00:00 2001
From: Wei Lu <wei@siftstack.com>
Date: Wed, 20 May 2026 10:09:24 -0700
Subject: [PATCH 05/15] rust(fix): use job ID from upload response for
 deterministic lookup  (#576)

---
 rust/crates/sift_cli/Cargo.toml               |  1 +
 rust/crates/sift_cli/src/cmd/import/csv.rs    | 35 ++++---------------
 .../sift_cli/src/cmd/import/hdf5/import.rs    |  4 +--
 rust/crates/sift_cli/src/cmd/import/mod.rs    | 15 +++-----
 .../src/cmd/import/parquet/flat_dataset.rs    | 31 +++-------------
 .../src/cmd/import/tdms/detect_tdms_config.rs |  4 +--
 rust/crates/sift_cli/src/cmd/import/utils.rs  | 22 ++++++++++--
 rust/crates/sift_cli/src/util/job.rs          | 23 +-----------
 rust/crates/sift_cli/src/util/mod.rs          |  1 -
 rust/crates/sift_cli/src/util/user.rs         | 15 --------
 10 files changed, 39 insertions(+), 112 deletions(-)
 delete mode 100644 rust/crates/sift_cli/src/util/user.rs

diff --git a/rust/crates/sift_cli/Cargo.toml b/rust/crates/sift_cli/Cargo.toml
index fc04b8919..d3bcb2756 100644
--- a/rust/crates/sift_cli/Cargo.toml
+++ b/rust/crates/sift_cli/Cargo.toml
@@ -30,6 +30,7 @@ indicatif = { workspace = true }
 parquet = { workspace = true }
 pbjson-types = { workspace = true }
 reqwest = { workspace = true }
+serde_json = { workspace = true }
 sift_pbfs = { workspace = true }
 tdms = { workspace = true }
 hdf5 = { workspace = true }
diff --git a/rust/crates/sift_cli/src/cmd/import/csv.rs b/rust/crates/sift_cli/src/cmd/import/csv.rs
index 627556efd..eade97445 100644
--- a/rust/crates/sift_cli/src/cmd/import/csv.rs
+++ b/rust/crates/sift_cli/src/cmd/import/csv.rs
@@ -9,7 +9,6 @@ use anyhow::{Context as AnyhowContext, Result, anyhow};
 use chrono::DateTime;
 use crossterm::style::Stylize;
 use pbjson_types::Timestamp;
-use reqwest::header::{CONTENT_ENCODING, CONTENT_TYPE};
 use sift_rs::{
     common::r#type::v1::{ChannelConfig, ChannelDataType},
     data_imports::v2::{
@@ -25,15 +24,12 @@ use crate::{
         Context,
         import::utils::{try_parse_bit_field_config, try_parse_enum_config},
     },
-    util::{
-        api::{create_grpc_channel, create_rest_client},
-        tty::Output,
-    },
+    util::{api::create_grpc_channel, tty::Output},
 };
 
 use super::{
     preview_import_config,
-    utils::{gzip_file, validate_time_format},
+    utils::{upload_gzipped_file, validate_time_format},
     wait_for_job_completion,
 };
 
@@ -75,29 +71,10 @@ pub async fn run(ctx: Context, args: ImportCsvArgs) -> Result<ExitCode> {
         .into_inner();
 
     csv_file.rewind()?;
-    let compressed_data = gzip_file(csv_file)?;
-
-    let rest_client = create_rest_client(&ctx)?;
-    let res = rest_client
-        .post(upload_url)
-        .header(CONTENT_ENCODING, "gzip")
-        .header(CONTENT_TYPE, "text/csv")
-        .body(compressed_data)
-        .send()
+    let job_id = upload_gzipped_file(&ctx, &upload_url, csv_file, "text/csv")
         .await
         .context("failed to upload CSV file")?;
 
-    if !res.status().is_success() {
-        let status = res.status();
-        let text = res
-            .text()
-            .await
-            .unwrap_or_else(|_| "<failed to read body>".into());
-        return Err(anyhow!(
-            "failed to upload CSV with http status {status}: {text}"
-        ));
-    }
-
     let location = args.run.as_ref().map_or_else(
         || format!("asset '{}'", args.asset.cyan()),
         |r| format!("run '{}'", r.clone().cyan()),
@@ -113,7 +90,7 @@ pub async fn run(ctx: Context, args: ImportCsvArgs) -> Result<ExitCode> {
 
         return Ok(ExitCode::SUCCESS);
     }
-    wait_for_job_completion(grpc_channel, location).await
+    wait_for_job_completion(grpc_channel, job_id, location).await
 }
 
 fn create_data_import_request<R: io::Read>(
@@ -300,14 +277,14 @@ fn create_data_import_request<R: io::Read>(
                     let mut enum_configs = Vec::new();
                     let mut bit_field_configs = Vec::new();
 
-                    if data_type == ChannelDataType::Enum.into() {
+                    if data_type == i32::from(ChannelDataType::Enum) {
                         let Some(configs) = enum_configs_iter.next() else {
                             return Err(anyhow!(
                                 "'{name}' was declared as type enum but --enum-config was not specified"
                             ));
                         };
                         enum_configs = configs;
-                    } else if data_type == ChannelDataType::BitField.into() {
+                    } else if data_type == i32::from(ChannelDataType::BitField) {
                         let Some(configs) = bit_field_configs_iter.next() else {
                             return Err(anyhow!(
                                 "'{name}' was declared as type bit-field but --bit-field-config was not specified"
diff --git a/rust/crates/sift_cli/src/cmd/import/hdf5/import.rs b/rust/crates/sift_cli/src/cmd/import/hdf5/import.rs
index c6e3674ab..ba1f91f93 100644
--- a/rust/crates/sift_cli/src/cmd/import/hdf5/import.rs
+++ b/rust/crates/sift_cli/src/cmd/import/hdf5/import.rs
@@ -79,7 +79,7 @@ pub async fn run(ctx: Context, args: ImportHdf5Args) -> Result<ExitCode> {
         .context("error creating data import for hdf5")?
         .into_inner();
 
-    upload_gzipped_file(&ctx, &upload_url, file, "application/x-hdf5")
+    let job_id = upload_gzipped_file(&ctx, &upload_url, file, "application/x-hdf5")
         .await
         .context("failed to upload hdf5 file")?;
 
@@ -99,7 +99,7 @@ pub async fn run(ctx: Context, args: ImportHdf5Args) -> Result<ExitCode> {
         return Ok(ExitCode::SUCCESS);
     }
 
-    wait_for_job_completion(grpc_channel, location).await
+    wait_for_job_completion(grpc_channel, job_id, location).await
 }
 
 pub fn build_hdf5_config(args: &ImportHdf5Args) -> Result<Hdf5Config> {
diff --git a/rust/crates/sift_cli/src/cmd/import/mod.rs b/rust/crates/sift_cli/src/cmd/import/mod.rs
index 40e5c3b2e..c223242a0 100644
--- a/rust/crates/sift_cli/src/cmd/import/mod.rs
+++ b/rust/crates/sift_cli/src/cmd/import/mod.rs
@@ -3,13 +3,9 @@ use crossterm::style::Stylize;
 use std::{process::ExitCode, time::Duration};
 use tokio::time::sleep;
 
-use sift_rs::{
-    SiftChannel,
-    common::r#type::v1::ChannelConfig,
-    jobs::v1::{JobStatus, JobType},
-};
+use sift_rs::{SiftChannel, common::r#type::v1::ChannelConfig, jobs::v1::JobStatus};
 
-use crate::util::{job::JobServiceWrapper, progress::Spinner, tty::Output, user::get_user_id};
+use crate::util::{job::JobServiceWrapper, progress::Spinner, tty::Output};
 
 pub mod backup;
 pub mod csv;
@@ -25,18 +21,15 @@ const INDENT_4: &str = "        ";
 
 pub async fn wait_for_job_completion(
     grpc_channel: SiftChannel,
+    job_id: String,
     import_output_location: String,
 ) -> Result<ExitCode> {
     let spinner = Spinner::new();
     spinner.set_message(format!("{} file for processing", "Uploaded".green()));
 
-    let user_id = get_user_id(grpc_channel.clone()).await?;
     let mut job_service = JobServiceWrapper::new(grpc_channel.clone());
 
-    let Some(mut job) = job_service
-        .get_latest_job_for_user(&user_id, JobType::DataImport)
-        .await?
-    else {
+    let Some(mut job) = job_service.get_job(&job_id).await? else {
         spinner.finish_and_clear();
 
         Output::new()
diff --git a/rust/crates/sift_cli/src/cmd/import/parquet/flat_dataset.rs b/rust/crates/sift_cli/src/cmd/import/parquet/flat_dataset.rs
index 72f3bca8f..80aba690b 100644
--- a/rust/crates/sift_cli/src/cmd/import/parquet/flat_dataset.rs
+++ b/rust/crates/sift_cli/src/cmd/import/parquet/flat_dataset.rs
@@ -2,7 +2,6 @@ use std::{collections::HashMap, fs::File, io::Seek, process::ExitCode};
 
 use anyhow::{Context as AnyhowContext, Result, anyhow};
 use crossterm::style::Stylize;
-use reqwest::header::{CONTENT_ENCODING, CONTENT_TYPE};
 use sift_rs::{
     common::r#type::v1::{ChannelConfig, ChannelDataType},
     data_imports::v2::{
@@ -20,14 +19,11 @@ use crate::{
         import::{
             parquet::FooterMetadata,
             preview_import_config,
-            utils::{gzip_file, try_parse_bit_field_config, try_parse_enum_config},
+            utils::{try_parse_bit_field_config, try_parse_enum_config, upload_gzipped_file},
             wait_for_job_completion,
         },
     },
-    util::{
-        api::{create_grpc_channel, create_rest_client},
-        tty::Output,
-    },
+    util::{api::create_grpc_channel, tty::Output},
 };
 
 pub async fn run(ctx: Context, args: FlatDatasetArgs) -> Result<ExitCode> {
@@ -78,29 +74,10 @@ pub async fn run(ctx: Context, args: FlatDatasetArgs) -> Result<ExitCode> {
         .into_inner();
 
     file.rewind()?;
-    let compressed_data = gzip_file(file)?;
-
-    let rest_client = create_rest_client(&ctx)?;
-    let res = rest_client
-        .post(upload_url)
-        .header(CONTENT_ENCODING, "gzip")
-        .header(CONTENT_TYPE, "application/vnd.apache.parquet")
-        .body(compressed_data)
-        .send()
+    let job_id = upload_gzipped_file(&ctx, &upload_url, file, "application/vnd.apache.parquet")
         .await
         .context("failed to upload Parquet file")?;
 
-    if !res.status().is_success() {
-        let status = res.status();
-        let text = res
-            .text()
-            .await
-            .unwrap_or_else(|_| "<failed to read body>".into());
-        return Err(anyhow!(
-            "failed to upload Parquet with http status {status}: {text}"
-        ));
-    }
-
     let location = args.run.as_ref().map_or_else(
         || format!("asset '{}'", args.asset.cyan()),
         |r| format!("run '{}'", r.clone().cyan()),
@@ -116,7 +93,7 @@ pub async fn run(ctx: Context, args: FlatDatasetArgs) -> Result<ExitCode> {
 
         return Ok(ExitCode::SUCCESS);
     }
-    wait_for_job_completion(grpc_channel, location).await
+    wait_for_job_completion(grpc_channel, job_id, location).await
 }
 
 fn update_config_with_overrides(
diff --git a/rust/crates/sift_cli/src/cmd/import/tdms/detect_tdms_config.rs b/rust/crates/sift_cli/src/cmd/import/tdms/detect_tdms_config.rs
index 96cfe52d5..f2fe3cc81 100644
--- a/rust/crates/sift_cli/src/cmd/import/tdms/detect_tdms_config.rs
+++ b/rust/crates/sift_cli/src/cmd/import/tdms/detect_tdms_config.rs
@@ -69,7 +69,7 @@ pub async fn run(ctx: Context, args: ImportTdmsArgs) -> Result<ExitCode> {
         .context("error creating data import for tdms")?
         .into_inner();
 
-    upload_gzipped_file(&ctx, &upload_url, file, "application/octet-stream")
+    let job_id = upload_gzipped_file(&ctx, &upload_url, file, "application/octet-stream")
         .await
         .context("failed to upload tdms file")?;
 
@@ -89,7 +89,7 @@ pub async fn run(ctx: Context, args: ImportTdmsArgs) -> Result<ExitCode> {
         return Ok(ExitCode::SUCCESS);
     }
 
-    wait_for_job_completion(grpc_channel, location).await
+    wait_for_job_completion(grpc_channel, job_id, location).await
 }
 
 pub fn build_tdms_config(args: &ImportTdmsArgs) -> Result<TdmsConfig> {
diff --git a/rust/crates/sift_cli/src/cmd/import/utils.rs b/rust/crates/sift_cli/src/cmd/import/utils.rs
index 8fbc73830..af5f03dea 100644
--- a/rust/crates/sift_cli/src/cmd/import/utils.rs
+++ b/rust/crates/sift_cli/src/cmd/import/utils.rs
@@ -11,13 +11,14 @@ use sift_rs::common::r#type::v1::{ChannelBitFieldElement, ChannelEnumType};
 use crate::{cli::time::TimeFormat, cmd::Context as CmdContext, util::api::create_rest_client};
 
 /// Gzip and upload a file to a pre-signed upload URL with the given content type.
-/// Reads from the file's current cursor position.
+/// Reads from the file's current cursor position. Returns the job ID from the
+/// upload response.
 pub async fn upload_gzipped_file(
     ctx: &CmdContext,
     upload_url: &str,
     file: File,
     content_type: &str,
-) -> Result<()> {
+) -> Result<String> {
     let compressed_data = gzip_file(file)?;
     let rest_client = create_rest_client(ctx).context("failed to create rest client")?;
 
@@ -38,7 +39,22 @@ pub async fn upload_gzipped_file(
             .unwrap_or_else(|_| "<failed to read body>".into());
         return Err(anyhow!("upload failed with http status {status}: {text}"));
     }
-    Ok(())
+    extract_job_id(res).await
+}
+
+/// Parses the `jobId` from a successful upload response.
+async fn extract_job_id(res: reqwest::Response) -> Result<String> {
+    let body_text = res
+        .text()
+        .await
+        .context("failed to read upload response body")?;
+    let body: serde_json::Value =
+        serde_json::from_str(&body_text).context("failed to parse upload response as JSON")?;
+    body.get("jobId")
+        .and_then(|v| v.as_str())
+        .filter(|s| !s.is_empty())
+        .map(String::from)
+        .ok_or_else(|| anyhow!("upload response did not include jobId: {body_text}"))
 }
 
 /// Be sure that the file's cursor is rewinded to the start before hand.
diff --git a/rust/crates/sift_cli/src/util/job.rs b/rust/crates/sift_cli/src/util/job.rs
index e24fccc63..0bb8ff581 100644
--- a/rust/crates/sift_cli/src/util/job.rs
+++ b/rust/crates/sift_cli/src/util/job.rs
@@ -3,7 +3,7 @@ use std::ops::{Deref, DerefMut};
 use anyhow::{Context, Result};
 use sift_rs::{
     SiftChannel,
-    jobs::v1::{Job, JobType, ListJobsRequest, job_service_client::JobServiceClient},
+    jobs::v1::{Job, ListJobsRequest, job_service_client::JobServiceClient},
 };
 
 pub struct JobServiceWrapper(JobServiceClient<SiftChannel>);
@@ -28,27 +28,6 @@ impl JobServiceWrapper {
         JobServiceWrapper(job_service)
     }
 
-    pub async fn get_latest_job_for_user(
-        &mut self,
-        user_id: &str,
-        job_type: JobType,
-    ) -> Result<Option<Job>> {
-        let jt = job_type.as_str_name();
-
-        let res = self
-            .list_jobs(ListJobsRequest {
-                page_size: 1,
-                filter: format!("job_type == '{jt}' && created_by_user_id == '{user_id}'"),
-                order_by: "created_date desc".into(),
-                ..Default::default()
-            })
-            .await
-            .context("failed to retrieve latest user job")?
-            .into_inner();
-
-        Ok(res.jobs.first().cloned())
-    }
-
     pub async fn get_job(&mut self, job_id: &str) -> Result<Option<Job>> {
         let res = self
             .list_jobs(ListJobsRequest {
diff --git a/rust/crates/sift_cli/src/util/mod.rs b/rust/crates/sift_cli/src/util/mod.rs
index e1e3fe2f3..634ba6379 100644
--- a/rust/crates/sift_cli/src/util/mod.rs
+++ b/rust/crates/sift_cli/src/util/mod.rs
@@ -4,4 +4,3 @@ pub mod channel;
 pub mod job;
 pub mod progress;
 pub mod tty;
-pub mod user;
diff --git a/rust/crates/sift_cli/src/util/user.rs b/rust/crates/sift_cli/src/util/user.rs
deleted file mode 100644
index 6331cb7d8..000000000
--- a/rust/crates/sift_cli/src/util/user.rs
+++ /dev/null
@@ -1,15 +0,0 @@
-use anyhow::{Context, Result};
-use sift_rs::{
-    SiftChannel,
-    me::v2::{GetMeRequest, me_service_client::MeServiceClient},
-};
-
-pub async fn get_user_id(grpc_channel: SiftChannel) -> Result<String> {
-    let mut me_service = MeServiceClient::new(grpc_channel);
-    let res = me_service
-        .get_me(GetMeRequest::default())
-        .await
-        .context("failed to retrieve user info from provided --profile")?
-        .into_inner();
-    Ok(res.user_id)
-}

From 83f2cc026e635a3d87b78693a6efbf6b0daf45df Mon Sep 17 00:00:00 2001
From: Benji Nguyen <45523555+solidiquis@users.noreply.github.com>
Date: Wed, 20 May 2026 10:10:21 -0700
Subject: [PATCH 06/15] rust(feature): bootstrap test utils + mcp (#578)

---
 Cargo.toml                                    |  38 +-
 rust/crates/sift_mcp/Cargo.toml               |  27 ++
 rust/crates/sift_mcp/src/error/mod.rs         |  39 ++
 rust/crates/sift_mcp/src/lib.rs               |  31 ++
 rust/crates/sift_mcp/src/server/mod.rs        |  29 ++
 rust/crates/sift_mcp/src/tool/mod.rs          |   1 +
 rust/crates/sift_mcp/src/tool/resource/mod.rs | 142 +++++++
 .../crates/sift_mcp/src/tool/resource/test.rs | 357 ++++++++++++++++++
 rust/crates/sift_test_util/CLAUDE.md          |  92 +++++
 rust/crates/sift_test_util/Cargo.toml         |  25 ++
 rust/crates/sift_test_util/README.md          |   3 +
 rust/crates/sift_test_util/src/grpc/mod.rs    |  39 ++
 rust/crates/sift_test_util/src/grpc/test.rs   |  56 +++
 rust/crates/sift_test_util/src/lib.rs         |   5 +
 .../sift_test_util/src/mock/assets/mod.rs     |   1 +
 .../sift_test_util/src/mock/assets/v1.rs      |  51 +++
 rust/crates/sift_test_util/src/mock/mod.rs    |   6 +
 .../sift_test_util/src/mock/runs/mod.rs       |   1 +
 .../crates/sift_test_util/src/mock/runs/v2.rs |  74 ++++
 rust/crates/sift_test_util/src/mock/test.rs   | 174 +++++++++
 20 files changed, 1175 insertions(+), 16 deletions(-)
 create mode 100644 rust/crates/sift_mcp/Cargo.toml
 create mode 100644 rust/crates/sift_mcp/src/error/mod.rs
 create mode 100644 rust/crates/sift_mcp/src/lib.rs
 create mode 100644 rust/crates/sift_mcp/src/server/mod.rs
 create mode 100644 rust/crates/sift_mcp/src/tool/mod.rs
 create mode 100644 rust/crates/sift_mcp/src/tool/resource/mod.rs
 create mode 100644 rust/crates/sift_mcp/src/tool/resource/test.rs
 create mode 100644 rust/crates/sift_test_util/CLAUDE.md
 create mode 100644 rust/crates/sift_test_util/Cargo.toml
 create mode 100644 rust/crates/sift_test_util/README.md
 create mode 100644 rust/crates/sift_test_util/src/grpc/mod.rs
 create mode 100644 rust/crates/sift_test_util/src/grpc/test.rs
 create mode 100644 rust/crates/sift_test_util/src/lib.rs
 create mode 100644 rust/crates/sift_test_util/src/mock/assets/mod.rs
 create mode 100644 rust/crates/sift_test_util/src/mock/assets/v1.rs
 create mode 100644 rust/crates/sift_test_util/src/mock/mod.rs
 create mode 100644 rust/crates/sift_test_util/src/mock/runs/mod.rs
 create mode 100644 rust/crates/sift_test_util/src/mock/runs/v2.rs
 create mode 100644 rust/crates/sift_test_util/src/mock/test.rs

diff --git a/Cargo.toml b/Cargo.toml
index c8fcda367..c3510a20c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,14 +1,16 @@
 [workspace]
 resolver = "2"
 members = [
-  "rust/crates/sift_error",
-  "rust/crates/sift_rs",
-  "rust/crates/sift_stream",
-  "rust/crates/sift_error",
-  "rust/crates/sift_connect",
-  "rust/crates/sift_stream_bindings",
-  "rust/crates/sift_cli",
-  "rust/crates/sift_pbfs",
+    "rust/crates/sift_error",
+    "rust/crates/sift_rs",
+    "rust/crates/sift_stream",
+    "rust/crates/sift_error",
+    "rust/crates/sift_connect",
+    "rust/crates/sift_stream_bindings",
+    "rust/crates/sift_cli",
+    "rust/crates/sift_pbfs",
+    "rust/crates/sift_mcp",
+    "rust/crates/sift_test_util",
 ]
 
 [workspace.package]
@@ -28,8 +30,9 @@ arrow-array = "58.1.0"
 arrow-schema = "58.1.0"
 async-channel = "2.2"
 async-trait = "^0.1"
+mockall = "0.14.0"
 bytes = "1.11.1"
-bytesize = { version = "2" }
+bytesize = "2"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.6", features = ["cargo", "derive", "wrap_help"] }
 clap_complete = "4.6"
@@ -43,7 +46,7 @@ flate2 = "1.1"
 futures = { version = "0.3", default-features = false, features = ["alloc"] }
 futures-core = "0.3"
 hyper = { version = "1.8", features = ["server", "http1"] }
-hyper-util = { version = "0.1", features = ["service", "server", "tokio"] }
+hyper-util = { version = "0.1.20", features = ["service", "server", "tokio"] }
 indicatif = "0.18"
 indoc = "2.0"
 parquet = "58.0"
@@ -51,23 +54,24 @@ prost = "^0.14"
 prost-types = "^0.14"
 pbjson = "^0.9"
 pbjson-types = "^0.9"
-pyo3 = { version = "0.28" }
+pyo3 = "0.28"
 pyo3-async-runtimes = { version = "0.28", features = ["tokio-runtime"] }
-pyo3-stub-gen = { version = "0.10" }
+pyo3-stub-gen = "0.10" 
 rand = "0.10"
 reqwest = "0.13"
-serde = { version = "^1.0" }
-serde_json = { version = "^1.0" }
+rmcp = "1.7.0"
+serde = "^1.0"
+serde_json = "^1.0" 
 tdms = "0.3.0"
 hdf5 = { package = "hdf5-metno", version = "0.12", features = ["static"] }
 tempdir = "0.3"
-tokio = { version = "1" }
+tokio = "1"
 tokio-stream = "0.1"
 toml = "0.9"
 tonic = { version = "^0.14", features = ["gzip"] }
 tonic-prost = "^0.14"
 tower = "^0.5"
-tracing = { version = "0.1" }
+tracing = "0.1"
 tracing-appender = "0.2"
 tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
 tracing-test = { version = "0.2", features = ["no-env-filter"] }
@@ -79,6 +83,8 @@ sift_rs = { version = "0.9.1", path = "rust/crates/sift_rs" }
 sift_error = { version = "0.9.1", path = "rust/crates/sift_error" }
 sift_stream = { version = "0.9.1", path = "rust/crates/sift_stream" }
 sift_pbfs = { version = "0.9.1", path = "rust/crates/sift_pbfs" }
+sift_mcp = { version = "0.9.1", path = "rust/crates/sift_mcp" }
+sift_test_util = { version = "0.9.1", path = "rust/crates/sift_test_util" }
 
 sift_stream_bindings = { version = "0.3.0", path = "rust/crates/sift_stream_bindings" }
 
diff --git a/rust/crates/sift_mcp/Cargo.toml b/rust/crates/sift_mcp/Cargo.toml
new file mode 100644
index 000000000..3319932e7
--- /dev/null
+++ b/rust/crates/sift_mcp/Cargo.toml
@@ -0,0 +1,27 @@
+[package]
+name = "sift_mcp"
+authors.workspace = true
+version.workspace = true
+edition.workspace = true
+categories.workspace = true
+homepage.workspace = true
+repository.workspace = true
+keywords.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+rmcp = { workspace = true, features = ["transport-io"] }
+serde.workspace = true
+serde_json.workspace = true
+sift_connect.workspace = true
+sift_rs.workspace = true
+sift_stream.workspace = true
+tokio.workspace = true
+tonic.workspace = true
+anyhow.workspace = true
+clap = { workspace = true, features = ["cargo"] }
+
+[dev-dependencies]
+sift_test_util.workspace = true
+tokio-stream.workspace = true
diff --git a/rust/crates/sift_mcp/src/error/mod.rs b/rust/crates/sift_mcp/src/error/mod.rs
new file mode 100644
index 000000000..a72f5b069
--- /dev/null
+++ b/rust/crates/sift_mcp/src/error/mod.rs
@@ -0,0 +1,39 @@
+use rmcp::model::{CallToolResult, ErrorCode};
+use serde_json::json;
+use tonic::{Code, Status};
+
+pub type McpResult = Result<CallToolResult, rmcp::ErrorData>;
+
+pub fn from_grpc_status(status: Status) -> rmcp::ErrorData {
+    let code = from_grpc_code(status.code());
+    let message = status.message().to_string();
+    let data = Some(json!({
+        "grpc_code": status.code().to_string(),
+    }));
+
+    rmcp::ErrorData {
+        code,
+        message: message.into(),
+        data,
+    }
+}
+
+pub fn from_anyhow(error: anyhow::Error) -> rmcp::ErrorData {
+    let code = ErrorCode::INTERNAL_ERROR;
+    let message = format!("{error:?}");
+
+    rmcp::ErrorData {
+        code,
+        message: message.into(),
+        data: None,
+    }
+}
+
+fn from_grpc_code(code: Code) -> ErrorCode {
+    match code {
+        Code::InvalidArgument | Code::OutOfRange => ErrorCode::INVALID_PARAMS,
+        Code::NotFound => ErrorCode::RESOURCE_NOT_FOUND,
+        Code::Unimplemented => ErrorCode::METHOD_NOT_FOUND,
+        _ => ErrorCode::INTERNAL_ERROR,
+    }
+}
diff --git a/rust/crates/sift_mcp/src/lib.rs b/rust/crates/sift_mcp/src/lib.rs
new file mode 100644
index 000000000..65e09a004
--- /dev/null
+++ b/rust/crates/sift_mcp/src/lib.rs
@@ -0,0 +1,31 @@
+use anyhow::{Context, Result};
+use clap::{crate_name, crate_version};
+use rmcp::{ServiceExt, transport::stdio};
+use sift_rs::{Credentials, SiftChannelBuilder};
+
+pub(crate) mod server;
+use server::SiftMcpServer;
+
+pub mod tool;
+
+mod error;
+
+pub async fn run(credentials: Credentials, use_tls: bool) -> Result<()> {
+    let channel = SiftChannelBuilder::new(credentials)
+        .use_tls(use_tls)
+        .user_agent(format!("{}/{}", crate_name!(), crate_version!()))
+        .build()
+        .context("failed to build gRPC channel to connect to Sift")?;
+
+    let service = SiftMcpServer::new(channel)
+        .serve(stdio())
+        .await
+        .context("failed to start MCP server")?;
+
+    service
+        .waiting()
+        .await
+        .context("MCP server terminated unexpectedly")?;
+
+    Ok(())
+}
diff --git a/rust/crates/sift_mcp/src/server/mod.rs b/rust/crates/sift_mcp/src/server/mod.rs
new file mode 100644
index 000000000..ae98d9e39
--- /dev/null
+++ b/rust/crates/sift_mcp/src/server/mod.rs
@@ -0,0 +1,29 @@
+use rmcp::{ServerHandler, handler::server::tool::ToolRouter, tool_handler};
+use sift_rs::SiftChannel;
+
+#[derive(Clone)]
+pub struct SiftMcpServer {
+    pub(crate) channel: SiftChannel,
+    pub tool_router: ToolRouter<Self>,
+}
+
+#[tool_handler(
+    router = self.tool_router,
+    name = "SiftMcp",
+    version = "0.1.0",
+    instructions = "Sift MCP Server",
+)]
+impl ServerHandler for SiftMcpServer {}
+
+impl SiftMcpServer {
+    pub fn new(channel: SiftChannel) -> Self {
+        // Add more routers here as new tool groups are introduced, e.g.
+        //   Self::resource_router().merge(Self::ingestion_router())
+
+        let tool_router = Self::resource_router();
+        Self {
+            channel,
+            tool_router,
+        }
+    }
+}
diff --git a/rust/crates/sift_mcp/src/tool/mod.rs b/rust/crates/sift_mcp/src/tool/mod.rs
new file mode 100644
index 000000000..c6bee0532
--- /dev/null
+++ b/rust/crates/sift_mcp/src/tool/mod.rs
@@ -0,0 +1 @@
+pub mod resource;
diff --git a/rust/crates/sift_mcp/src/tool/resource/mod.rs b/rust/crates/sift_mcp/src/tool/resource/mod.rs
new file mode 100644
index 000000000..195ae86b2
--- /dev/null
+++ b/rust/crates/sift_mcp/src/tool/resource/mod.rs
@@ -0,0 +1,142 @@
+use anyhow::Context;
+use rmcp::{
+    handler::server::wrapper::Parameters,
+    model::CallToolResult,
+    schemars::{self, JsonSchema},
+    tool, tool_router,
+};
+use serde::Deserialize;
+use sift_rs::{
+    assets::v1::{ListAssetsRequest, ListAssetsResponse, asset_service_client::AssetServiceClient},
+    runs::v2::{ListRunsRequest, ListRunsResponse, run_service_client::RunServiceClient},
+};
+
+use crate::{error, server::SiftMcpServer};
+
+#[cfg(test)]
+mod test;
+
+const PAGE_SIZE: u32 = 1000;
+
+#[derive(Debug, Deserialize, JsonSchema)]
+pub struct GetParams {
+    filter: String,
+    limit: Option<u32>,
+}
+
+#[tool_router(router = resource_router, vis = "pub(crate)")]
+impl SiftMcpServer {
+    #[tool(
+        name = "get_asset",
+        description = "
+            Retrieve and filter assets in Sift. The `filter` parameter is a Common Expression Language (CEL).
+            Available fields to filter by are `asset_id`, `created_by_user_id`, `modified_by_user_id`,
+            `created_date`, `modified_date`, `name`, 'name_lower', `tag_id`, `tag_name`, 'archived_date', `is_archived`, and `metadata`.
+            Metadata can be used in filters by using `metadata.{metadata_key_name}` as the field name.
+        ",
+        annotations(title = "Resource/get_asset", read_only_hint = true)
+    )]
+    pub async fn get_asset(&self, params: Parameters<GetParams>) -> error::McpResult {
+        let Parameters(GetParams { filter, limit }) = params;
+        let (page_size, record_limit) = paging(limit);
+
+        let mut client = AssetServiceClient::new(self.channel.clone());
+        let mut page_token = String::new();
+        let mut results = Vec::new();
+
+        loop {
+            let resp = client
+                .list_assets(ListAssetsRequest {
+                    filter: filter.clone(),
+                    page_size,
+                    page_token,
+                    ..Default::default()
+                })
+                .await
+                .map_err(error::from_grpc_status)?;
+
+            let ListAssetsResponse {
+                assets,
+                next_page_token,
+            } = resp.into_inner();
+            if assets.is_empty() {
+                break;
+            }
+            results.extend(assets);
+
+            if results.len() >= record_limit || next_page_token.is_empty() {
+                break;
+            }
+            page_token = next_page_token;
+        }
+
+        results.truncate(record_limit);
+        let out = serde_json::to_value(&results)
+            .context("failed to serialize assets")
+            .map_err(error::from_anyhow)?;
+
+        Ok(CallToolResult::structured(out))
+    }
+
+    #[tool(
+        name = "get_run",
+        description = "
+            Retrieve and filter runs in Sift. The `filter` parameter is a Common Expression Language (CEL).
+            Available fields to filter by are `run_id` `organization_id`, `asset_id`, `asset_name`, `client_key`, `name`,
+            `description`, `created_by_user_id`, `modified_by_user_id`, `created_date`, `modified_date`, `start_time`,
+            `stop_time`, `tag_id`, `asset_tag_id`, `duration`, 'duration_string', `annotation_comments_count`, `annotation_state`,
+            `archived_date`, `is_archived`, and `metadata`. Metadata can be used in filters by using
+            `metadata.{metadata_key_name}` as the field name. `duration` is in the format of elapsed seconds and `duration_string`
+            allows for `h`, `m`, `s`, `ms` suffixes (example: `duration_string > duration('10h')).
+        ",
+        annotations(title = "Resource/get_run", read_only_hint = true)
+    )]
+    pub async fn get_run(&self, params: Parameters<GetParams>) -> error::McpResult {
+        let Parameters(GetParams { filter, limit }) = params;
+        let (page_size, record_limit) = paging(limit);
+
+        let mut client = RunServiceClient::new(self.channel.clone());
+        let mut page_token = String::new();
+        let mut results = Vec::new();
+
+        loop {
+            let resp = client
+                .list_runs(ListRunsRequest {
+                    filter: filter.clone(),
+                    page_size,
+                    page_token,
+                    ..Default::default()
+                })
+                .await
+                .map_err(error::from_grpc_status)?;
+
+            let ListRunsResponse {
+                runs,
+                next_page_token,
+            } = resp.into_inner();
+            if runs.is_empty() {
+                break;
+            }
+            results.extend(runs);
+
+            if results.len() >= record_limit || next_page_token.is_empty() {
+                break;
+            }
+            page_token = next_page_token;
+        }
+
+        results.truncate(record_limit);
+        let out = serde_json::to_value(&results)
+            .context("failed to serialize runs")
+            .map_err(error::from_anyhow)?;
+
+        Ok(CallToolResult::structured(out))
+    }
+}
+
+fn paging(limit: Option<u32>) -> (u32, usize) {
+    match limit {
+        Some(lim) if lim <= PAGE_SIZE => (lim, lim as usize),
+        _ => (PAGE_SIZE, usize::MAX),
+    }
+}
diff --git a/rust/crates/sift_mcp/src/tool/resource/test.rs b/rust/crates/sift_mcp/src/tool/resource/test.rs
new file mode 100644
index 000000000..9896f3f45
--- /dev/null
+++ b/rust/crates/sift_mcp/src/tool/resource/test.rs
@@ -0,0 +1,357 @@
+use rmcp::{handler::server::wrapper::Parameters, model::ErrorCode};
+use serde_json::Value;
+use sift_rs::{
+    assets::v1::{Asset, ListAssetsResponse, asset_service_server::AssetServiceServer},
+    runs::v2::{ListRunsResponse, Run, run_service_server::RunServiceServer},
+};
+use sift_test_util::{
+    grpc::memory_sift_channel,
+    mock::{assets::v1::MockAssetServiceImpl, runs::v2::MockRunServiceImpl},
+};
+use tokio::task::JoinHandle;
+use tonic::{Response, Status, transport::Server};
+
+use super::{GetParams, PAGE_SIZE};
+use crate::server::SiftMcpServer;
+
+async fn server_with_mocks(
+    asset_mock: MockAssetServiceImpl,
+    run_mock: MockRunServiceImpl,
+) -> (SiftMcpServer, JoinHandle<()>) {
+    let (client, server) = tokio::io::duplex(1024);
+    let channel = memory_sift_channel(client).await;
+
+    let handle = tokio::spawn(async move {
+        Server::builder()
+            .add_service(AssetServiceServer::new(asset_mock))
+            .add_service(RunServiceServer::new(run_mock))
+            .serve_with_incoming(tokio_stream::once(Ok::<_, std::io::Error>(server)))
+            .await
+            .unwrap();
+    });
+
+    (SiftMcpServer::new(channel), handle)
+}
+
+fn params(filter: &str, limit: Option<u32>) -> Parameters<GetParams> {
+    Parameters(GetParams {
+        filter: filter.into(),
+        limit,
+    })
+}
+
+fn structured(result: rmcp::model::CallToolResult) -> Value {
+    result
+        .structured_content
+        .expect("expected structured content")
+}
+
+#[tokio::test]
+async fn get_asset_returns_single_page() {
+    let mut asset_mock = MockAssetServiceImpl::new();
+    asset_mock
+        .expect_list_assets()
+        .withf(|req| req.get_ref().filter == "name == \"engine\"")
+        .returning(|_| {
+            Ok(Response::new(ListAssetsResponse {
+                assets: vec![
+                    Asset {
+                        asset_id: "a1".into(),
+                        name: "engine".into(),
+                        ..Default::default()
+                    },
+                    Asset {
+                        asset_id: "a2".into(),
+                        name: "engine".into(),
+                        ..Default::default()
+                    },
+                ],
+                next_page_token: String::new(),
+            }))
+        });
+
+    let (server, _h) = server_with_mocks(asset_mock, MockRunServiceImpl::new()).await;
+
+    let resp = server
+        .get_asset(params("name == \"engine\"", None))
+        .await
+        .expect("get_asset failed");
+
+    let assets = structured(resp);
+    assert_eq!(assets.as_array().unwrap().len(), 2);
+    assert_eq!(assets[0]["assetId"], "a1");
+    assert_eq!(assets[1]["assetId"], "a2");
+}
+
+#[tokio::test]
+async fn get_asset_paginates_until_token_empty() {
+    let mut asset_mock = MockAssetServiceImpl::new();
+    asset_mock.expect_list_assets().returning(|req| {
+        let req = req.into_inner();
+        assert_eq!(req.page_size, PAGE_SIZE);
+        let (assets, next) = match req.page_token.as_str() {
+            "" => (
+                vec![Asset {
+                    asset_id: "a1".into(),
+                    ..Default::default()
+                }],
+                "page-2".to_string(),
+            ),
+            "page-2" => (
+                vec![Asset {
+                    asset_id: "a2".into(),
+                    ..Default::default()
+                }],
+                "page-3".to_string(),
+            ),
+            "page-3" => (
+                vec![Asset {
+                    asset_id: "a3".into(),
+                    ..Default::default()
+                }],
+                String::new(),
+            ),
+            other => return Err(Status::invalid_argument(format!("bad token: {other}"))),
+        };
+        Ok(Response::new(ListAssetsResponse {
+            assets,
+            next_page_token: next,
+        }))
+    });
+
+    let (server, _h) = server_with_mocks(asset_mock, MockRunServiceImpl::new()).await;
+
+    let resp = server
+        .get_asset(params("", None))
+        .await
+        .expect("get_asset failed");
+
+    let assets = structured(resp);
+    let ids: Vec<&str> = assets
+        .as_array()
+        .unwrap()
+        .iter()
+        .map(|a| a["assetId"].as_str().unwrap())
+        .collect();
+    assert_eq!(ids, vec!["a1", "a2", "a3"]);
+}
+
+#[tokio::test]
+async fn get_asset_respects_limit() {
+    let mut asset_mock = MockAssetServiceImpl::new();
+    asset_mock.expect_list_assets().times(1).returning(|req| {
+        let req = req.into_inner();
+        assert_eq!(req.page_size, 2);
+        Ok(Response::new(ListAssetsResponse {
+            assets: vec![
+                Asset {
+                    asset_id: "a1".into(),
+                    ..Default::default()
+                },
+                Asset {
+                    asset_id: "a2".into(),
+                    ..Default::default()
+                },
+            ],
+            next_page_token: "page-2".into(),
+        }))
+    });
+
+    let (server, _h) = server_with_mocks(asset_mock, MockRunServiceImpl::new()).await;
+
+    let resp = server
+        .get_asset(params("", Some(2)))
+        .await
+        .expect("get_asset failed");
+
+    let assets = structured(resp);
+    assert_eq!(assets.as_array().unwrap().len(), 2);
+}
+
+#[tokio::test]
+async fn get_asset_breaks_on_empty_page() {
+    let mut asset_mock = MockAssetServiceImpl::new();
+    asset_mock.expect_list_assets().times(1).returning(|_| {
+        Ok(Response::new(ListAssetsResponse {
+            assets: vec![],
+            next_page_token: "ignored".into(),
+        }))
+    });
+
+    let (server, _h) = server_with_mocks(asset_mock, MockRunServiceImpl::new()).await;
+
+    let resp = server
+        .get_asset(params("", None))
+        .await
+        .expect("get_asset failed");
+
+    assert!(structured(resp).as_array().unwrap().is_empty());
+}
+
+#[tokio::test]
+async fn get_asset_propagates_grpc_error() {
+    let mut asset_mock = MockAssetServiceImpl::new();
+    asset_mock
+        .expect_list_assets()
+        .returning(|_| Err(Status::invalid_argument("bad filter")));
+
+    let (server, _h) = server_with_mocks(asset_mock, MockRunServiceImpl::new()).await;
+
+    let err = server
+        .get_asset(params("nope", None))
+        .await
+        .expect_err("expected error");
+
+    assert_eq!(err.code, ErrorCode::INVALID_PARAMS);
+    assert!(err.message.contains("bad filter"));
+}
+
+#[tokio::test]
+async fn get_run_returns_single_page() {
+    let mut run_mock = MockRunServiceImpl::new();
+    run_mock
+        .expect_list_runs()
+        .withf(|req| req.get_ref().filter == "name == \"launch\"")
+        .returning(|_| {
+            Ok(Response::new(ListRunsResponse {
+                runs: vec![Run {
+                    run_id: "r1".into(),
+                    name: "launch".into(),
+                    ..Default::default()
+                }],
+                next_page_token: String::new(),
+            }))
+        });
+
+    let (server, _h) = server_with_mocks(MockAssetServiceImpl::new(), run_mock).await;
+
+    let resp = server
+        .get_run(params("name == \"launch\"", None))
+        .await
+        .expect("get_run failed");
+
+    let runs = structured(resp);
+    assert_eq!(runs.as_array().unwrap().len(), 1);
+    assert_eq!(runs[0]["runId"], "r1");
+}
+
+#[tokio::test]
+async fn get_run_paginates_until_token_empty() {
+    let mut run_mock = MockRunServiceImpl::new();
+    run_mock.expect_list_runs().returning(|req| {
+        let token = req.into_inner().page_token;
+        let (runs, next) = match token.as_str() {
+            "" => (
+                vec![Run {
+                    run_id: "r1".into(),
+                    ..Default::default()
+                }],
+                "page-2".to_string(),
+            ),
+            "page-2" => (
+                vec![Run {
+                    run_id: "r2".into(),
+                    ..Default::default()
+                }],
+                String::new(),
+            ),
+            other => return Err(Status::invalid_argument(format!("bad token: {other}"))),
+        };
+        Ok(Response::new(ListRunsResponse {
+            runs,
+            next_page_token: next,
+        }))
+    });
+
+    let (server, _h) = server_with_mocks(MockAssetServiceImpl::new(), run_mock).await;
+
+    let resp = server
+        .get_run(params("", None))
+        .await
+        .expect("get_run failed");
+
+    let runs = structured(resp);
+    let ids: Vec<&str> = runs
+        .as_array()
+        .unwrap()
+        .iter()
+        .map(|r| r["runId"].as_str().unwrap())
+        .collect();
+    assert_eq!(ids, vec!["r1", "r2"]);
+}
+
+#[tokio::test]
+async fn get_run_truncates_to_limit_across_pages() {
+    let mut run_mock = MockRunServiceImpl::new();
+    run_mock.expect_list_runs().returning(|req| {
+        let req = req.into_inner();
+        assert_eq!(req.page_size, 3);
+        let (runs, next) = match req.page_token.as_str() {
+            "" => (
+                vec![
+                    Run {
+                        run_id: "r1".into(),
+                        ..Default::default()
+                    },
+                    Run {
+                        run_id: "r2".into(),
+                        ..Default::default()
+                    },
+                ],
+                "page-2".to_string(),
+            ),
+            "page-2" => (
+                vec![
+                    Run {
+                        run_id: "r3".into(),
+                        ..Default::default()
+                    },
+                    Run {
+                        run_id: "r4".into(),
+                        ..Default::default()
+                    },
+                ],
+                String::new(),
+            ),
+            other => return Err(Status::invalid_argument(format!("bad token: {other}"))),
+        };
+        Ok(Response::new(ListRunsResponse {
+            runs,
+            next_page_token: next,
+        }))
+    });
+
+    let (server, _h) = server_with_mocks(MockAssetServiceImpl::new(), run_mock).await;
+
+    let resp = server
+        .get_run(params("", Some(3)))
+        .await
+        .expect("get_run failed");
+
+    let runs = structured(resp);
+    let ids = runs
+        .as_array()
+        .unwrap()
+        .iter()
+        .map(|r| r["runId"].as_str().unwrap())
+        .collect::<Vec<_>>();
+    assert_eq!(ids, vec!["r1", "r2", "r3"]);
+}
+
+#[tokio::test]
+async fn get_run_propagates_grpc_error() {
+    let mut run_mock = MockRunServiceImpl::new();
+    run_mock
+        .expect_list_runs()
+        .returning(|_| Err(Status::not_found("no such run")));
+
+    let (server, _h) = server_with_mocks(MockAssetServiceImpl::new(), run_mock).await;
+
+    let err = server
+        .get_run(params("", None))
+        .await
+        .expect_err("expected error");
+
+    assert_eq!(err.code, ErrorCode::RESOURCE_NOT_FOUND);
+    assert!(err.message.contains("no such run"));
+}
diff --git a/rust/crates/sift_test_util/CLAUDE.md b/rust/crates/sift_test_util/CLAUDE.md
new file mode 100644
index 000000000..a6b60167a
--- /dev/null
+++ b/rust/crates/sift_test_util/CLAUDE.md
@@ -0,0 +1,92 @@
+# sift_test_util — guidance for Claude
+
+This crate exposes `mockall`-generated mocks for `sift_rs` gRPC service traits. When the user asks you to "create a mock for `sift_rs::<path>::<ServiceTrait>`", follow the recipe below.
+
+## Reference example
+
+`src/mock/assets/v1.rs` is the canonical example. It mocks `sift_rs::assets::v1::asset_service_server::AssetService`. Mirror its structure exactly when adding a new mock.
+
+## Recipe: adding a mock for a `sift_rs` service trait
+
+Given a request like *"create me a mock for `sift_rs::assets::v1::asset_service_server::AssetService`"*:
+
+### 1. Locate the generated trait
+
+The trait lives in `rust/crates/sift_rs/src/gen/sift/<service>/<version>/sift.<service>.<version>.tonic.rs`. For example:
+
+- `sift_rs::assets::v1::asset_service_server::AssetService` → `rust/crates/sift_rs/src/gen/sift/assets/v1/sift.assets.v1.tonic.rs`
+- `sift_rs::ingest::v1::ingest_service_server::IngestService` → `rust/crates/sift_rs/src/gen/sift/ingest/v1/sift.ingest.v1.tonic.rs`
+
+Inside that file, find the `pub trait <Name>: ... { ... }` block. Every `async fn` in it must appear in the mock — copy the signatures verbatim.
+
+### 2. Pick the file path
+
+The mock path mirrors the `sift_rs` path:
+
+| `sift_rs` trait | mock file |
+| --- | --- |
+| `sift_rs::<service>::<version>::<svc>_service_server::<Svc>Service` | `src/mock/<service>/<version>.rs` |
+
+For example `sift_rs::ingest::v1::ingest_service_server::IngestService` → `src/mock/ingest/v1.rs`. Create intermediate `mod.rs` files (`src/mock/<service>/mod.rs`) if the service directory doesn't exist yet, and add `pub mod <service>;` to `src/mock/mod.rs`.
+
+### 3. Write the mock file
+
+Pattern (taken from `src/mock/assets/v1.rs`):
+
+```rust
+use async_trait::async_trait;
+use sift_rs::<service>::<version>::{
+    <svc>_service_server::<Svc>Service,
+    // ...every request/response type referenced by the trait methods...
+};
+use mockall::mock;
+use tonic::{Request, Response, Status};
+
+mock! {
+    pub <Svc>ServiceImpl {}
+
+    #[async_trait]
+    impl <Svc>Service for <Svc>ServiceImpl {
+        async fn <method_name>(
+            &self,
+            request: Request<<MethodRequest>>,
+        ) -> std::result::Result<
+            Response<<MethodResponse>>,
+            Status,
+        >;
+        // ...one entry per method on the trait...
+    }
+}
+```
+
+Rules:
+
+- **Struct name is `Mock<Svc>ServiceImpl`** — `mockall` prepends `Mock` automatically, so write `pub <Svc>ServiceImpl {}` in the source. For `AssetService` the user-facing type is `MockAssetServiceImpl`.
+- **Include every method** from the generated trait, in any order. Missing methods make the `impl` incomplete and the trait won't be satisfied.
+- **Match the tonic signatures.** Use `Request<T>` / `Response<T>` (not `tonic::Request` / `tonic::Response`) because `Request` and `Status` are already in scope. Use the fully-qualified `std::result::Result<...>` form — this is what `mockall` expects.
+- **Streaming methods**: if the trait has associated stream types (e.g. `type FooStream = ...`) or returns `Response<Self::FooStream>`, the `mock!` macro needs `type FooStream = <concrete stream type>;` inside the `impl` block. Look at the trait's associated-type defaults in the generated file and pick a concrete type like `tonic::Streaming<T>` or a boxed stream — verify against any existing streaming mocks before guessing.
+- **Don't add doc comments to the methods.** They aren't load-bearing and clutter the diff.
+
+### 4. Wire it up
+
+After creating `src/mock/<service>/v1.rs`:
+
+1. Ensure `src/mock/<service>/mod.rs` exists and contains `pub mod v1;` (and any other versions present).
+2. Ensure `src/mock/mod.rs` contains `pub mod <service>;`.
+
+### 5. Verify it compiles
+
+Run from the workspace root:
+
+```sh
+cargo check -p sift_test_util
+```
+
+If the user asks for an accompanying test, add it to `src/mock/test.rs` following the existing patterns there (canned response, `withf` request matching, `times(N)` call counts, request-driven responses, pagination). Do not invent new patterns — match what's already in the file.
+
+## Things to avoid
+
+- Don't add the mock to `[dependencies]` of other workspace crates as a runtime dep — `sift_test_util` should appear under `[dev-dependencies]` only.
+- Don't re-export `sift_rs` types from the mock module. Callers import them directly from `sift_rs`.
+- Don't implement methods inside the `mock!` block — `mockall` generates them. Bodies are configured per-test via `mock.expect_<method>().returning(...)`.
+- Don't use `with(eq(...))` to match on `tonic::Request` — `Request` does not implement `PartialEq`. Use `.withf(|req| ...)` instead, as shown in `src/mock/test.rs`.
diff --git a/rust/crates/sift_test_util/Cargo.toml b/rust/crates/sift_test_util/Cargo.toml
new file mode 100644
index 000000000..c3c30e27a
--- /dev/null
+++ b/rust/crates/sift_test_util/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "sift_test_util"
+authors.workspace = true
+version.workspace = true
+edition.workspace = true
+categories.workspace = true
+homepage.workspace = true
+repository.workspace = true
+keywords.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+async-trait.workspace = true
+hyper-util.workspace = true
+mockall.workspace = true
+sift_connect.workspace = true
+sift_rs.workspace = true
+tokio.workspace = true
+tokio-stream.workspace = true
+tonic.workspace = true
+tower.workspace = true
+
+[dev-dependencies]
+tokio = { workspace = true, features = ["macros", "rt"] }
diff --git a/rust/crates/sift_test_util/README.md b/rust/crates/sift_test_util/README.md
new file mode 100644
index 000000000..e451ad4ff
--- /dev/null
+++ b/rust/crates/sift_test_util/README.md
@@ -0,0 +1,3 @@
+# sift_test_util
+
+Internal testing utilities for Sift's Rust crates. This crate is not published.
diff --git a/rust/crates/sift_test_util/src/grpc/mod.rs b/rust/crates/sift_test_util/src/grpc/mod.rs
new file mode 100644
index 000000000..9fbb9ae60
--- /dev/null
+++ b/rust/crates/sift_test_util/src/grpc/mod.rs
@@ -0,0 +1,39 @@
+use hyper_util::rt::TokioIo;
+use sift_connect::grpc::AuthInterceptor;
+use std::io::Error;
+use tonic::{
+    service::InterceptorLayer,
+    transport::{Endpoint, Uri},
+};
+use tower::{ServiceBuilder, service_fn};
+
+/// A test showing how to use [memory_sift_channel].
+#[cfg(test)]
+mod test;
+
+/// In-memory channel for testing.
+pub async fn memory_sift_channel(client: tokio::io::DuplexStream) -> sift_connect::SiftChannel {
+    let mut client = Some(client);
+
+    let channel = Endpoint::try_from("http://[::]:50051")
+        .unwrap()
+        .connect_with_connector(service_fn(move |_: Uri| {
+            let client = client.take();
+
+            async move {
+                if let Some(client) = client {
+                    Ok(TokioIo::new(client))
+                } else {
+                    Err(Error::other("Client already taken"))
+                }
+            }
+        }))
+        .await
+        .expect("failed to create gRPC memory channel");
+
+    ServiceBuilder::new()
+        .layer(InterceptorLayer::new(AuthInterceptor {
+            apikey: "sift-api-key".into(),
+        }))
+        .service(channel)
+}
diff --git a/rust/crates/sift_test_util/src/grpc/test.rs b/rust/crates/sift_test_util/src/grpc/test.rs
new file mode 100644
index 000000000..f569c47f7
--- /dev/null
+++ b/rust/crates/sift_test_util/src/grpc/test.rs
@@ -0,0 +1,56 @@
+use sift_rs::assets::v1::{
+    Asset, GetAssetRequest, GetAssetResponse, asset_service_client::AssetServiceClient,
+    asset_service_server::AssetServiceServer,
+};
+use tonic::{Request, Response, transport::Server};
+
+use super::memory_sift_channel;
+use crate::mock::assets::v1::MockAssetServiceImpl;
+
+/// Demonstrates how to use [memory_sift_channel] for testing.
+#[tokio::test]
+async fn test_memory_sift_channel() {
+    // Setup the mock
+    let asset_grpc = {
+        let mut mock = MockAssetServiceImpl::new();
+
+        mock.expect_get_asset().returning(|_req| {
+            Ok(Response::new(GetAssetResponse {
+                asset: Some(Asset {
+                    asset_id: "asset-1".into(),
+                    name: "engine".into(),
+                    ..Default::default()
+                }),
+            }))
+        });
+
+        AssetServiceServer::new(mock)
+    };
+
+    let (client, server) = tokio::io::duplex(1024);
+
+    let sift_channel = memory_sift_channel(client).await;
+
+    // Spawn the server
+    tokio::spawn(async move {
+        Server::builder()
+            .add_service(asset_grpc)
+            .serve_with_incoming(tokio_stream::once(Ok::<_, std::io::Error>(server)))
+            .await
+            .unwrap();
+    });
+
+    let mut client = AssetServiceClient::new(sift_channel);
+
+    let resp = client
+        .get_asset(Request::new(GetAssetRequest {
+            asset_id: "asset-1".into(),
+        }))
+        .await
+        .expect("get_asset rpc failed")
+        .into_inner();
+
+    let asset = resp.asset.expect("expected asset in response");
+    assert_eq!(asset.asset_id, "asset-1");
+    assert_eq!(asset.name, "engine");
+}
diff --git a/rust/crates/sift_test_util/src/lib.rs b/rust/crates/sift_test_util/src/lib.rs
new file mode 100644
index 000000000..0c7377086
--- /dev/null
+++ b/rust/crates/sift_test_util/src/lib.rs
@@ -0,0 +1,5 @@
+/// Mock implementations for protobuf services.
+pub mod mock;
+
+/// gRPC specific testing utils e.g. creating an in-memory transport channel
+pub mod grpc;
diff --git a/rust/crates/sift_test_util/src/mock/assets/mod.rs b/rust/crates/sift_test_util/src/mock/assets/mod.rs
new file mode 100644
index 000000000..a3a6d96c3
--- /dev/null
+++ b/rust/crates/sift_test_util/src/mock/assets/mod.rs
@@ -0,0 +1 @@
+pub mod v1;
diff --git a/rust/crates/sift_test_util/src/mock/assets/v1.rs b/rust/crates/sift_test_util/src/mock/assets/v1.rs
new file mode 100644
index 000000000..692c3cd31
--- /dev/null
+++ b/rust/crates/sift_test_util/src/mock/assets/v1.rs
@@ -0,0 +1,51 @@
+use async_trait::async_trait;
+use mockall::mock;
+use sift_rs::assets::v1::{
+    ArchiveAssetRequest, ArchiveAssetResponse, DeleteAssetRequest, DeleteAssetResponse,
+    GetAssetRequest, GetAssetResponse, ListAssetsRequest, ListAssetsResponse, UpdateAssetRequest,
+    UpdateAssetResponse, asset_service_server::AssetService,
+};
+use tonic::{Request, Response, Status};
+
+mock! {
+    pub AssetServiceImpl {}
+
+    #[async_trait]
+    impl AssetService for AssetServiceImpl {
+        async fn delete_asset(
+            &self,
+            request: Request<DeleteAssetRequest>,
+        ) -> std::result::Result<
+            Response<DeleteAssetResponse>,
+            Status,
+        >;
+        async fn get_asset(
+            &self,
+            request: Request<GetAssetRequest>,
+        ) -> std::result::Result<
+            Response<GetAssetResponse>,
+            Status,
+        >;
+        async fn list_assets(
+            &self,
+            request: Request<ListAssetsRequest>,
+        ) -> std::result::Result<
+            Response<ListAssetsResponse>,
+            Status,
+        >;
+        async fn update_asset(
+            &self,
+            request: Request<UpdateAssetRequest>,
+        ) -> std::result::Result<
+            Response<UpdateAssetResponse>,
+            Status,
+        >;
+        async fn archive_asset(
+            &self,
+            request: Request<ArchiveAssetRequest>,
+        ) -> std::result::Result<
+            Response<ArchiveAssetResponse>,
+            Status,
+        >;
+    }
+}
diff --git a/rust/crates/sift_test_util/src/mock/mod.rs b/rust/crates/sift_test_util/src/mock/mod.rs
new file mode 100644
index 000000000..8020760d4
--- /dev/null
+++ b/rust/crates/sift_test_util/src/mock/mod.rs
@@ -0,0 +1,6 @@
+pub mod assets;
+pub mod runs;
+
+/// A test demonstrating a little bit of everything of how to leverage the mock API.
+#[cfg(test)]
+mod test;
diff --git a/rust/crates/sift_test_util/src/mock/runs/mod.rs b/rust/crates/sift_test_util/src/mock/runs/mod.rs
new file mode 100644
index 000000000..7083bd82d
--- /dev/null
+++ b/rust/crates/sift_test_util/src/mock/runs/mod.rs
@@ -0,0 +1 @@
+pub mod v2;
diff --git a/rust/crates/sift_test_util/src/mock/runs/v2.rs b/rust/crates/sift_test_util/src/mock/runs/v2.rs
new file mode 100644
index 000000000..3c3b43f56
--- /dev/null
+++ b/rust/crates/sift_test_util/src/mock/runs/v2.rs
@@ -0,0 +1,74 @@
+use async_trait::async_trait;
+use mockall::mock;
+use sift_rs::runs::v2::{
+    CreateAdhocRunRequest, CreateAdhocRunResponse, CreateAutomaticRunAssociationForAssetsRequest,
+    CreateAutomaticRunAssociationForAssetsResponse, CreateRunRequest, CreateRunResponse,
+    DeleteRunRequest, DeleteRunResponse, GetRunRequest, GetRunResponse, ListRunsRequest,
+    ListRunsResponse, StopRunRequest, StopRunResponse, UpdateRunRequest, UpdateRunResponse,
+    run_service_server::RunService,
+};
+use tonic::{Request, Response, Status};
+
+mock! {
+    pub RunServiceImpl {}
+
+    #[async_trait]
+    impl RunService for RunServiceImpl {
+        async fn get_run(
+            &self,
+            request: Request<GetRunRequest>,
+        ) -> std::result::Result<
+            Response<GetRunResponse>,
+            Status,
+        >;
+        async fn list_runs(
+            &self,
+            request: Request<ListRunsRequest>,
+        ) -> std::result::Result<
+            Response<ListRunsResponse>,
+            Status,
+        >;
+        async fn create_run(
+            &self,
+            request: Request<CreateRunRequest>,
+        ) -> std::result::Result<
+            Response<CreateRunResponse>,
+            Status,
+        >;
+        async fn create_adhoc_run(
+            &self,
+            request: Request<CreateAdhocRunRequest>,
+        ) -> std::result::Result<
+            Response<CreateAdhocRunResponse>,
+            Status,
+        >;
+        async fn update_run(
+            &self,
+            request: Request<UpdateRunRequest>,
+        ) -> std::result::Result<
+            Response<UpdateRunResponse>,
+            Status,
+        >;
+        async fn delete_run(
+            &self,
+            request: Request<DeleteRunRequest>,
+        ) -> std::result::Result<
+            Response<DeleteRunResponse>,
+            Status,
+        >;
+        async fn stop_run(
+            &self,
+            request: Request<StopRunRequest>,
+        ) -> std::result::Result<
+            Response<StopRunResponse>,
+            Status,
+        >;
+        async fn create_automatic_run_association_for_assets(
+            &self,
+            request: Request<CreateAutomaticRunAssociationForAssetsRequest>,
+        ) -> std::result::Result<
+            Response<CreateAutomaticRunAssociationForAssetsResponse>,
+            Status,
+        >;
+    }
+}
diff --git a/rust/crates/sift_test_util/src/mock/test.rs b/rust/crates/sift_test_util/src/mock/test.rs
new file mode 100644
index 000000000..ef09ea198
--- /dev/null
+++ b/rust/crates/sift_test_util/src/mock/test.rs
@@ -0,0 +1,174 @@
+use crate::mock::assets::v1::MockAssetServiceImpl;
+use sift_rs::assets::v1::{
+    Asset, GetAssetRequest, GetAssetResponse, ListAssetsRequest, ListAssetsResponse,
+    asset_service_server::AssetService,
+};
+use tonic::{Code, Request, Status};
+
+/// The simplest case: stub a single method to return a fixed response.
+#[tokio::test]
+async fn returns_a_canned_response() {
+    let mut mock = MockAssetServiceImpl::new();
+
+    mock.expect_get_asset().returning(|_req| {
+        Ok(tonic::Response::new(GetAssetResponse {
+            asset: Some(Asset {
+                asset_id: "asset-1".into(),
+                name: "engine".into(),
+                ..Default::default()
+            }),
+        }))
+    });
+
+    let resp = mock
+        .get_asset(Request::new(GetAssetRequest {
+            asset_id: "asset-1".into(),
+        }))
+        .await
+        .unwrap()
+        .into_inner();
+
+    let asset = resp.asset.unwrap();
+    assert_eq!(asset.asset_id, "asset-1");
+    assert_eq!(asset.name, "engine");
+}
+
+/// Match on request arguments with `withf(...)` so the mock only fires for
+/// the expected input. A second expectation handles the "not found" case.
+/// `withf` is used instead of `with(eq(...))` because `tonic::Request` does
+/// not implement `PartialEq`.
+#[tokio::test]
+async fn matches_on_request_arguments() {
+    let mut mock = MockAssetServiceImpl::new();
+
+    mock.expect_get_asset()
+        .withf(|req| req.get_ref().asset_id == "asset-1")
+        .returning(|_| {
+            Ok(tonic::Response::new(GetAssetResponse {
+                asset: Some(Asset {
+                    asset_id: "asset-1".into(),
+                    name: "engine".into(),
+                    ..Default::default()
+                }),
+            }))
+        });
+
+    mock.expect_get_asset()
+        .withf(|req| req.get_ref().asset_id == "missing")
+        .returning(|_| Err(Status::not_found("asset not found")));
+
+    let found = mock
+        .get_asset(Request::new(GetAssetRequest {
+            asset_id: "asset-1".into(),
+        }))
+        .await
+        .unwrap()
+        .into_inner();
+    assert_eq!(found.asset.unwrap().name, "engine");
+
+    let err = mock
+        .get_asset(Request::new(GetAssetRequest {
+            asset_id: "missing".into(),
+        }))
+        .await
+        .unwrap_err();
+    assert_eq!(err.code(), Code::NotFound);
+}
+
+/// Assert the method was called exactly N times. The expectation panics on
+/// drop if the count doesn't match, so this doubles as a usage check.
+#[tokio::test]
+async fn enforces_call_count() {
+    let mut mock = MockAssetServiceImpl::new();
+
+    mock.expect_list_assets().times(2).returning(|_| {
+        Ok(tonic::Response::new(ListAssetsResponse {
+            assets: vec![],
+            next_page_token: String::new(),
+        }))
+    });
+
+    for _ in 0..2 {
+        mock.list_assets(Request::new(ListAssetsRequest::default()))
+            .await
+            .unwrap();
+    }
+}
+
+/// Compute the response from the request. Useful when behavior depends on
+/// input rather than being a fixed value.
+#[tokio::test]
+async fn computes_response_from_request() {
+    let mut mock = MockAssetServiceImpl::new();
+
+    mock.expect_get_asset().returning(|req| {
+        let asset_id = req.into_inner().asset_id;
+        Ok(tonic::Response::new(GetAssetResponse {
+            asset: Some(Asset {
+                name: format!("name-for-{asset_id}"),
+                asset_id,
+                ..Default::default()
+            }),
+        }))
+    });
+
+    let resp = mock
+        .get_asset(Request::new(GetAssetRequest {
+            asset_id: "abc".into(),
+        }))
+        .await
+        .unwrap()
+        .into_inner();
+    assert_eq!(resp.asset.unwrap().name, "name-for-abc");
+}
+
+/// Combine a page-aware mock with pagination on the client side. Shows how a
+/// single expectation with a dynamic closure can simulate multi-page state.
+#[tokio::test]
+async fn simulates_pagination() {
+    let mut mock = MockAssetServiceImpl::new();
+
+    mock.expect_list_assets().returning(|req| {
+        let page = req.into_inner().page_token;
+        let (assets, next) = match page.as_str() {
+            "" => (
+                vec![Asset {
+                    asset_id: "a1".into(),
+                    ..Default::default()
+                }],
+                "page-2".to_string(),
+            ),
+            "page-2" => (
+                vec![Asset {
+                    asset_id: "a2".into(),
+                    ..Default::default()
+                }],
+                String::new(),
+            ),
+            other => return Err(Status::invalid_argument(format!("bad token: {other}"))),
+        };
+        Ok(tonic::Response::new(ListAssetsResponse {
+            assets,
+            next_page_token: next,
+        }))
+    });
+
+    let mut ids = Vec::new();
+    let mut token = String::new();
+    loop {
+        let resp = mock
+            .list_assets(Request::new(ListAssetsRequest {
+                page_token: token.clone(),
+                ..Default::default()
+            }))
+            .await
+            .unwrap()
+            .into_inner();
+        ids.extend(resp.assets.into_iter().map(|a| a.asset_id));
+        if resp.next_page_token.is_empty() {
+            break;
+        }
+        token = resp.next_page_token;
+    }
+    assert_eq!(ids, vec!["a1", "a2"]);
+}

From e73894afc01b02b684cf74bc161f703720a77dde Mon Sep 17 00:00:00 2001
From: Wei Lu <wei@siftstack.com>
Date: Thu, 21 May 2026 17:10:52 -0700
Subject: [PATCH 07/15] python(feat): support nested calculated channel
 references (#580)

---
 .../_internal/low_level_wrappers/exports.py   | 23 +++--
 .../_internal/low_level_wrappers/rules.py     | 14 ++-
 .../sift_client/_internal/util/channels.py    |  3 +
 .../_tests/_internal/test_channels.py         | 19 ++++
 .../sift_types/test_calculated_channel.py     | 63 ++++++++++++
 .../_tests/sift_types/test_channel.py         | 96 ++++++++++++++++++-
 .../_tests/sift_types/test_rule.py            | 73 ++++++++++++++
 .../sift_types/calculated_channel.py          | 34 ++++++-
 python/lib/sift_client/sift_types/channel.py  | 51 +++++++++-
 python/lib/sift_client/sift_types/rule.py     |  7 +-
 10 files changed, 362 insertions(+), 21 deletions(-)

diff --git a/python/lib/sift_client/_internal/low_level_wrappers/exports.py b/python/lib/sift_client/_internal/low_level_wrappers/exports.py
index 63aa200cd..fd438fac0 100644
--- a/python/lib/sift_client/_internal/low_level_wrappers/exports.py
+++ b/python/lib/sift_client/_internal/low_level_wrappers/exports.py
@@ -26,10 +26,25 @@
 if TYPE_CHECKING:
     from datetime import datetime
 
+    from sift_client.sift_types.channel import ChannelReference
     from sift_client.sift_types.export import ExportOutputFormat
     from sift_client.transport.grpc_transport import GrpcClient
 
 
+def _abstract_ref_to_proto(ref: ChannelReference) -> CalculatedChannelAbstractChannelReference:
+    # After ChannelReference validation, calculated_channel is always a version_id string.
+    if ref.calculated_channel:
+        assert isinstance(ref.calculated_channel, str)
+        return CalculatedChannelAbstractChannelReference(
+            channel_reference=ref.channel_reference,
+            calculated_channel_version_id=ref.calculated_channel,
+        )
+    return CalculatedChannelAbstractChannelReference(
+        channel_reference=ref.channel_reference,
+        channel_identifier=ref.channel_identifier or "",
+    )
+
+
 def _build_calc_channel_configs(
     calculated_channels: list[CalculatedChannel | CalculatedChannelCreate] | None,
 ) -> list[CalculatedChannelConfig]:
@@ -46,13 +61,7 @@ def _build_calc_channel_configs(
             CalculatedChannelConfig(
                 name=cc.name,
                 expression=cc.expression or "",
-                channel_references=[
-                    CalculatedChannelAbstractChannelReference(
-                        channel_reference=ref.channel_reference,
-                        channel_identifier=ref.channel_identifier,
-                    )
-                    for ref in refs
-                ],
+                channel_references=[_abstract_ref_to_proto(ref) for ref in refs],
                 units=cc.units,
             )
         )
diff --git a/python/lib/sift_client/_internal/low_level_wrappers/rules.py b/python/lib/sift_client/_internal/low_level_wrappers/rules.py
index 3658a0698..364ee37e1 100644
--- a/python/lib/sift_client/_internal/low_level_wrappers/rules.py
+++ b/python/lib/sift_client/_internal/low_level_wrappers/rules.py
@@ -75,6 +75,16 @@
 logger = logging.getLogger(__name__)
 
 
+def _channel_reference_to_proto(ref: ChannelReference) -> ChannelReferenceProto:
+    # ChannelReference's validator normalizes calculated_channel to a version_id str
+    # and guarantees exactly one of calculated_channel / channel_identifier is set.
+    if ref.calculated_channel:
+        return ChannelReferenceProto(
+            calculated_channel_version_id=cast("str", ref.calculated_channel)
+        )
+    return ChannelReferenceProto(name=cast("str", ref.channel_identifier))
+
+
 class RulesLowLevelClient(LowLevelClientBase, WithGrpcClient):
     """Low-level client for the RulesAPI.
 
@@ -126,7 +136,7 @@ def _update_rule_request_from_create(self, create: RuleCreate) -> UpdateRuleRequ
             calculated_channel=CalculatedChannelConfig(
                 expression=create.expression,
                 channel_references={
-                    c.channel_reference: ChannelReferenceProto(name=c.channel_identifier)
+                    c.channel_reference: _channel_reference_to_proto(c)
                     for c in create.channel_references
                 },
             )
@@ -259,7 +269,7 @@ def _update_rule_request_from_update(
                 CalculatedChannelConfig(
                     expression=expression,
                     channel_references={
-                        c.channel_reference: ChannelReferenceProto(name=c.channel_identifier)
+                        c.channel_reference: _channel_reference_to_proto(c)
                         for c in channel_references
                     },
                 )
diff --git a/python/lib/sift_client/_internal/util/channels.py b/python/lib/sift_client/_internal/util/channels.py
index 8c3d39d82..e7c37396b 100644
--- a/python/lib/sift_client/_internal/util/channels.py
+++ b/python/lib/sift_client/_internal/util/channels.py
@@ -32,6 +32,9 @@ async def resolve_calculated_channels(
 
         resolved_refs: list[ChannelReference] = []
         for ref in refs:
+            if ref.calculated_channel:
+                resolved_refs.append(ref)
+                continue
             channel = await channels_api.find(
                 name=ref.channel_identifier,
                 assets=cc.asset_ids,
diff --git a/python/lib/sift_client/_tests/_internal/test_channels.py b/python/lib/sift_client/_tests/_internal/test_channels.py
index 3b6be637d..792f82ec5 100644
--- a/python/lib/sift_client/_tests/_internal/test_channels.py
+++ b/python/lib/sift_client/_tests/_internal/test_channels.py
@@ -41,6 +41,25 @@ async def test_resolves_name_to_uuid(self):
         assert refs is not None
         assert refs[0].channel_identifier == "resolved-uuid"
 
+    @pytest.mark.asyncio
+    async def test_skips_lookup_for_calculated_channel_version_id(self):
+        api = MagicMock()
+        api.find = AsyncMock()
+        cc = CalculatedChannelCreate(
+            name="nested",
+            expression="$1 + 1",
+            expression_channel_references=[
+                ChannelReference(channel_reference="$1", calculated_channel="v-nested")
+            ],
+        )
+        result = await resolve_calculated_channels([cc], channels_api=api)
+        api.find.assert_not_awaited()
+        assert result is not None
+        refs = result[0].expression_channel_references
+        assert refs is not None
+        assert refs[0].calculated_channel == "v-nested"
+        assert refs[0].channel_identifier is None
+
     @pytest.mark.asyncio
     async def test_keeps_identifier_when_not_found(self):
         api = MagicMock()
diff --git a/python/lib/sift_client/_tests/sift_types/test_calculated_channel.py b/python/lib/sift_client/_tests/sift_types/test_calculated_channel.py
index e5b1059c5..d0ac06c93 100644
--- a/python/lib/sift_client/_tests/sift_types/test_calculated_channel.py
+++ b/python/lib/sift_client/_tests/sift_types/test_calculated_channel.py
@@ -7,6 +7,7 @@
 
 from sift_client.sift_types import CalculatedChannel
 from sift_client.sift_types.calculated_channel import (
+    CalculatedChannelCreate,
     CalculatedChannelUpdate,
 )
 from sift_client.sift_types.channel import ChannelReference
@@ -151,6 +152,68 @@ def test_expression_validator_rejects_references_without_expression(self):
                 ],
             )
 
+    def test_nested_calculated_channel_reference_serialized_to_version_id_oneof(self):
+        """A ChannelReference with calculated_channel set routes to the proto oneof."""
+        update = CalculatedChannelUpdate(
+            expression="$1 + $2",
+            expression_channel_references=[
+                ChannelReference(channel_reference="$1", channel_identifier="channel1"),
+                ChannelReference(channel_reference="$2", calculated_channel="v-nested"),
+            ],
+        )
+        update.resource_id = "test_calc_channel_id"
+
+        proto, _ = update.to_proto_with_mask()
+
+        refs = proto.calculated_channel_configuration.query_configuration.sel.expression_channel_references
+        assert len(refs) == 2
+        assert refs[0].channel_identifier == "channel1"
+        assert refs[0].WhichOneof("calculated_channel_reference") is None
+        assert refs[1].WhichOneof("calculated_channel_reference") == "calculated_channel_version_id"
+        assert refs[1].calculated_channel_version_id == "v-nested"
+        assert refs[1].channel_identifier == ""
+
+    def test_create_serializes_nested_calculated_channel_reference(self):
+        """CalculatedChannelCreate.to_proto routes the version_id into the proto oneof."""
+        create = CalculatedChannelCreate(
+            name="nested-cc",
+            expression="$1 * 2",
+            expression_channel_references=[
+                ChannelReference(channel_reference="$1", calculated_channel="v-nested"),
+            ],
+            all_assets=True,
+        )
+
+        proto = create.to_proto()
+
+        refs = proto.calculated_channel_configuration.query_configuration.sel.expression_channel_references
+        assert len(refs) == 1
+        assert refs[0].WhichOneof("calculated_channel_reference") == "calculated_channel_version_id"
+        assert refs[0].calculated_channel_version_id == "v-nested"
+
+    def test_create_serializes_nested_reference_from_calculated_channel_object(
+        self, mock_calculated_channel
+    ):
+        """Passing a CalculatedChannel object to ChannelReference also serializes correctly."""
+        create = CalculatedChannelCreate(
+            name="nested-cc",
+            expression="$1 * 2",
+            expression_channel_references=[
+                ChannelReference(
+                    channel_reference="$1", calculated_channel=mock_calculated_channel
+                ),
+            ],
+            all_assets=True,
+        )
+
+        proto = create.to_proto()
+
+        refs = proto.calculated_channel_configuration.query_configuration.sel.expression_channel_references
+        assert len(refs) == 1
+        assert refs[0].WhichOneof("calculated_channel_reference") == "calculated_channel_version_id"
+        # mock_calculated_channel fixture has version_id="v1"
+        assert refs[0].calculated_channel_version_id == "v1"
+
     def test_expression_validator_accepts_both_set(self):
         """Test validator accepts expression and channel references together."""
         # Should not raise
diff --git a/python/lib/sift_client/_tests/sift_types/test_channel.py b/python/lib/sift_client/_tests/sift_types/test_channel.py
index ab8fa01c2..76bbafc78 100644
--- a/python/lib/sift_client/_tests/sift_types/test_channel.py
+++ b/python/lib/sift_client/_tests/sift_types/test_channel.py
@@ -6,7 +6,7 @@
 import pytest
 
 from sift_client.sift_types import Channel
-from sift_client.sift_types.channel import ChannelDataType
+from sift_client.sift_types.channel import ChannelDataType, ChannelReference
 
 
 @pytest.fixture
@@ -104,6 +104,100 @@ def test_data_method_as_arrow(self, mock_channel, mock_client):
         mock_client.channels.get_data.assert_not_called()
         assert result == mock_data
 
+    def test_channel_reference_requires_one_target(self):
+        """ChannelReference must specify exactly one of identifier or calculated_channel."""
+        with pytest.raises(ValueError, match="exactly one"):
+            ChannelReference(channel_reference="$1")
+        with pytest.raises(ValueError, match="exactly one"):
+            ChannelReference(
+                channel_reference="$1",
+                channel_identifier="ch",
+                calculated_channel="v-id",
+            )
+
+    def test_channel_reference_accepts_version_id_string(self):
+        """A plain version_id string is stored as-is."""
+        ref = ChannelReference(channel_reference="$1", calculated_channel="v-abc")
+        assert ref.calculated_channel == "v-abc"
+        assert ref.channel_identifier is None
+
+    def test_channel_reference_accepts_calculated_channel_object(self):
+        """Passing a CalculatedChannel normalizes to its version_id string."""
+        from sift_client.sift_types.calculated_channel import CalculatedChannel
+
+        cc = CalculatedChannel(
+            proto=MagicMock(),
+            id_="cc-id",
+            name="parent",
+            description="",
+            expression="$1",
+            channel_references=[],
+            is_archived=False,
+            units=None,
+            asset_ids=["asset-1"],
+            tag_ids=None,
+            all_assets=False,
+            organization_id=None,
+            client_key=None,
+            archived_date=None,
+            version_id="v-abc",
+            version=1,
+            change_message=None,
+            user_notes=None,
+            created_date=datetime.now(timezone.utc),
+            modified_date=datetime.now(timezone.utc),
+            created_by_user_id="u",
+            modified_by_user_id="u",
+        )
+
+        ref = ChannelReference(channel_reference="$1", calculated_channel=cc)
+        assert ref.calculated_channel == "v-abc"
+
+    def test_channel_reference_rejects_calculated_channel_without_version_id(self):
+        """A CalculatedChannel missing version_id is unusable as a reference."""
+        from sift_client.sift_types.calculated_channel import CalculatedChannel
+
+        cc = CalculatedChannel(
+            proto=MagicMock(),
+            id_="cc-id",
+            name="parent",
+            description="",
+            expression="$1",
+            channel_references=[],
+            is_archived=False,
+            units=None,
+            asset_ids=["asset-1"],
+            tag_ids=None,
+            all_assets=False,
+            organization_id=None,
+            client_key=None,
+            archived_date=None,
+            version_id=None,
+            version=None,
+            change_message=None,
+            user_notes=None,
+            created_date=datetime.now(timezone.utc),
+            modified_date=datetime.now(timezone.utc),
+            created_by_user_id="u",
+            modified_by_user_id="u",
+        )
+
+        with pytest.raises(ValueError, match="no version_id"):
+            ChannelReference(channel_reference="$1", calculated_channel=cc)
+
+    def test_channel_reference_from_proto_reads_version_id_oneof(self):
+        """_from_proto picks calculated_channel when the proto oneof selects it."""
+        from sift.calculated_channels.v2.calculated_channels_pb2 import (
+            CalculatedChannelAbstractChannelReference,
+        )
+
+        proto = CalculatedChannelAbstractChannelReference(
+            channel_reference="$1", calculated_channel_version_id="v-abc"
+        )
+        ref = ChannelReference._from_proto(proto)
+        assert ref.calculated_channel == "v-abc"
+        assert ref.channel_identifier is None
+
     def test_data_method_with_minimal_params(self, mock_channel, mock_client):
         """Test that data() method works with minimal parameters."""
         mock_data = {"test_channel": MagicMock()}
diff --git a/python/lib/sift_client/_tests/sift_types/test_rule.py b/python/lib/sift_client/_tests/sift_types/test_rule.py
index 05a15381f..f91443c6a 100644
--- a/python/lib/sift_client/_tests/sift_types/test_rule.py
+++ b/python/lib/sift_client/_tests/sift_types/test_rule.py
@@ -147,3 +147,76 @@ def test_unarchive_calls_client_and_updates_self(self, mock_rule, mock_client):
             mock_update.assert_called_once_with(unarchived_rule)
             # Verify it returns self
             assert result is mock_rule
+
+
+class TestRuleChannelReferenceSerialization:
+    """Nested CC references flow through the low-level wrapper and back."""
+
+    def test_helper_routes_version_id_into_proto(self):
+        from sift_client._internal.low_level_wrappers.rules import (
+            _channel_reference_to_proto,
+        )
+
+        proto = _channel_reference_to_proto(
+            ChannelReference(channel_reference="$1", calculated_channel="v-abc")
+        )
+        assert proto.HasField("calculated_channel_version_id")
+        assert proto.calculated_channel_version_id == "v-abc"
+        assert proto.name == ""
+
+    def test_helper_routes_identifier_into_name(self):
+        from sift_client._internal.low_level_wrappers.rules import (
+            _channel_reference_to_proto,
+        )
+
+        proto = _channel_reference_to_proto(
+            ChannelReference(channel_reference="$1", channel_identifier="my-channel")
+        )
+        assert not proto.HasField("calculated_channel_version_id")
+        assert proto.name == "my-channel"
+
+    def test_from_proto_reads_version_id_when_present(self):
+        from google.protobuf.timestamp_pb2 import Timestamp
+        from sift.rules.v1.rules_pb2 import (
+            CalculatedChannelConfig,
+            RuleCondition,
+            RuleConditionExpression,
+        )
+        from sift.rules.v1.rules_pb2 import (
+            ChannelReference as ChannelReferenceProto,
+        )
+        from sift.rules.v1.rules_pb2 import (
+            Rule as RuleProto,
+        )
+        from sift.rules.v1.rules_pb2 import (
+            RuleAction as RuleActionProto,
+        )
+
+        ts = Timestamp()
+        ts.GetCurrentTime()
+        proto = RuleProto(
+            rule_id="r1",
+            name="r",
+            description="",
+            created_date=ts,
+            modified_date=ts,
+            conditions=[
+                RuleCondition(
+                    expression=RuleConditionExpression(
+                        calculated_channel=CalculatedChannelConfig(
+                            expression="$1 > 0",
+                            channel_references={
+                                "$1": ChannelReferenceProto(calculated_channel_version_id="v-xyz"),
+                            },
+                        )
+                    ),
+                    actions=[RuleActionProto(created_date=ts, modified_date=ts)],
+                )
+            ],
+        )
+        rule = Rule._from_proto(proto)
+        assert rule.channel_references is not None
+        assert len(rule.channel_references) == 1
+        ref = rule.channel_references[0]
+        assert ref.calculated_channel == "v-xyz"
+        assert ref.channel_identifier is None
diff --git a/python/lib/sift_client/sift_types/calculated_channel.py b/python/lib/sift_client/sift_types/calculated_channel.py
index d1fca2523..f150431b4 100644
--- a/python/lib/sift_client/sift_types/calculated_channel.py
+++ b/python/lib/sift_client/sift_types/calculated_channel.py
@@ -26,6 +26,28 @@
     from sift_client.client import SiftClient
 
 
+def _channel_reference_to_proto(
+    channel_reference: str,
+    channel_identifier: str | None = None,
+    calculated_channel: str | None = None,
+) -> CalculatedChannelAbstractChannelReference:
+    """Convert a ChannelReference dict (from model_dump) into its proto form.
+
+    Maps the ``calculated_channel`` field onto the proto's ``calculated_channel_version_id``
+    oneof. After ChannelReference validation, ``calculated_channel`` is always a
+    version_id string (or None).
+    """
+    if calculated_channel:
+        return CalculatedChannelAbstractChannelReference(
+            channel_reference=channel_reference,
+            calculated_channel_version_id=calculated_channel,
+        )
+    return CalculatedChannelAbstractChannelReference(
+        channel_reference=channel_reference,
+        channel_identifier=channel_identifier or "",
+    )
+
+
 class CalculatedChannel(BaseType[CalculatedChannelProto, "CalculatedChannel"]):
     """Model of the Sift Calculated Channel."""
 
@@ -108,10 +130,7 @@ def _from_proto(
             description=proto.description,
             expression=proto.calculated_channel_configuration.query_configuration.sel.expression,
             channel_references=[
-                ChannelReference(
-                    channel_reference=ref_proto.channel_reference,
-                    channel_identifier=ref_proto.channel_identifier,
-                )
+                ChannelReference._from_proto(ref_proto)
                 for ref_proto in proto.calculated_channel_configuration.query_configuration.sel.expression_channel_references
             ],
             organization_id=proto.organization_id,
@@ -163,7 +182,7 @@ class CalculatedChannelBase(ModelCreateUpdateBase):
         "expression_channel_references": MappingHelper(
             proto_attr_path="calculated_channel_configuration.query_configuration.sel.expression_channel_references",
             update_field="query_configuration",
-            converter=CalculatedChannelAbstractChannelReference,
+            converter=_channel_reference_to_proto,
         ),
         "tag_ids": MappingHelper(
             proto_attr_path="calculated_channel_configuration.asset_configuration.selection.tag_ids",
@@ -225,3 +244,8 @@ def _add_resource_id_to_proto(self, proto_msg: CalculatedChannelProto):
         if self._resource_id is None:
             raise ValueError("Resource ID must be set before adding to proto")
         proto_msg.calculated_channel_id = self._resource_id
+
+
+# Resolve the forward reference to CalculatedChannel in ChannelReference now that
+# CalculatedChannel is defined in this module's namespace.
+ChannelReference.model_rebuild(_types_namespace={"CalculatedChannel": CalculatedChannel})  # type: ignore[arg-type]
diff --git a/python/lib/sift_client/sift_types/channel.py b/python/lib/sift_client/sift_types/channel.py
index 69ba4b8ed..402f5d25a 100644
--- a/python/lib/sift_client/sift_types/channel.py
+++ b/python/lib/sift_client/sift_types/channel.py
@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING
 
 import sift.common.type.v1.channel_data_type_pb2 as channel_pb
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 from sift.channels.v3.channels_pb2 import Channel as ChannelProto
 from sift.common.type.v1.channel_bit_field_element_pb2 import (
     ChannelBitFieldElement as ChannelBitFieldElementPb,
@@ -32,6 +32,7 @@
 
     from sift_client.client import SiftClient
     from sift_client.sift_types.asset import Asset
+    from sift_client.sift_types.calculated_channel import CalculatedChannel
     from sift_client.sift_types.run import Run
 
 
@@ -344,13 +345,53 @@ def runs(self) -> list[Run]:
 
 
 class ChannelReference(BaseModel):
-    """Channel reference for calculated channel or rule."""
-
-    channel_reference: str  # The key of the channel in the expression i.e. $1, $2, etc.
-    channel_identifier: str  # The name of the channel
+    """Channel reference for a calculated channel or rule expression.
+
+    Exactly one of `channel_identifier` or `calculated_channel` must be set.
+    To reference another calculated channel (nested CC), pass either a fetched
+    `CalculatedChannel` (its `version_id` is used) or a `version_id` string
+    directly. Both normalize to the `version_id` string after validation.
+
+    Attributes:
+        channel_reference: The key of the channel in the expression (e.g. ``$1``, ``$2``).
+        channel_identifier: The name (or ID) of an existing channel.
+        calculated_channel: A ``CalculatedChannel`` or its ``version_id``. Normalized
+            to the ``version_id`` string after validation.
+    """
+
+    channel_reference: str
+    channel_identifier: str | None = None
+    calculated_channel: CalculatedChannel | str | None = None
+
+    @model_validator(mode="after")
+    def _normalize_and_validate(self) -> ChannelReference:
+        # Lazy import avoids a circular dependency at module load time.
+        from sift_client.sift_types.calculated_channel import CalculatedChannel
+
+        if isinstance(self.calculated_channel, CalculatedChannel):
+            if not self.calculated_channel.version_id:
+                raise ValueError(
+                    "ChannelReference: provided CalculatedChannel has no version_id. "
+                    "Fetch it via client.calculated_channels.get(...) first."
+                )
+            self.calculated_channel = self.calculated_channel.version_id
+
+        has_identifier = bool(self.channel_identifier)
+        has_calc_channel = bool(self.calculated_channel)
+        if has_identifier == has_calc_channel:
+            raise ValueError(
+                "ChannelReference requires exactly one of channel_identifier or "
+                "calculated_channel to be set"
+            )
+        return self
 
     @classmethod
     def _from_proto(cls, proto) -> ChannelReference:
+        if proto.WhichOneof("calculated_channel_reference") == "calculated_channel_version_id":
+            return cls(
+                channel_reference=proto.channel_reference,
+                calculated_channel=proto.calculated_channel_version_id,
+            )
         return cls(
             channel_reference=proto.channel_reference,
             channel_identifier=proto.channel_identifier,
diff --git a/python/lib/sift_client/sift_types/rule.py b/python/lib/sift_client/sift_types/rule.py
index 0b241625a..5112b964c 100644
--- a/python/lib/sift_client/sift_types/rule.py
+++ b/python/lib/sift_client/sift_types/rule.py
@@ -130,7 +130,12 @@ def _from_proto(cls, proto: RuleProto, sift_client: SiftClient | None = None) ->
             description=proto.description,
             expression=expression,
             channel_references=[
-                ChannelReference(channel_reference=ref, channel_identifier=c.name)
+                ChannelReference(
+                    channel_reference=ref,
+                    calculated_channel=c.calculated_channel_version_id,
+                )
+                if c.HasField("calculated_channel_version_id")
+                else ChannelReference(channel_reference=ref, channel_identifier=c.name)
                 for ref, c in proto.conditions[
                     0
                 ].expression.calculated_channel.channel_references.items()

From 329ec11bcb2e3d8b448acc09b3ea1d51ec75833a Mon Sep 17 00:00:00 2001
From: Wei Lu <wei@siftstack.com>
Date: Thu, 21 May 2026 17:21:20 -0700
Subject: [PATCH 08/15] python(fix): infer parquet time column for int64/uint64
 timestamps (#582)

---
 .../_tests/resources/test_data_imports.py     | 154 +++++++++++++++++-
 .../lib/sift_client/resources/data_imports.py |  44 ++++-
 2 files changed, 195 insertions(+), 3 deletions(-)

diff --git a/python/lib/sift_client/_tests/resources/test_data_imports.py b/python/lib/sift_client/_tests/resources/test_data_imports.py
index 6b8dcf03e..18e9c1ca5 100644
--- a/python/lib/sift_client/_tests/resources/test_data_imports.py
+++ b/python/lib/sift_client/_tests/resources/test_data_imports.py
@@ -1,11 +1,31 @@
 """Unit tests for data import config models and helpers."""
 
+from __future__ import annotations
+
 from datetime import datetime, timezone
+from typing import TYPE_CHECKING, cast
 
 import pytest
+from sift.common.type.v1.channel_config_pb2 import ChannelConfig as ChannelConfigProto
+from sift.data_imports.v2.data_imports_pb2 import (
+    ParquetColumn,
+    ParquetConfig,
+    ParquetFlatDatasetConfig,
+    ParquetSingleChannelPerRowConfig,
+)
+from sift.data_imports.v2.data_imports_pb2 import (
+    ParquetDataColumn as ParquetDataColumnProto,
+)
+from sift.data_imports.v2.data_imports_pb2 import (
+    ParquetTimeColumn as ParquetTimeColumnProto,
+)
 
 from sift_client.resources import DataImportAPI, DataImportAPIAsync
-from sift_client.resources.data_imports import _resolve_data_type_key
+from sift_client.resources.data_imports import (
+    _infer_time_column,
+    _parse_parquet_detect_response,
+    _resolve_data_type_key,
+)
 from sift_client.sift_types.channel import ChannelDataType
 from sift_client.sift_types.data_import import (
     CsvDataColumn,
@@ -22,6 +42,11 @@
     TimeFormat,
 )
 
+if TYPE_CHECKING:
+    from sift.common.type.v1.channel_data_type_pb2 import (
+        ChannelDataType as ChannelDataTypeProto,
+    )
+
 
 @pytest.mark.integration
 def test_client_binding(sift_client):
@@ -362,3 +387,130 @@ def test_explicit_data_type_overrides_extension(self):
     def test_unknown_extension_raises(self):
         with pytest.raises(ValueError, match="Unsupported file extension"):
             _resolve_data_type_key(".xyz", None)
+
+
+class TestInferTimeColumn:
+    def test_picks_canonical_skips_other_columns(self):
+        path = _infer_time_column(
+            [
+                ("delta_time", ChannelDataType.INT_64, "delta_time"),
+                ("voltage", ChannelDataType.DOUBLE, "voltage"),
+                ("timestamp", ChannelDataType.INT_64, "timestamp"),
+            ]
+        )
+        assert path == "timestamp"
+
+    def test_accepts_uint64(self):
+        path = _infer_time_column([("time", ChannelDataType.UINT_64, "time")])
+        assert path == "time"
+
+    def test_case_insensitive(self):
+        path = _infer_time_column([("TimeStamp", ChannelDataType.INT_64, "TimeStamp")])
+        assert path == "TimeStamp"
+
+    def test_multiple_candidates_sorted_alphabetically(self):
+        path = _infer_time_column(
+            [
+                ("timestamp", ChannelDataType.INT_64, "timestamp"),
+                ("time", ChannelDataType.INT_64, "time"),
+                ("ts", ChannelDataType.INT_64, "ts"),
+            ]
+        )
+        assert path == "time"
+
+    def test_returns_none_when_no_canonical_int_column(self):
+        path = _infer_time_column(
+            [
+                ("timestamp", ChannelDataType.DOUBLE, "timestamp"),
+                ("event_time", ChannelDataType.INT_64, "event_time"),
+            ]
+        )
+        assert path is None
+
+
+def _make_flat_dataset_response(
+    time_path: str, data_columns: list[tuple[str, int]]
+) -> ParquetConfig:
+    return ParquetConfig(
+        flat_dataset=ParquetFlatDatasetConfig(
+            time_column=ParquetTimeColumnProto(path=time_path),
+            data_columns=[
+                ParquetDataColumnProto(
+                    path=path,
+                    channel_config=ChannelConfigProto(
+                        name=path,
+                        data_type=cast("ChannelDataTypeProto.ValueType", data_type),
+                    ),
+                )
+                for path, data_type in data_columns
+            ],
+        )
+    )
+
+
+def _make_scpr_response(time_path: str, columns: list[tuple[str, int]]) -> ParquetConfig:
+    return ParquetConfig(
+        single_channel_per_row=ParquetSingleChannelPerRowConfig(
+            time_column=ParquetTimeColumnProto(path=time_path),
+            columns=[
+                ParquetColumn(
+                    path=path,
+                    column_config=ChannelConfigProto(
+                        name=path,
+                        data_type=cast("ChannelDataTypeProto.ValueType", data_type),
+                    ),
+                )
+                for path, data_type in columns
+            ],
+        )
+    )
+
+
+class TestParseParquetDetectResponseTimeFallback:
+    def test_flat_dataset_infers_int64_time_column(self):
+        proto = _make_flat_dataset_response(
+            time_path="",
+            data_columns=[
+                ("voltage", ChannelDataType.DOUBLE.value),
+                ("timestamp", ChannelDataType.INT_64.value),
+                ("status", ChannelDataType.INT_32.value),
+            ],
+        )
+        config = _parse_parquet_detect_response(proto, "file.parquet", 0, 0)
+        assert isinstance(config, ParquetFlatDatasetImportConfig)
+        assert config.time_column.path == "timestamp"
+        assert [dc.path for dc in config.data_columns] == ["voltage", "status"]
+
+    def test_flat_dataset_keeps_server_time_column_when_set(self):
+        proto = _make_flat_dataset_response(
+            time_path="server_ts",
+            data_columns=[
+                ("server_ts", ChannelDataType.INT_64.value),
+                ("timestamp", ChannelDataType.INT_64.value),
+                ("voltage", ChannelDataType.DOUBLE.value),
+            ],
+        )
+        config = _parse_parquet_detect_response(proto, "file.parquet", 0, 0)
+        assert config.time_column.path == "server_ts"
+        assert [dc.path for dc in config.data_columns] == ["timestamp", "voltage"]
+
+    def test_flat_dataset_no_int64_match_leaves_time_empty(self):
+        proto = _make_flat_dataset_response(
+            time_path="",
+            data_columns=[("voltage", ChannelDataType.DOUBLE.value)],
+        )
+        config = _parse_parquet_detect_response(proto, "file.parquet", 0, 0)
+        assert config.time_column.path == ""
+        assert [dc.path for dc in config.data_columns] == ["voltage"]
+
+    def test_scpr_infers_int64_time_column(self):
+        proto = _make_scpr_response(
+            time_path="",
+            columns=[
+                ("voltage", ChannelDataType.DOUBLE.value),
+                ("timestamp", ChannelDataType.INT_64.value),
+            ],
+        )
+        config = _parse_parquet_detect_response(proto, "file.parquet", 0, 0)
+        assert isinstance(config, ParquetSingleChannelPerRowImportConfig)
+        assert config.time_column.path == "timestamp"
diff --git a/python/lib/sift_client/resources/data_imports.py b/python/lib/sift_client/resources/data_imports.py
index 401b77e62..045c0537f 100644
--- a/python/lib/sift_client/resources/data_imports.py
+++ b/python/lib/sift_client/resources/data_imports.py
@@ -8,6 +8,7 @@
 from sift_client._internal.util.file import extract_parquet_footer, upload_file
 from sift_client.resources._base import ResourceBase
 from sift_client.sift_types.asset import Asset
+from sift_client.sift_types.channel import ChannelDataType
 from sift_client.sift_types.data_import import (
     EXTENSION_TO_DATA_TYPE_KEY,
     CsvImportConfig,
@@ -15,10 +16,13 @@
     ImportConfig,
     ParquetFlatDatasetImportConfig,
     ParquetSingleChannelPerRowImportConfig,
+    ParquetTimeColumn,
 )
 from sift_client.sift_types.run import Run
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable
+
     from sift_client.client import SiftClient
     from sift_client.sift_types.job import Job
 
@@ -320,6 +324,28 @@ def _parse_csv_detect_response(proto) -> CsvImportConfig:
     return csv_config
 
 
+_TIME_COLUMN_NAMES: frozenset[str] = frozenset({"ts", "timestamp", "time"})
+_TIME_COLUMN_TYPES: frozenset[ChannelDataType] = frozenset(
+    {ChannelDataType.INT_64, ChannelDataType.UINT_64}
+)
+
+
+def _infer_time_column(
+    columns: Iterable[tuple[str, ChannelDataType, str]],
+) -> str | None:
+    """Pick a likely time column when the server couldn't identify one.
+
+    Returns the path of an INT64 or UINT64 column whose name
+    (case-insensitive) matches one of ``ts``, ``timestamp``, or ``time``.
+    Returns None otherwise.
+    """
+    data_columns = sorted(columns, key=lambda c: c[0].lower())
+    for name, data_type, path in data_columns:
+        if data_type in _TIME_COLUMN_TYPES and name.lower() in _TIME_COLUMN_NAMES:
+            return path
+    return None
+
+
 def _parse_parquet_detect_response(
     proto, filename: str, footer_offset: int, footer_length: int
 ) -> ParquetFlatDatasetImportConfig | ParquetSingleChannelPerRowImportConfig:
@@ -328,16 +354,30 @@ def _parse_parquet_detect_response(
         parquet_config = ParquetFlatDatasetImportConfig._from_proto(
             proto, footer_offset=footer_offset, footer_length=footer_length
         )
-        time_path = parquet_config.time_column.path
+        time_path: str | None = parquet_config.time_column.path
+        if not time_path:
+            time_path = _infer_time_column(
+                (dc.name, dc.data_type, dc.path) for dc in parquet_config.data_columns
+            )
+            if time_path:
+                parquet_config.time_column = ParquetTimeColumn(path=time_path)
         if time_path:
             parquet_config.data_columns = [
                 dc for dc in parquet_config.data_columns if dc.path != time_path
             ]
         return parquet_config
     elif proto.HasField("single_channel_per_row"):
-        return ParquetSingleChannelPerRowImportConfig._from_proto(
+        scpr_config = ParquetSingleChannelPerRowImportConfig._from_proto(
             proto, footer_offset=footer_offset, footer_length=footer_length
         )
+        if not scpr_config.time_column.path:
+            time_path = _infer_time_column(
+                (col.column_config.name, ChannelDataType(col.column_config.data_type), col.path)
+                for col in proto.single_channel_per_row.columns
+            )
+            if time_path is not None:
+                scpr_config.time_column = ParquetTimeColumn(path=time_path)
+        return scpr_config
     raise ValueError(f"Unsupported parquet layout in DetectConfig response for '{filename}'.")
 
 

From 065ddf19915ce3bb384f07fc00aaab2ed57436fb Mon Sep 17 00:00:00 2001
From: Wei Lu <wei@siftstack.com>
Date: Thu, 21 May 2026 17:42:31 -0700
Subject: [PATCH 09/15] python(chore): v0.16.2 prep (#583)

---
 python/CHANGELOG.md   | 6 ++++++
 python/pyproject.toml | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md
index e60084db8..cc692ecce 100644
--- a/python/CHANGELOG.md
+++ b/python/CHANGELOG.md
@@ -3,6 +3,12 @@ All notable changes to this project will be documented in this file.
 
 This project adheres to [Semantic Versioning](http://semver.org/).
 
+## [v0.16.2] - May 21, 2026
+
+### Bugfixes
+- Resolve nested calculated channel references when fetching calculated channels so expressions that depend on other calculated channels evaluate correctly. ([#580](https://github.com/sift-stack/sift/pull/580))
+- Infer the parquet time column for integer timestamp columns in data imports so files with integer epoch time columns no longer fail detection. ([#582](https://github.com/sift-stack/sift/pull/582))
+
 ## [v0.16.1] - May 19, 2026
 
 ### Bugfixes
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 629ef6174..6a3a34d83 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sift_stack_py"
-version = "0.16.1"
+version = "0.16.2"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }

From f7f445e2c18deb10c80e4f7d39523b320484b994 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Thu, 14 May 2026 14:12:46 -0700
Subject: [PATCH 10/15] init commit

---
 .../_tests/util/_step_status_capture.py       |  65 ++
 .../_tests/util/step_status_states.md         |  99 +++
 .../_tests/util/test_step_status_states.py    | 587 ++++++++++++++++++
 3 files changed, 751 insertions(+)
 create mode 100644 python/lib/sift_client/_tests/util/_step_status_capture.py
 create mode 100644 python/lib/sift_client/_tests/util/step_status_states.md
 create mode 100644 python/lib/sift_client/_tests/util/test_step_status_states.py

diff --git a/python/lib/sift_client/_tests/util/_step_status_capture.py b/python/lib/sift_client/_tests/util/_step_status_capture.py
new file mode 100644
index 000000000..6838fdcc5
--- /dev/null
+++ b/python/lib/sift_client/_tests/util/_step_status_capture.py
@@ -0,0 +1,65 @@
+"""Shared state for the step-status characterization suite.
+
+The outer test in ``test_step_status_states.py`` runs inner pytest sessions
+via ``pytester``. The inner session installs a fake ``sift_client`` (see
+``_INNER_CONFTEST_SRC`` in that file) which records every step status
+write into this module's ``CAPTURED_STEPS`` dict so the outer test can
+assert on what the plugin produced.
+
+This lives in its own module (rather than inside the test file) because
+the inner ``conftest.py`` runs in a fresh pytester tmp dir and needs an
+importable, package-reachable handle to the same dict object.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from sift_client.sift_types.test_report import TestStatus
+
+
+@dataclass
+class CapturedStep:
+    step_id: str
+    name: str
+    step_path: str
+    parent_step_id: str | None
+    statuses: list[TestStatus] = field(default_factory=list)
+
+
+CAPTURED_STEPS: dict[str, CapturedStep] = {}
+
+
+def reset() -> None:
+    CAPTURED_STEPS.clear()
+
+
+def steps_by_name(name: str) -> list[CapturedStep]:
+    return [s for s in CAPTURED_STEPS.values() if s.name == name]
+
+
+def test_step(name: str) -> CapturedStep | None:
+    """The step the autouse ``step`` fixture creates for the test function.
+
+    There can be a deeper step with the same name when the ``makereport``
+    hook also records one (e.g. ``pytest.skip()`` inside the test body, or
+    an ``xfail`` mark). The autouse step is the shallowest of those, so
+    pick by step_path depth.
+    """
+    matches = [s for s in CAPTURED_STEPS.values() if s.name == name]
+    if not matches:
+        return None
+    return min(matches, key=lambda s: s.step_path.count("."))
+
+
+def child_steps(parent: CapturedStep) -> list[CapturedStep]:
+    return [s for s in CAPTURED_STEPS.values() if s.parent_step_id == parent.step_id]
+
+
+def final_status(name: str) -> TestStatus | None:
+    step = test_step(name)
+    if step is None or not step.statuses:
+        return None
+    return step.statuses[-1]
diff --git a/python/lib/sift_client/_tests/util/step_status_states.md b/python/lib/sift_client/_tests/util/step_status_states.md
new file mode 100644
index 000000000..6d48f7c75
--- /dev/null
+++ b/python/lib/sift_client/_tests/util/step_status_states.md
@@ -0,0 +1,99 @@
+# Pytest-plugin step-status: observed vs. target
+
+Companion document to `test_step_status_states.py`. Each row corresponds to
+one scenario in that suite. The **observed** column is the status the Sift
+pytest plugin records for the test's outer step today; the **target** column
+is what the audit recommends. Rows where the two differ are the work items
+for the fix.
+
+`TestStatus` values referenced below come from
+`sift_client.sift_types.test_report.TestStatus`: `PASSED`, `FAILED`, `ERROR`,
+`SKIPPED`, plus the proposed `XFAILED` / `XPASSED` / `ABORTED` additions
+called out in the audit.
+
+## Call-phase exit paths
+
+| Scenario                                | Trigger                                       | Observed today              | Target                                     | Status |
+| --------------------------------------- | --------------------------------------------- | --------------------------- | ------------------------------------------ | ------ |
+| Test passes                             | function body returns cleanly                 | `PASSED`                    | `PASSED`                                   | OK     |
+| Assert failure in call phase            | `assert 1 == 2`                               | `FAILED`                    | `FAILED`                                   | OK     |
+| Generic exception in call phase         | `raise ValueError("boom")`                    | `ERROR`                     | `ERROR`                                    | OK     |
+| `pytest.fail("...")` from body          | `pytest.fail("intentional failure")`          | `ERROR`                     | `FAILED`                                   | Gap    |
+| `SystemExit` from the test body         | `sys.exit(1)`                                 | `ERROR`                     | `ABORTED` (proposed) or documented `ERROR` | Gap    |
+| `KeyboardInterrupt` in body             | `raise KeyboardInterrupt`                     | `PASSED` (session aborts before the plugin sees the interrupt) | `ABORTED` (proposed) | Gap |
+
+## Skip paths
+
+| Scenario                                | Trigger                                       | Observed today                                                              | Target                                                          | Status |
+| --------------------------------------- | --------------------------------------------- | --------------------------------------------------------------------------- | --------------------------------------------------------------- | ------ |
+| Collection-time skip                    | `@pytest.mark.skip(reason=...)`               | `SKIPPED` (only the makereport hook records a step; no autouse step ran)    | `SKIPPED`                                                       | OK     |
+| Runtime skip in body                    | `pytest.skip("...")`                          | Outer step `ERROR`; a nested step with the same name records `SKIPPED`      | Outer step `SKIPPED`; no duplicate nested step                  | Gap    |
+| Skip raised inside a fixture            | `@pytest.fixture` calls `pytest.skip("...")`  | Outer step `PASSED`; a nested `SKIPPED` step is created by the makereport hook | Outer step `SKIPPED` with `phase=setup`; no duplicate nested step | Gap |
+
+## xfail / xpass
+
+| Scenario                                | Trigger                                                | Observed today                                                                                  | Target                                                | Status |
+| --------------------------------------- | ------------------------------------------------------ | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------ |
+| xfail-marked test that fails            | `@pytest.mark.xfail` + `assert 1 == 2`                 | Outer step `FAILED`; nested `SKIPPED` substep from the makereport hook                          | Outer step `XFAILED`; no duplicate nested step        | Gap    |
+| Strict xfail that unexpectedly passes   | `@pytest.mark.xfail(strict=True)` + `assert True`      | Outer step `PASSED` (plugin never sees pytest's "strict xpass" failure attached to the report)  | Outer step `XPASSED`                                  | Gap    |
+| Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`               | Outer step `PASSED` (pytest reports outcome="passed" with `wasxfail` set; plugin ignores it)    | Outer step `XPASSED`                                  | Gap    |
+| `xfail(raises=...)` with wrong exception | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | Outer step `ERROR` (treated as a generic non-assertion exception)                           | `FAILED` (the `raises=` mismatch is a real test failure) | Gap |
+| `xfail(run=False)`                      | `@pytest.mark.xfail(run=False)` (body never executed)  | `SKIPPED` (only the makereport hook records a step)                                              | `XFAILED`                                            | Gap    |
+
+## Setup / teardown phases
+
+| Scenario                                | Trigger                                                              | Observed today                                                                                                                            | Target                                                  | Status |
+| --------------------------------------- | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
+| Setup-phase fixture failure (RuntimeError) | `@pytest.fixture` raises before `yield`; test body never runs    | Outer step does not exist or lands `PASSED`; the plugin does not consult `report.when`                                                    | `ERROR` with `phase=setup` annotation                   | Gap    |
+| Teardown-phase fixture failure          | `@pytest.fixture` raises after `yield`; test body passed             | Outer step `PASSED` — it closes before the failing teardown runs, so the error is invisible                                              | `FAILED` with `phase=teardown` annotation               | Gap    |
+| Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | Outer step `FAILED` (the call-phase failure dominates); the teardown error is silently lost                                              | `FAILED` with a `phase=teardown` annotation so the teardown error is also visible | Gap |
+
+## Collection / fixture-resolution failures
+
+| Scenario                                | Trigger                                       | Observed today                                                                                                                                  | Target                                                  | Status |
+| --------------------------------------- | --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
+| Missing fixture                         | `def test_x(nonexistent_fixture):`            | Outer step `PASSED` — the autouse `step` fixture's setup still runs before pytest detects the missing fixture; the user sees a green step for a test that never executed | `ERROR` with `phase=setup`                              | Gap    |
+
+## Plugin-API exit paths (in-test mutations)
+
+| Scenario                                | Trigger                                                                | Observed today | Target   | Status |
+| --------------------------------------- | ---------------------------------------------------------------------- | -------------- | -------- | ------ |
+| Manual status override                  | `step.current_step.update({"status": TestStatus.FAILED})`              | `FAILED`       | `FAILED` | OK     |
+| `report_outcome(result=False)`          | `step.report_outcome("the_check", False, "did not match")`             | `FAILED`       | `FAILED` | OK     |
+| `measure(...)` out-of-bounds            | `step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})`  | `FAILED`       | `FAILED` | OK     |
+
+## Out of scope for this characterization run
+
+- **Timeout** — needs `pytest-timeout` or a manual signal harness. Add as a
+  follow-up once the audit picks a timeout strategy.
+- **Signal (SIGKILL / SIGTERM)** — cannot be caught from inside the process;
+  needs a subprocess-level harness.
+- **`pytest.exit("...")`** — niche; the "aborts subsequent tests" behavior
+  is hard to characterize cleanly because each `pytester` invocation is its
+  own session. Document the expectation alongside `SystemExit`.
+- **`os._exit()`** — bypasses Python cleanup entirely; can't be tested
+  in-process because it would kill the outer pytest run. Document as a
+  guaranteed data-loss case alongside `SystemExit` / `SIGKILL`.
+- **Parametrize-level marks** (`pytest.param(..., marks=pytest.mark.xfail / skip)`)
+  — routes through a different selection path but produces the same
+  `report.outcome`, so behavior should match the function-level marks
+  already covered above. Add only if the plugin's eventual phase-aware
+  handler diverges between the two.
+- **Import error / syntax error / `conftest.py` error** — these fail
+  collection entirely; no `item` is produced and no plugin hook fires.
+  Document explicitly that no Sift step is recorded.
+- **No-data / indeterminate** — tracked separately as part of the sibling
+  status-semantics work.
+
+## How to refresh this table
+
+Run the suite locally:
+
+```
+pytest lib/sift_client/_tests/util/test_step_status_states.py -v
+```
+
+Every "Gap" row corresponds to a `# AUDIT:` comment in the test file naming
+the target status. When the plugin fix lands, the regression edit is
+mechanical: flip the assertion in each gap row to its target, then update
+the **Observed today** column here to match.
diff --git a/python/lib/sift_client/_tests/util/test_step_status_states.py b/python/lib/sift_client/_tests/util/test_step_status_states.py
new file mode 100644
index 000000000..42c245457
--- /dev/null
+++ b/python/lib/sift_client/_tests/util/test_step_status_states.py
@@ -0,0 +1,587 @@
+"""Characterization suite: maps each pytest exit path to the ``TestStatus``
+the plugin currently records on the step.
+
+Each scenario writes a tiny inner test file and runs it through pytester
+with a fake ``sift_client`` injected via a generated conftest. The fake
+records every step status write into ``_step_status_capture.CAPTURED_STEPS``
+so this outer test can assert on what the plugin produced.
+
+The expected statuses below reflect **current** behavior. Where current
+behavior contradicts the audit target (setup-phase, teardown-phase, xfail),
+the assertion is paired with an ``AUDIT:`` comment naming the target.
+Updating these tests is the regression check once the fix lands.
+"""
+
+from __future__ import annotations
+
+import textwrap
+
+import pytest
+
+from sift_client._tests.util import _step_status_capture as capture
+from sift_client.sift_types.test_report import TestStatus
+
+pytest_plugins = ["pytester"]
+
+
+_INNER_CONFTEST_SRC = '''
+"""Auto-generated conftest for the step-status characterization suite.
+
+Installs the Sift pytest plugin and a fake ``sift_client`` that records
+every step status write into the outer test's CAPTURED_STEPS dict.
+"""
+
+from __future__ import annotations
+
+import uuid
+
+import pytest
+
+# Bring the Sift fixtures + the makereport hook into this inner session.
+from sift_client.util.test_results import *  # noqa: F401,F403
+
+from sift_client._tests.util._step_status_capture import CAPTURED_STEPS, CapturedStep
+from sift_client.sift_types.test_report import (
+    TestMeasurement,
+    TestReport,
+    TestStep,
+)
+
+
+
+class _FakeTestResults:
+    def __init__(self, client):
+        self._client = client
+
+    def create(self, test_report, log_file=None):
+        report = TestReport(
+            id_=str(uuid.uuid4()),
+            status=test_report.status,
+            name=test_report.name,
+            test_system_name=test_report.test_system_name,
+            test_case=test_report.test_case,
+            start_time=test_report.start_time,
+            end_time=test_report.end_time,
+            metadata=test_report.metadata or {},
+            is_archived=False,
+        )
+        report._apply_client_to_instance(self._client)
+        return report
+
+    def update(self, test_report, update, log_file=None):
+        return test_report
+
+    def create_step(self, test_step, log_file=None):
+        step_id = str(uuid.uuid4())
+        CAPTURED_STEPS[step_id] = CapturedStep(
+            step_id=step_id,
+            name=test_step.name,
+            step_path=test_step.step_path,
+            parent_step_id=test_step.parent_step_id,
+            statuses=[test_step.status],
+        )
+        step = TestStep(
+            id_=step_id,
+            test_report_id=test_step.test_report_id,
+            parent_step_id=test_step.parent_step_id,
+            name=test_step.name,
+            description=test_step.description,
+            step_type=test_step.step_type,
+            step_path=test_step.step_path,
+            status=test_step.status,
+            start_time=test_step.start_time,
+            end_time=test_step.end_time,
+            error_info=test_step.error_info,
+        )
+        step._apply_client_to_instance(self._client)
+        return step
+
+    def update_step(self, test_step, update, log_file=None):
+        new_status = (
+            update.get("status") if isinstance(update, dict) else update.status
+        )
+        if test_step.id_ in CAPTURED_STEPS and new_status is not None:
+            CAPTURED_STEPS[test_step.id_].statuses.append(new_status)
+        merged_status = new_status if new_status is not None else test_step.status
+        updated = TestStep(
+            id_=test_step.id_,
+            test_report_id=test_step.test_report_id,
+            parent_step_id=test_step.parent_step_id,
+            name=test_step.name,
+            description=test_step.description,
+            step_type=test_step.step_type,
+            step_path=test_step.step_path,
+            status=merged_status,
+            start_time=test_step.start_time,
+            end_time=test_step.end_time,
+            error_info=test_step.error_info,
+        )
+        updated._apply_client_to_instance(self._client)
+        return updated
+
+    def create_measurement(self, test_measurement, update_step=False, log_file=None):
+        measurement = TestMeasurement(
+            id_=str(uuid.uuid4()),
+            measurement_type=test_measurement.measurement_type,
+            name=test_measurement.name,
+            test_step_id=test_measurement.test_step_id,
+            numeric_value=test_measurement.numeric_value,
+            string_value=test_measurement.string_value,
+            boolean_value=test_measurement.boolean_value,
+            unit=test_measurement.unit,
+            numeric_bounds=test_measurement.numeric_bounds,
+            string_expected_value=test_measurement.string_expected_value,
+            passed=test_measurement.passed,
+            timestamp=test_measurement.timestamp,
+        )
+        measurement._apply_client_to_instance(self._client)
+        return measurement
+
+
+class _FakePing:
+    def ping(self):
+        return None
+
+
+class _FakeSiftClient:
+    def __init__(self):
+        self.test_results = _FakeTestResults(self)
+        self.ping = _FakePing()
+
+
+@pytest.fixture(scope="session")
+def sift_client():
+    return _FakeSiftClient()
+'''
+
+
+_RUN_ARGS = (
+    "--sift-test-results-log-file=false",
+    "--no-sift-test-results-git-metadata",
+)
+
+
+@pytest.fixture
+def inner(pytester):
+    """Reset the capture state and install the inner conftest. Returns ``pytester``."""
+    capture.reset()
+    pytester.makeconftest(_INNER_CONFTEST_SRC)
+    return pytester
+
+
+def _run(pytester, body: str) -> None:
+    pytester.makepyfile(textwrap.dedent(body))
+    pytester.runpytest_inprocess(*_RUN_ARGS)
+
+
+# ---------------------------------------------------------------------------
+# Call-phase exit paths
+# ---------------------------------------------------------------------------
+
+
+def test_pass_maps_to_passed(inner):
+    _run(
+        inner,
+        """
+        def test_x():
+            assert True
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.PASSED
+
+
+def test_assert_failure_maps_to_failed(inner):
+    _run(
+        inner,
+        """
+        def test_x():
+            assert 1 == 2
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_generic_exception_maps_to_error(inner):
+    _run(
+        inner,
+        """
+        def test_x():
+            raise ValueError("boom")
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.ERROR
+
+
+def test_system_exit_maps_to_error(inner):
+    _run(
+        inner,
+        """
+        import sys
+        def test_x():
+            sys.exit(1)
+        """,
+    )
+    # AUDIT: SystemExit is currently routed through the generic-exception
+    # path because it isn't an AssertionError. The audit may want a
+    # dedicated bucket (e.g., ABORTED) since the test didn't really "error"
+    # so much as exit deliberately.
+    assert capture.final_status("test_x") == TestStatus.ERROR
+
+
+def test_pytest_fail_maps_to_error(inner):
+    _run(
+        inner,
+        """
+        import pytest
+        def test_x():
+            pytest.fail("intentional failure")
+        """,
+    )
+    # AUDIT: target is FAILED. pytest.fail raises a Failed OutcomeException,
+    # which the plugin treats as a generic exception (not AssertionError)
+    # and routes to ERROR. Users expect pytest.fail and assert-fail to land
+    # in the same bucket.
+    assert capture.final_status("test_x") == TestStatus.ERROR
+
+
+def test_keyboard_interrupt_aborts_session(inner):
+    # KeyboardInterrupt propagates out of pytester's inprocess runner, so
+    # we catch it here. Pytest aborts the session before the call phase's
+    # makereport fires, so the plugin never sees the interrupt; the step
+    # fixture's teardown runs with no rep_call.excinfo and resolves the
+    # step to PASSED.
+    try:
+        _run(
+            inner,
+            """
+            def test_x():
+                raise KeyboardInterrupt
+            """,
+        )
+    except KeyboardInterrupt:
+        pass
+    # AUDIT: target is ABORTED (or similar). The step is recorded as PASSED
+    # despite the test having been aborted, which is misleading.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
+
+
+# ---------------------------------------------------------------------------
+# Skip paths
+# ---------------------------------------------------------------------------
+
+
+def test_pytest_skip_in_body_records_skipped_substep(inner):
+    _run(
+        inner,
+        """
+        import pytest
+        def test_x():
+            pytest.skip("not today")
+        """,
+    )
+    # The plugin's makereport hook creates a separate SKIPPED step nested
+    # under the autouse outer step. The outer step itself is driven by the
+    # call-phase excinfo (a Skipped exception), which the plugin treats as
+    # a generic exception -> ERROR.
+    skipped_steps = [
+        s
+        for s in capture.steps_by_name("test_x")
+        if s.statuses and s.statuses[-1] == TestStatus.SKIPPED
+    ]
+    assert skipped_steps, "expected at least one SKIPPED step for a skipped test"
+    # AUDIT: the outer step records ERROR for an in-body pytest.skip().
+    # Target: outer step should be SKIPPED; no nested duplicate.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.ERROR
+
+
+def test_pytest_mark_skip_records_skipped(inner):
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.skip(reason="collection-time skip")
+        def test_x():
+            assert False
+        """,
+    )
+    # Collection-time skip: the autouse step fixture never runs. Only the
+    # makereport hook creates a step, with status SKIPPED.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+def test_skip_inside_fixture_setup(inner):
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def skipping_fixture():
+            pytest.skip("environment not ready")
+
+        def test_x(skipping_fixture):
+            assert True
+        """,
+    )
+    # AUDIT: target is SKIPPED with phase=setup on the outer step. Today
+    # the autouse outer step lands in PASSED (its own setup ran, no failure
+    # was recorded), and a separate nested SKIPPED step is created by the
+    # makereport hook from the setup-phase skip report.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
+    nested_skipped = [
+        s
+        for s in capture.steps_by_name("test_x")
+        if s.parent_step_id is not None and s.statuses[-1] == TestStatus.SKIPPED
+    ]
+    assert nested_skipped, "fixture-skip currently produces a nested SKIPPED step"
+
+
+# ---------------------------------------------------------------------------
+# xfail / xpass
+# ---------------------------------------------------------------------------
+
+
+def test_xfail_marked_test_that_fails(inner):
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(reason="known issue")
+        def test_x():
+            assert 1 == 2
+        """,
+    )
+    # AUDIT: target is XFAILED (distinct from SKIPPED). Today, pytest reports
+    # outcome="skipped" for an xfailed test, so the makereport hook records
+    # a SKIPPED nested step. The outer autouse step records FAILED from the
+    # call-phase AssertionError.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+    skipped_substeps = [
+        s
+        for s in capture.steps_by_name("test_x")
+        if s.parent_step_id is not None and s.statuses[-1] == TestStatus.SKIPPED
+    ]
+    assert skipped_substeps, "xfailed test currently produces a nested SKIPPED step"
+
+
+def test_xfail_strict_unexpected_pass(inner):
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(strict=True, reason="should fail")
+        def test_x():
+            assert True
+        """,
+    )
+    # AUDIT: target is XPASSED. The test body raises no exception, so the
+    # plugin records PASSED and never sees pytest's later "strict xfail
+    # passed" failure attached to the report.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
+
+
+def test_xfail_non_strict_unexpected_pass(inner):
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(reason="might pass sometimes")
+        def test_x():
+            assert True
+        """,
+    )
+    # AUDIT: target is XPASSED. Non-strict xfail that passes is reported by
+    # pytest as outcome="passed" with wasxfail set; the plugin ignores
+    # wasxfail and records PASSED.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
+
+
+def test_xfail_raises_mismatch(inner):
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(raises=ValueError, reason="expected ValueError")
+        def test_x():
+            raise KeyError("wrong exception")
+        """,
+    )
+    # AUDIT: target is FAILED. Pytest treats a `raises=` mismatch as a real
+    # call-phase failure; the plugin sees a non-assertion exception in
+    # excinfo and routes it to ERROR.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.ERROR
+
+
+def test_xfail_run_false(inner):
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(run=False, reason="never run")
+        def test_x():
+            assert False
+        """,
+    )
+    # AUDIT: target is XFAILED. With run=False pytest reports the test as
+    # skipped/xfailed without executing it, so today only the makereport
+    # hook records a step, with status SKIPPED.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+# ---------------------------------------------------------------------------
+# Setup-phase / teardown-phase fixture failures
+# ---------------------------------------------------------------------------
+
+
+def test_setup_phase_fixture_failure(inner):
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_setup():
+            raise RuntimeError("setup boom")
+
+        def test_x(bad_setup):
+            assert True
+        """,
+    )
+    # AUDIT: target is ERROR with phase=setup. Today the plugin doesn't
+    # consult report.when, so the outer step (if it exists) lands in PASSED
+    # because the call phase never ran and no failure was recorded.
+    outer = capture.test_step("test_x")
+    if outer is not None:
+        assert outer.statuses[-1] == TestStatus.PASSED, (
+            f"setup-fail outer step status was {outer.statuses[-1]}; "
+            "audit target is ERROR with phase=setup"
+        )
+
+
+def test_teardown_phase_fixture_failure(inner):
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_teardown():
+            yield
+            raise RuntimeError("teardown boom")
+
+        def test_x(bad_teardown):
+            assert True
+        """,
+    )
+    # AUDIT: target is FAILED with phase=teardown. Today the outer autouse
+    # step closes BEFORE the failing fixture's teardown runs, so the test
+    # body's success is recorded as PASSED and the teardown error is
+    # invisible to the step.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED, (
+        f"teardown-fail outer step status was {outer.statuses[-1]}; "
+        "audit target is FAILED with phase=teardown"
+    )
+
+
+def test_call_fail_plus_teardown_fail(inner):
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_teardown():
+            yield
+            raise RuntimeError("teardown boom")
+
+        def test_x(bad_teardown):
+            assert 1 == 2
+        """,
+    )
+    # AUDIT: the call-phase failure dominates (status FAILED), and the
+    # teardown error is silently lost. Target: status should reflect both
+    # signals, e.g. FAILED with a teardown-phase annotation so the
+    # teardown error is not invisible.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+# ---------------------------------------------------------------------------
+# Collection-phase failures
+# ---------------------------------------------------------------------------
+
+
+def test_missing_fixture_records_passed_step(inner):
+    _run(
+        inner,
+        """
+        def test_x(nonexistent_fixture):
+            assert True
+        """,
+    )
+    # AUDIT: target is ERROR with phase=setup. Today the autouse `step`
+    # fixture's setup still runs (because it has no dependency on the
+    # missing fixture), creates an outer step, then the missing-fixture
+    # error aborts setup. The autouse step's teardown runs with no
+    # rep_call.excinfo and resolves to PASSED -- so the user sees a
+    # green step in Sift for a test that never executed.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
+
+
+# ---------------------------------------------------------------------------
+# Plugin-API exit paths (in-test mutations)
+# ---------------------------------------------------------------------------
+
+
+def test_manual_status_update_to_failed(inner):
+    _run(
+        inner,
+        """
+        from sift_client.sift_types.test_report import TestStatus
+        def test_x(step):
+            step.current_step.update({"status": TestStatus.FAILED})
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_report_outcome_false_maps_to_failed(inner):
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.report_outcome("the_check", False, "did not match")
+        """,
+    )
+    # Outer step sees a failed substep and rolls up to FAILED.
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_measure_out_of_bounds_maps_to_failed(inner):
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED

From 490fbf14957f472bbb3a2f30fe141c3c8dcb7913 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Fri, 15 May 2026 14:55:07 -0700
Subject: [PATCH 11/15] document pass/fail states and add tests to validate
 them

---
 python/docs/examples/pytest_plugin.md         | 700 +----------------
 .../pytest_plugin/pass_fail_behavior.md       | 140 ++++
 python/docs/guides/pytest_plugin/usage.md     | 702 ++++++++++++++++++
 .../_tests/util/step_status_states.md         | 113 +--
 .../_tests/util/test_step_status_states.py    | 246 +++---
 python/mkdocs.yml                             |  11 +-
 6 files changed, 1070 insertions(+), 842 deletions(-)
 create mode 100644 python/docs/guides/pytest_plugin/pass_fail_behavior.md
 create mode 100644 python/docs/guides/pytest_plugin/usage.md

diff --git a/python/docs/examples/pytest_plugin.md b/python/docs/examples/pytest_plugin.md
index cf56dd75e..d14662e97 100644
--- a/python/docs/examples/pytest_plugin.md
+++ b/python/docs/examples/pytest_plugin.md
@@ -1,695 +1,11 @@
-# Pytest Plugin
-
-The Sift Python client ships a pytest plugin that turns a pytest run into a
-`TestReport` in Sift. Each test function becomes a `TestStep`, measurements
-land as rows under that step, and failures propagate up through nested
-substeps to the report itself.
-
-This page walks through wiring the plugin into a project, the fixtures and
-hooks it provides, and the patterns you'll use day-to-day.
-
-!!! info "Where the plugin lives"
-    The plugin is part of `sift_client.util.test_results`. It is **not**
-    registered as a `pytest11` entry point. Projects opt in with a
-    `from sift_client.util.test_results import *` in their `conftest.py`.
-    That import is what wires up the fixtures, the CLI options, and the
-    `pytest_runtest_makereport` hook.
-
-## Install
-
-```bash
-pip install sift-stack-py pytest python-dotenv
-```
-
-Set the connection details in a `.env` next to your tests:
-
-```bash
-SIFT_API_KEY="your-api-key"
-SIFT_GRPC_URI="..."
-SIFT_REST_URI="..."
-```
-
-The `SIFT_GRPC_URI` and `SIFT_REST_URI` are the gRPC and REST endpoints for your Sift organization. You can find these on the Sift Manage page as well as generate an API key.
-
-## Wire the plugin into `conftest.py`
-
-Two things are required: a session-scoped `sift_client` fixture (the plugin's
-`report_context` fixture resolves it by name), and a star-import that registers
-the plugin's fixtures into the conftest's namespace.
-
-```python title="conftest.py"
-import os
-
-import pytest
-from dotenv import load_dotenv
-
-from sift_client import SiftClient, SiftConnectionConfig
-
-# Star-import wires fixtures + hooks + CLI options into pytest collection.
-from sift_client.util.test_results import *
-
-load_dotenv()
-
-
-@pytest.fixture(scope="session")
-def sift_client() -> SiftClient:
-    grpc_url = os.getenv("SIFT_GRPC_URI")
-    rest_url = os.getenv("SIFT_REST_URI")
-    api_key = os.getenv("SIFT_API_KEY")
-    
-    return SiftClient(
-        connection_config=SiftConnectionConfig(
-            api_key=api_key,
-            grpc_url=grpc_url,
-            rest_url=rest_url,
-        )
-    )
-```
-
-That's the whole setup. Every test in the session will now create a step on a
-single shared `TestReport`.
-
-## Plugin provided fixtures
-
-| Name | Kind | Scope | Purpose |
-|---|---|---|---|
-| `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
-| `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, and `current_step`. |
-| `module_substep` | fixture (autouse) | module | One step per test file with each function nested as a substep. |
-| `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted only when `--sift-test-results-check-connection` is set. |
-
-### CLI options
-
-| Flag | Default | Effect |
-|---|---|---|
-| `--sift-test-results-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. |
-| `--no-sift-test-results-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
-| `--sift-test-results-check-connection` | off | Make `report_context`, `step`, and `module_substep` no-op (yield `None`) when `client_has_connection` is `False`. Lets the same suite run locally without a Sift backend. |
-
-These can be set permanently in `pytest.ini`:
-
-```ini title="pytest.ini"
-[pytest]
-addopts = --sift-test-results-check-connection
-```
-
-!!! warning "FedRAMP / shared environments"
-    Pass `--sift-test-results-log-file=false` to skip the temp file + worker
-    pipeline. Create/update calls then run inline against the API instead of
-    being deferred through a subprocess.
-
-### Report metadata captured automatically
-
-Every report the plugin creates includes:
-
-- `name` and `test_case`: derived from the first positional argument to `pytest`. When it resolves to an existing path the plugin uses the basename for `name` and the full path string for `test_case`; otherwise both fall back to `pytest <args>`. `name` always has a UTC ISO timestamp appended. See examples below.
-- `test_system_name`: `socket.gethostname()`.
-- `system_operator`: `getpass.getuser()`.
-- `start_time` / `end_time`: set on session enter/exit.
-- `status`: starts at `IN_PROGRESS`, finalized to `PASSED` or `FAILED` on session exit (failure if any step failed or an exception escaped the session).
-- `metadata.git_repo`, `metadata.git_branch`, `metadata.git_commit`: captured via `git remote get-url origin` / `git rev-parse --abbrev-ref HEAD` / `git describe --always --dirty --exclude '*'`. Suppressed by `--no-sift-test-results-git-metadata` or when not in a git repo.
-
-Example invocations:
-
-| Pytest invocation | Report `name` | Report `test_case` |
-|---|---|---|
-| `pytest tests/test_battery.py` | `test_battery.py 2026-05-04T12:00:00.123456+00:00` | `tests/test_battery.py` |
-| `pytest tests/` | `tests 2026-05-04T12:00:00.123456+00:00` | `tests` |
-| `pytest -k voltage` | `pytest -k voltage 2026-05-04T12:00:00.123456+00:00` | `pytest -k voltage` |
-
-To override defaults (e.g. set a serial number, system operator, or extra
-metadata), call `report_context.report.update({...})` from any test or
-fixture. See [Linking a Run](#linking-a-run-to-the-report) for the same
-pattern applied to `run_id`.
-
-## Basic usage
-
-With the conftest in place, the simplest test needs nothing extra. The `step`
-fixture is `autouse=True` and pytest test failures and skips are mapped to
-step statuses automatically.
-
-```python title="test_basic.py"
-def test_no_fixtures_still_creates_a_step():
-    """Autouse `step` records this function as a step on the session report."""
-    assert 1 + 1 == 2
-
-
-def test_measure_a_single_value(step):
-    """Take `step` explicitly when you want to record a measurement."""
-    voltage = 4.97
-    passed = step.measure(
-        name="battery_voltage",
-        value=voltage,
-        bounds={"min": 4.8, "max": 5.2},
-        unit="V",
-    )
-    assert passed, f"voltage {voltage}V out of bounds"
-
-
-def test_measure_strings_and_booleans(step):
-    """`bounds` accepts a string or `True`/`False` for non-numeric values."""
-    step.measure(name="firmware_version", value="1.4.2", bounds="1.4.2")
-    step.measure(name="self_test_passed", value=True, bounds=True)
-
-
-def test_docstring_becomes_step_description(step):
-    """This docstring is the step's description in Sift.
-
-    The plugin pulls `request.node.obj.__doc__` when it creates the step.
-    Helper functions called from within the test do not get this treatment;
-    pass `description="..."` explicitly on `substep(...)` instead.
-    """
-    assert step.current_step.description is not None
-```
-
-!!! tip "Measurements never raise"
-    `step.measure(...)` returns `True` if the value is in bounds and `False`
-    otherwise. A `False` result marks the enclosing step as failed but does
-    not raise. Chain measurements freely and inspect the boolean if you need
-    custom flow control.
-
-### Status semantics for failures
-
-The plugin uses the step exit handler in `NewStep.__exit__` to translate test
-outcomes into `TestStatus`:
-
-| Outcome | Resulting `TestStatus` |
-|---|---|
-| In-bounds measurements only | `PASSED` |
-| Failed measurement, failed `report_outcome`, failed substep, or `AssertionError` raised by the test | `FAILED` (no traceback is attached, since pytest already prints it in the runner output) |
-| Non-`AssertionError` exception escapes the test (e.g. `ValueError`, `TimeoutError`) | `ERROR`, with the formatted traceback (last 10 frames plus the first frame) on `step.error_info.error_message` |
-| Manual `step.current_step.update({"status": ...})` | Whatever you set; the step exit handler honors a manually-resolved status |
-
-A failure or error at any depth propagates upward: the parent substep, the
-function step, the module step (if `module_substep` is active), and the
-session report all get marked failed.
-
-## Nested steps
-
-Use `step.substep(name=...)` to open a child step. Substeps nest arbitrarily
-deep, and a failure at any depth propagates up to fail the parent and the
-report.
-
-```python title="test_nested_steps.py"
-import time
-
-
-def test_phased_check(step):
-    """Phase a single test into setup/exercise/verify substeps."""
-    with step.substep(name="setup", description="Power on and wait for boot") as setup:
-        setup.measure(name="boot_time_s", value=2.1, bounds={"max": 5.0}, unit="s")
-
-    with step.substep(name="exercise", description="Drive the test sequence"):
-        time.sleep(0.01)
-
-    with step.substep(name="verify", description="Read final state") as verify:
-        verify.measure(name="final_state", value="IDLE", bounds="IDLE")
-
-
-def test_deeply_nested(step):
-    """A failure at the bottom fails everyone above it."""
-    with step.substep(name="level_1") as l1:
-        with l1.substep(name="level_2") as l2:
-            with l2.substep(name="level_3") as l3:
-                l3.measure(name="leaf_value", value=42, bounds={"min": 0, "max": 100})
-```
-
-Each step gets a hierarchical `step_path` (`1`, `1.1`, `1.1.2`, `2`, …)
-assigned by `ReportContext`. Sibling substeps within the same parent
-auto-increment; opening a new top-level step starts a new branch.
-
-### One step per file
-
-`module_substep` is autouse and module-scoped. When it's active (it's pulled
-in by the star-import in `conftest.py`), each file becomes a parent step and
-every function in it nests one level down. Its name is the test file's
-basename and its description is the module's docstring (if any).
-
-### Linking a Run to the report
-
-`report_context` is the session-scoped fixture; mutating it in one test
-affects the whole report.
-
-```python
-def test_link_run_to_report(report_context, sift_client):
-    run = sift_client.runs.create(...)  # however you create your run
-    report_context.report.update({"run_id": run.id_})
-```
-
-The same `update({...})` pattern works for any field on `TestReportUpdate`,
-including `serial_number`, `part_number`, `system_operator`, and `metadata`.
-
-## How pytest layout maps to a Sift report
-
-The plugin builds the report tree by hooking pytest's collection: every test
-node it sees becomes a step. What you control is which constructs create
-nodes and where you nest substeps inside them. Common layouts and the
-resulting report trees:
-
-### Flat module of test functions
-
-The default. Each function is one step directly under the report.
-
-```python title="test_battery.py"
-def test_voltage(step): ...
-def test_current(step): ...
-def test_temperature(step): ...
-```
-
-```text title="Sift report"
-TestReport
-├── test_voltage
-├── test_current
-└── test_temperature
-```
-
-### One step per file with `module_substep`
-
-`module_substep` is autouse and module-scoped. Every file becomes a parent
-step and every function in it nests one level down.
-
-```python title="test_battery.py"
-def test_voltage(step): ...
-def test_current(step): ...
-```
-
-```python title="test_thermal.py"
-def test_idle_temp(step): ...
-def test_load_temp(step): ...
-```
-
-```text title="Sift report"
-TestReport
-├── test_battery.py
-│   ├── test_voltage
-│   └── test_current
-└── test_thermal.py
-    ├── test_idle_temp
-    └── test_load_temp
-```
-
-### Test classes
-
-Pytest classes (`class TestFoo: ...`) do not create a parent step on their
-own. The plugin keys off the test node's `name`, which is just the method
-name. To group a class's methods under a class-level step, add a class-scoped
-fixture that opens a step with `report_context.new_step(...)`:
-
-```python title="test_charging.py"
-import pytest
-
-
-class TestCharging:
-    @pytest.fixture(scope="class", autouse=True)
-    def class_step(self, report_context):
-        with report_context.new_step(
-            name="TestCharging",
-            description="Charging subsystem",
-        ) as parent:
-            yield parent
-
-    def test_starts_at_zero(self, step): ...
-    def test_reaches_full(self, step): ...
-    def test_thermal_throttle(self, step): ...
-```
-
-```text title="Sift report"
-TestReport
-└── TestCharging
-    ├── test_starts_at_zero
-    ├── test_reaches_full
-    └── test_thermal_throttle
-```
-
-!!! note "Combining with `module_substep`"
-    `module_substep` and a class-scoped step both open at module/class scope,
-    so they each grab the next sibling slot under the report and the inner
-    one nests under the outer. If you want both layers (file → class →
-    method), make the class step itself open via the active outer step
-    rather than the report root.
-
-### Parametrized tests
-
-Each parametrize case is a distinct pytest node, so each gets its own step.
-The step name includes the parameter id pytest generates.
-
-```python
-@pytest.mark.parametrize("voltage", [3.3, 5.0, 12.0])
-def test_rail(step, voltage):
-    step.measure(name="rail_v", value=voltage, bounds={"min": 0.0})
-```
+---
+title: Pytest Plugin (moved)
+search:
+  exclude: true
+---
 
-```text title="Sift report"
-TestReport
-├── test_rail[3.3]
-├── test_rail[5.0]
-└── test_rail[12.0]
-```
+<meta http-equiv="refresh" content="0; url=../../guides/pytest_plugin/usage/">
 
-### Helper functions
-
-Helpers called from a test do not auto-create a step. The plugin only sees
-pytest-collected nodes. To represent helper work in the report, open a
-substep at the call site and pass it into the helper:
-
-```python
-def measure_rail(step, name, value, bounds):
-    return step.measure(name=name, value=value, bounds=bounds, unit="V")
-
-
-def test_power_rails(step):
-    with step.substep(name="3.3V rail") as rail_3v3:
-        measure_rail(rail_3v3, "rail_v", 3.31, {"min": 3.2, "max": 3.4})
-
-    with step.substep(name="5V rail") as rail_5v:
-        measure_rail(rail_5v, "rail_v", 5.02, {"min": 4.9, "max": 5.1})
-```
-
-```text title="Sift report"
-TestReport
-└── test_power_rails
-    ├── 3.3V rail
-    │   └── rail_v        (measurement)
-    └── 5V rail
-        └── rail_v        (measurement)
-```
-
-!!! tip "Docstring-as-description is top-level only"
-    The plugin reads the test function's docstring and uses it as the step
-    description. Docstrings on helper functions are not picked up. Pass
-    `description="..."` explicitly on `substep(...)` if you want one.
-
-### Fixtures that contribute steps
-
-A fixture can open its own substep around setup/teardown by using `step` (for
-function-scope) or `report_context.new_step(...)` (for any scope). The substep
-ends when the fixture's `yield` returns, which makes the report tree mirror
-the lifecycle.
-
-```python
-@pytest.fixture
-def warmed_up_dut(step):
-    with step.substep(name="warmup", description="Bring DUT to operating temp"):
-        # ... do warmup work ...
-        yield "dut-handle"
-
-
-def test_steady_state(step, warmed_up_dut):
-    step.measure(name="temp_c", value=37.2, bounds={"min": 35.0, "max": 40.0})
-```
-
-```text title="Sift report"
-TestReport
-└── test_steady_state
-    ├── warmup        (from fixture)
-    └── temp_c        (measurement)
-```
-
-## Measurement variants
-
-`step.measure(...)` records exactly one measurement. For datasets coming off a
-sensor or calculated channel, use one of the bulk variants.
-
-### `measure_avg`: one row, the mean
-
-`measure_avg` accepts a Python list, a NumPy array, or a pandas `Series`,
-takes the mean, and evaluates it against bounds.
-
-```python
-import numpy as np
-import pandas as pd
-
-
-def test_avg_with_list(step):
-    samples = [4.97, 5.01, 5.03, 4.99, 5.02]
-    step.measure_avg(
-        name="bus_voltage_avg",
-        values=samples,
-        bounds={"min": 4.9, "max": 5.1},
-        unit="V",
-    )
-
-
-def test_avg_with_numpy(step):
-    samples = np.linspace(99.5, 100.5, num=50)
-    step.measure_avg(
-        name="cpu_temp_avg",
-        values=samples,
-        bounds={"min": 95.0, "max": 105.0},
-        unit="C",
-    )
-
-
-def test_avg_with_pandas(step):
-    series = pd.Series([0.998, 1.001, 0.999, 1.002, 1.000])
-    step.measure_avg(
-        name="reference_clock_ratio",
-        values=series,
-        bounds={"min": 0.99, "max": 1.01},
-    )
-```
-
-### `measure_all`: only out-of-bounds rows
-
-Records measurements only for samples that fail bounds, so an all-pass
-dataset of N samples doesn't add N rows to the report. Returns `True` when
-every sample is in bounds.
-
-```python
-def test_only_outliers_recorded(step):
-    samples = [10.1, 10.2, 10.3, 99.9, 10.0, 10.1]  # 99.9 is the outlier
-    all_in_bounds = step.measure_all(
-        name="pressure_psi",
-        values=samples,
-        bounds={"min": 9.0, "max": 11.0},
-        unit="psi",
-    )
-    # Returns False because 99.9 is out of bounds. The step is already
-    # marked failed; raise here only if you also want pytest to fail.
-    assert all_in_bounds
-```
-
-!!! note "`measure_all` requires at least one bound"
-    Passing `bounds={}` raises `ValueError("No bounds provided")`. At
-    least one of `min` or `max` must be set.
-
-### `report_outcome`: externally computed pass/fail
-
-When the decision is computed elsewhere, drop it onto the report as a
-named substep with an optional reason. Returns the result you passed in,
-so you can use it inline.
-
-```python
-def test_external_checks(step):
-    step.report_outcome(
-        name="config_loaded",
-        result=True,
-        reason="loaded /etc/dut/config.yaml",
-    )
-
-    # Failures show up as a failed substep without raising.
-    rare_warning_seen = False
-    step.report_outcome(
-        name="no_rare_warning",
-        result=not rare_warning_seen,
-        reason="grep'd dmesg for the known-flaky warning",
-    )
-```
-
-### Bounds reference
-
-| Pass to `bounds=` | Value type | Effect |
-|---|---|---|
-| `{"min": x, "max": y}` (either key optional) | `int` / `float` | Numeric window. One-sided is fine. |
-| `NumericBounds(min=x, max=y)` | `int` / `float` | Same as the dict form, explicit. |
-| `"expected-string"` | `str` (or `bool`) | Exact equality. For `bool` values, compares lowercased string (`"true"`/`"false"`). |
-| `True` or `False` | `bool` (or `str`) | Exact equality. For `str` values, compares lowercased strings. |
-| `None` | any | Records the value but does not evaluate it; measurement is recorded as `passed=True`. |
-
-The `unit` argument is a free-form string label (e.g. `"V"`, `"C"`, `"psi"`).
-
-## Skip handling
-
-- `@pytest.mark.skip` and `@pytest.mark.skipif`: the plugin's
-  `pytest_runtest_makereport` hook sees the skipped outcome and creates a
-  step with `TestStatus.SKIPPED`.
-- Inside a test function, you can mark just one substep as skipped without
-  aborting the whole test:
-
-  ```python
-  from sift_client.sift_types.test_report import TestStatus
-
-
-  def test_runtime_skip(step):
-      with step.substep(name="optional_calibration") as cal:
-          if not precondition_met():
-              cal.current_step.update({"status": TestStatus.SKIPPED})
-  ```
-
-  A manually-resolved status is honored by the step's exit handler. No
-  further bookkeeping required. `SKIPPED` does not propagate as a failure.
-
-## Running the suite
-
-```bash
-# Full run against your Sift tenant
-pytest
-
-# Pin the log file so you can replay it later if the import worker dies
-pytest --sift-test-results-log-file=./sift-results.jsonl
-```
-
-See [Running offline](#running-offline) for the same suite running with or
-without a reachable Sift server.
-
-## Running offline
-
-The plugin supports two offline workflows, depending on whether you want a
-Sift report at all when the test environment can't reach Sift. The first
-turns the plugin into a no-op when the server is unreachable. The second
-keeps the plugin running normally and writes every create/update to a local
-JSONL file that you upload from a connected machine afterward.
-
-| Pattern | Flag | Runtime behavior | Follow-up |
-|---|---|---|---|
-| Skip when offline | `--sift-test-results-check-connection` | Fixtures yield `None`, no log file, no report. Pytest still reports pass/fail. | None. |
-| Capture locally, upload later | `--sift-test-results-log-file=<path>` | Plugin writes every create/update to the JSONL file. | `import-test-result-log <path>` from a connected machine. |
-
-Pattern 1 suits laptop dev and CI without Sift secrets. Pattern 2 suits
-field tests, vehicles on remote sites, and air-gapped labs.
-
-### Pattern 1: skip when offline
-
-`--sift-test-results-check-connection` makes the plugin ping Sift once at
-session start through the `client_has_connection` fixture (which by default
-calls `sift_client.ping.ping()`). On a failed ping, `report_context`,
-`step`, and `module_substep` yield `None` for the rest of the session.
-Pytest still runs the tests and still reports pass/fail.
-
-```bash
-pytest --sift-test-results-check-connection
-```
-
-```ini title="pytest.ini"
-[pytest]
-addopts = --sift-test-results-check-connection
-```
-
-#### Handling `None` in tests
-
-Calls on `step` raise `AttributeError` when it's `None`, so tests that take
-`step` as a parameter need a guard. The cleanest fix is to shadow the
-plugin's `step` fixture in your conftest and turn the `None` case into an
-automatic skip.
-
-```python title="conftest.py"
-import pytest
-
-from sift_client.util.test_results import *
-
-
-@pytest.fixture(autouse=True)
-def step(step):
-    if step is None:
-        pytest.skip("Sift unavailable")
-    yield step
-```
-
-The `step` parameter on the override resolves to the plugin's fixture, not
-to the override itself. `autouse=True` is required so the skip applies to
-tests that don't request `step` directly. The same shadowing trick works
-for `module_substep` and `report_context`.
-
-For one-off tests that don't share a conftest, an inline guard works just
-as well:
-
-```python
-def test_battery_voltage(step):
-    if step is None:
-        pytest.skip("Sift unavailable")
-    step.measure(name="battery_voltage", value=4.97, bounds={"min": 4.8, "max": 5.2})
-```
-
-If you'd rather have tests pass through silently than skip them, wrap the
-calls in a helper that no-ops on `None`:
-
-```python
-def safe_measure(step, **kwargs):
-    if step is None:
-        return True
-    return step.measure(**kwargs)
-```
-
-#### Overriding the connection check
-
-The default `client_has_connection` fixture calls `sift_client.ping.ping()`.
-Override it in your conftest if pinging is the wrong signal for your
-environment, for example a token cache that's only warm when authenticated:
-
-```python title="conftest.py"
-from pathlib import Path
-
-import pytest
-
-
-@pytest.fixture(scope="session")
-def client_has_connection(sift_client) -> bool:
-    return Path("~/.sift-token-cache").expanduser().is_file()
-```
-
-The plugin only consults this fixture when `--sift-test-results-check-connection`
-is set, so an unused override has no effect on a normal run.
-
-### Pattern 2: capture locally, upload later
-
-This pattern keeps the plugin running normally even when Sift is
-unreachable. The plugin writes to the log file, the worker dies on connect,
-and the file is left on disk for you to upload later. Pin the log file path
-so you can find it afterward, and don't pass
-`--sift-test-results-check-connection`, which would suppress the logging
-this pattern relies on.
-
-```bash
-pytest --sift-test-results-log-file=./run.jsonl
-```
-
-What happens during the run:
-
-- Every report, step, and measurement create/update is written to
-  `run.jsonl`. The plugin doesn't contact the Sift API for any of these
-  calls; they return simulated responses keyed by UUIDs that the replay
-  later maps to real IDs.
-- The `import-test-result-log --incremental` worker subprocess starts and
-  exits early when it can't reach Sift. The session does not fail when the
-  worker exits before the run ends.
-- Tests run against a real `step` fixture, so `step.measure(...)`,
-  substeps, parametrize, fixtures, and `module_substep` behave exactly as
-  they do online. No conftest changes are needed.
-
-Once you have connectivity, replay the file:
-
-```bash
-import-test-result-log ./run.jsonl
-```
-
-The replay creates the report, steps, and measurements against Sift in one
-batch. See [Replaying a saved log file](#replaying-a-saved-log-file) for
-details on cleanup and the incremental flag.
-
-!!! warning "Pin the log path for Pattern 2"
-    Without `--sift-test-results-log-file=<path>`, the plugin writes to a
-    `tempfile.NamedTemporaryFile` and only surfaces the path via a
-    `logger.info` line. Always pin a known path when you intend to replay
-    the file later.
-
-## Replaying a saved log file
-
-When the worker doesn't finish cleanly the plugin will print a hint mentioning
-`import-test-result-log`. To import:
-
-```bash
-import-test-result-log <path-to-log.jsonl>
-```
+# Pytest Plugin
 
-That replays the saved JSONL log as a single batch (no `--incremental`) and
-deletes the file when it lives under the system temp dir.
\ No newline at end of file
+This page has moved. See [Guides → Pytest Plugin → Usage](../guides/pytest_plugin/usage.md).
diff --git a/python/docs/guides/pytest_plugin/pass_fail_behavior.md b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
new file mode 100644
index 000000000..c29116366
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
@@ -0,0 +1,140 @@
+# Pass/Fail Behavior
+
+How pytest outcomes, measurement results, and manual status updates map to
+step and report statuses, and how failures propagate up the tree.
+
+See [Usage](usage.md) for the broader plugin reference.
+
+## Step status from test outcome
+
+When a test function returns (or raises), the plugin's step exit handler in
+`NewStep.__exit__` resolves the step's `TestStatus`:
+
+| Outcome inside the test | Resulting `TestStatus` | Notes |
+|---|---|---|
+| All measurements in-bounds, no raised exception | `PASSED` | |
+| A `measure*` call returned `False`, a `report_outcome` returned `False`, a substep failed, the test raised `AssertionError`, or the test called `pytest.fail(...)` | `FAILED` | No traceback is attached — pytest already prints one in the runner output. |
+| Test raised any other exception (e.g. `ValueError`, `TimeoutError`) | `ERROR` | The formatted traceback (last 10 frames plus the first frame) is attached to `step.error_info.error_message`. |
+| Test was skipped via `@pytest.mark.skip` / `skipif` / `pytest.skip(...)` — whether at collection time, from the test body, or from a fixture | `SKIPPED` | |
+| Code in the test called `step.current_step.update({"status": ...})` | Whatever you set | Manual status is honored over the inferred one. |
+
+See [xfail / xpass](#xfail--xpass), [Setup and teardown phases](#setup-and-teardown-phases),
+and [Hard exits](#hard-exits) below for cases the table above does not cover.
+
+## Measurement results never raise
+
+`step.measure(...)`, `step.measure_avg(...)`, `step.measure_all(...)`, and
+`step.report_outcome(...)` all return a boolean and mark the enclosing step
+`FAILED` on `False`. None of them raise. Chain them freely:
+
+```python
+def test_power_on(step):
+    voltage_ok = step.measure(name="rail_v", value=4.97, bounds={"min": 4.8, "max": 5.2})
+    config_ok = step.report_outcome(name="config_loaded", result=True)
+    # The step is already FAILED if either was False. Assert here only if you
+    # also want pytest to fail the test.
+    assert voltage_ok and config_ok
+```
+
+This decouples "the step failed in Sift" from "the test failed in pytest."
+Skip the `assert` and the test passes in pytest while the step is still
+recorded as failed in Sift — useful for keeping a suite running through
+known soft failures.
+
+## Failure propagation
+
+A failed step propagates upward through every parent that wraps it:
+
+```text
+substep (level_3)  ──FAILED──┐
+substep (level_2)         ◄──┘──FAILED──┐
+substep (level_1)                    ◄──┘──FAILED──┐
+function step                                   ◄──┘──FAILED──┐
+module step (if module_substep is active)                  ◄──┘──FAILED──┐
+TestReport                                                            ◄──┘
+```
+
+When a leaf is marked `FAILED`, every parent up to the `TestReport` inherits
+`FAILED` at session exit. `ERROR` propagates the same way; the report
+finalizes to `FAILED` (not `ERROR`) if any step errored or any exception
+escaped the session.
+
+`SKIPPED` does **not** propagate as a failure. Marking a substep `SKIPPED`
+leaves its parent's resolved status unchanged.
+
+## xfail / xpass
+
+The plugin maps pytest's `xfail` outcomes onto existing statuses:
+
+| Scenario | `TestStatus` |
+|---|---|
+| `xfail` test fails as expected | `PASSED` |
+| `xfail(strict=False)` test passes (xpass) | `PASSED` |
+| `xfail(strict=True)` test passes (xpass) | `FAILED` |
+| `xfail(raises=X)` raises a different exception | `FAILED` |
+| `xfail(run=False)` (test is never executed) | `SKIPPED` |
+
+The strict-xpass and `raises=` mismatch cases route to `FAILED` because both
+signal that the `xfail` mark no longer matches reality: either the bug was
+fixed (and the mark should be removed) or the failure mode shifted (and the
+mark needs updating). Conflating them with `PASSED` would hide the signal.
+
+## Setup and teardown phases
+
+A pytest test runs in three phases: `setup` (fixtures initialize up to
+`yield`), `call` (the test body), and `teardown` (fixtures run cleanup after
+`yield`). The plugin records the failing phase's outcome on the outer step:
+
+| Phase that failed | Step status |
+|---|---|
+| Setup — fixture raised before `yield`, or a required fixture was missing | `ERROR` |
+| Call — test body raised or a measurement failed | `FAILED` or `ERROR` per the table at the top of this page |
+| Teardown — fixture raised after `yield` | `FAILED` |
+
+When the call phase and the teardown phase both fail, the call-phase outcome
+determines the step's status and the teardown error is surfaced alongside it
+rather than being silently discarded.
+
+## Hard exits
+
+| Trigger in the test body | `TestStatus` |
+|---|---|
+| `pytest.fail("...")` | `FAILED` |
+| `pytest.skip("...")` | `SKIPPED` |
+| `SystemExit` | `ERROR` |
+| `KeyboardInterrupt` | `ERROR` when the plugin sees the interrupt; a session-aborting `Ctrl-C` may bypass the plugin and leave the step in `IN_PROGRESS`. |
+
+A dedicated `ABORTED` status for `SystemExit` and `KeyboardInterrupt` is
+planned; today both map to `ERROR`.
+
+## Manually skipping a substep
+
+Inside a test, mark a single substep skipped without aborting the test or
+failing its parent:
+
+```python
+from sift_client.sift_types.test_report import TestStatus
+
+
+def test_runtime_skip(step):
+    with step.substep(name="optional_calibration") as cal:
+        if not precondition_met():
+            cal.current_step.update(
+                {"status": TestStatus.SKIPPED},
+                log_file=step.report_context.log_file,
+            )
+```
+
+The exit handler honors the manually-resolved `SKIPPED`. The enclosing
+function step still resolves on its own measurements and remaining substeps.
+
+## Report status
+
+The session-scoped `TestReport`:
+
+- Starts at `IN_PROGRESS` on session enter.
+- Finalizes on session exit to:
+    - `FAILED` if any step finalized as `FAILED` or `ERROR`, or any exception escaped the session.
+    - `PASSED` otherwise.
+
+A run with only `PASSED` and `SKIPPED` steps resolves to `PASSED`.
diff --git a/python/docs/guides/pytest_plugin/usage.md b/python/docs/guides/pytest_plugin/usage.md
new file mode 100644
index 000000000..7275d8b58
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/usage.md
@@ -0,0 +1,702 @@
+# Pytest Plugin
+
+The Sift Python client ships a pytest plugin that turns a pytest run into a
+`TestReport` in Sift. Each test function becomes a `TestStep`, measurements
+land as rows under that step, and failures propagate up through nested
+substeps to the report itself.
+
+This page walks through wiring the plugin into a project, the fixtures and
+hooks it provides, and the patterns you'll use day-to-day.
+
+!!! info "Where the plugin lives"
+    The plugin is part of `sift_client.util.test_results`. It is **not**
+    registered as a `pytest11` entry point. Projects opt in with a
+    `from sift_client.util.test_results import *` in their `conftest.py`.
+    That import is what wires up the fixtures, the CLI options, and the
+    `pytest_runtest_makereport` hook.
+
+## Install
+
+```bash
+pip install sift-stack-py pytest python-dotenv
+```
+
+Set the connection details in a `.env` next to your tests:
+
+```bash
+SIFT_API_KEY="your-api-key"
+SIFT_GRPC_URI="..."
+SIFT_REST_URI="..."
+```
+
+The `SIFT_GRPC_URI` and `SIFT_REST_URI` are the gRPC and REST endpoints for your Sift organization. You can find these on the Sift Manage page as well as generate an API key.
+
+## Wire the plugin into `conftest.py`
+
+Two things are required: a session-scoped `sift_client` fixture (the plugin's
+`report_context` fixture resolves it by name), and a star-import that registers
+the plugin's fixtures into the conftest's namespace.
+
+```python title="conftest.py"
+import os
+
+import pytest
+from dotenv import load_dotenv
+
+from sift_client import SiftClient, SiftConnectionConfig
+
+# Star-import wires fixtures + hooks + CLI options into pytest collection.
+from sift_client.util.test_results import *
+
+load_dotenv()
+
+
+@pytest.fixture(scope="session")
+def sift_client() -> SiftClient:
+    grpc_url = os.getenv("SIFT_GRPC_URI")
+    rest_url = os.getenv("SIFT_REST_URI")
+    api_key = os.getenv("SIFT_API_KEY")
+    
+    return SiftClient(
+        connection_config=SiftConnectionConfig(
+            api_key=api_key,
+            grpc_url=grpc_url,
+            rest_url=rest_url,
+        )
+    )
+```
+
+That's the whole setup. Every test in the session will now create a step on a
+single shared `TestReport`.
+
+## Plugin provided fixtures
+
+| Name | Kind | Scope | Purpose |
+|---|---|---|---|
+| `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
+| `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, and `current_step`. |
+| `module_substep` | fixture (autouse) | module | One step per test file with each function nested as a substep. |
+| `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted only when `--sift-test-results-check-connection` is set. |
+
+### CLI options
+
+| Flag | Default | Effect |
+|---|---|---|
+| `--sift-test-results-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. |
+| `--no-sift-test-results-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
+| `--sift-test-results-check-connection` | off | Make `report_context`, `step`, and `module_substep` no-op (yield `None`) when `client_has_connection` is `False`. Lets the same suite run locally without a Sift backend. |
+
+These can be set permanently in `pytest.ini`:
+
+```ini title="pytest.ini"
+[pytest]
+addopts = --sift-test-results-check-connection
+```
+
+!!! warning "FedRAMP / shared environments"
+    Pass `--sift-test-results-log-file=false` to skip the temp file + worker
+    pipeline. Create/update calls then run inline against the API instead of
+    being deferred through a subprocess.
+
+### Report metadata captured automatically
+
+Every report the plugin creates includes:
+
+- `name` and `test_case`: derived from the first positional argument to `pytest`. When it resolves to an existing path the plugin uses the basename for `name` and the full path string for `test_case`; otherwise both fall back to `pytest <args>`. `name` always has a UTC ISO timestamp appended. See examples below.
+- `test_system_name`: `socket.gethostname()`.
+- `system_operator`: `getpass.getuser()`.
+- `start_time` / `end_time`: set on session enter/exit.
+- `status`: starts at `IN_PROGRESS`, finalized to `PASSED` or `FAILED` on session exit (failure if any step failed or an exception escaped the session).
+- `metadata.git_repo`, `metadata.git_branch`, `metadata.git_commit`: captured via `git remote get-url origin` / `git rev-parse --abbrev-ref HEAD` / `git describe --always --dirty --exclude '*'`. Suppressed by `--no-sift-test-results-git-metadata` or when not in a git repo.
+
+Example invocations:
+
+| Pytest invocation | Report `name` | Report `test_case` |
+|---|---|---|
+| `pytest tests/test_battery.py` | `test_battery.py 2026-05-04T12:00:00.123456+00:00` | `tests/test_battery.py` |
+| `pytest tests/` | `tests 2026-05-04T12:00:00.123456+00:00` | `tests` |
+| `pytest -k voltage` | `pytest -k voltage 2026-05-04T12:00:00.123456+00:00` | `pytest -k voltage` |
+
+To override defaults (e.g. set a serial number, system operator, or extra
+metadata), call `report_context.report.update({...})` from any test or
+fixture. See [Linking a Run](#linking-a-run-to-the-report) for the same
+pattern applied to `run_id`.
+
+## Basic usage
+
+With the conftest in place, the simplest test needs nothing extra. The `step`
+fixture is `autouse=True` and pytest test failures and skips are mapped to
+step statuses automatically.
+
+```python title="test_basic.py"
+def test_no_fixtures_still_creates_a_step():
+    """Autouse `step` records this function as a step on the session report."""
+    assert 1 + 1 == 2
+
+
+def test_measure_a_single_value(step):
+    """Take `step` explicitly when you want to record a measurement."""
+    voltage = 4.97
+    passed = step.measure(
+        name="battery_voltage",
+        value=voltage,
+        bounds={"min": 4.8, "max": 5.2},
+        unit="V",
+    )
+    assert passed, f"voltage {voltage}V out of bounds"
+
+
+def test_measure_strings_and_booleans(step):
+    """`bounds` accepts a string or `True`/`False` for non-numeric values."""
+    step.measure(name="firmware_version", value="1.4.2", bounds="1.4.2")
+    step.measure(name="self_test_passed", value=True, bounds=True)
+
+
+def test_docstring_becomes_step_description(step):
+    """This docstring is the step's description in Sift.
+
+    The plugin pulls `request.node.obj.__doc__` when it creates the step.
+    Helper functions called from within the test do not get this treatment;
+    pass `description="..."` explicitly on `substep(...)` instead.
+    """
+    assert step.current_step.description is not None
+```
+
+!!! tip "Measurements never raise"
+    `step.measure(...)` returns `True` if the value is in bounds and `False`
+    otherwise. A `False` result marks the enclosing step as failed but does
+    not raise. Chain measurements freely and inspect the boolean if you need
+    custom flow control.
+
+### Status semantics for failures
+
+The plugin uses the step exit handler in `NewStep.__exit__` to translate test
+outcomes into `TestStatus`:
+
+| Outcome | Resulting `TestStatus` |
+|---|---|
+| In-bounds measurements only | `PASSED` |
+| Failed measurement, failed `report_outcome`, failed substep, or `AssertionError` raised by the test | `FAILED` (no traceback is attached, since pytest already prints it in the runner output) |
+| Non-`AssertionError` exception escapes the test (e.g. `ValueError`, `TimeoutError`) | `ERROR`, with the formatted traceback (last 10 frames plus the first frame) on `step.error_info.error_message` |
+| Manual `step.current_step.update({"status": ...})` | Whatever you set; the step exit handler honors a manually-resolved status |
+
+A failure or error at any depth propagates upward: the parent substep, the
+function step, the module step (if `module_substep` is active), and the
+session report all get marked failed.
+
+See [Pass/Fail Behavior](pass_fail_behavior.md) for the full mapping from
+test outcomes to step statuses, propagation rules, and how to manually
+override a step's status.
+
+## Nested steps
+
+Use `step.substep(name=...)` to open a child step. Substeps nest arbitrarily
+deep, and a failure at any depth propagates up to fail the parent and the
+report.
+
+```python title="test_nested_steps.py"
+import time
+
+
+def test_phased_check(step):
+    """Phase a single test into setup/exercise/verify substeps."""
+    with step.substep(name="setup", description="Power on and wait for boot") as setup:
+        setup.measure(name="boot_time_s", value=2.1, bounds={"max": 5.0}, unit="s")
+
+    with step.substep(name="exercise", description="Drive the test sequence"):
+        time.sleep(0.01)
+
+    with step.substep(name="verify", description="Read final state") as verify:
+        verify.measure(name="final_state", value="IDLE", bounds="IDLE")
+
+
+def test_deeply_nested(step):
+    """A failure at the bottom fails everyone above it."""
+    with step.substep(name="level_1") as l1:
+        with l1.substep(name="level_2") as l2:
+            with l2.substep(name="level_3") as l3:
+                l3.measure(name="leaf_value", value=42, bounds={"min": 0, "max": 100})
+```
+
+Each step gets a hierarchical `step_path` (`1`, `1.1`, `1.1.2`, `2`, …)
+assigned by `ReportContext`. Sibling substeps within the same parent
+auto-increment; opening a new top-level step starts a new branch.
+
+### One step per file
+
+`module_substep` is autouse and module-scoped. When it's active (it's pulled
+in by the star-import in `conftest.py`), each file becomes a parent step and
+every function in it nests one level down. Its name is the test file's
+basename and its description is the module's docstring (if any).
+
+### Linking a Run to the report
+
+`report_context` is the session-scoped fixture; mutating it in one test
+affects the whole report.
+
+```python
+def test_link_run_to_report(report_context, sift_client):
+    run = sift_client.runs.create(...)  # however you create your run
+    report_context.report.update({"run_id": run.id_})
+```
+
+The same `update({...})` pattern works for any field on `TestReportUpdate`,
+including `serial_number`, `part_number`, `system_operator`, and `metadata`.
+
+## How pytest layout maps to a Sift report
+
+The plugin builds the report tree by hooking pytest's collection: every test
+node it sees becomes a step. What you control is which constructs create
+nodes and where you nest substeps inside them. Common layouts and the
+resulting report trees:
+
+### Flat module of test functions
+
+The default. Each function is one step directly under the report.
+
+```python title="test_battery.py"
+def test_voltage(step): ...
+def test_current(step): ...
+def test_temperature(step): ...
+```
+
+```text title="Sift report"
+TestReport
+├── test_voltage
+├── test_current
+└── test_temperature
+```
+
+### One step per file with `module_substep`
+
+`module_substep` is autouse and module-scoped. Every file becomes a parent
+step and every function in it nests one level down.
+
+```python title="test_battery.py"
+def test_voltage(step): ...
+def test_current(step): ...
+```
+
+```python title="test_thermal.py"
+def test_idle_temp(step): ...
+def test_load_temp(step): ...
+```
+
+```text title="Sift report"
+TestReport
+├── test_battery.py
+│   ├── test_voltage
+│   └── test_current
+└── test_thermal.py
+    ├── test_idle_temp
+    └── test_load_temp
+```
+
+### Test classes
+
+Pytest classes (`class TestFoo: ...`) do not create a parent step on their
+own. The plugin keys off the test node's `name`, which is just the method
+name. To group a class's methods under a class-level step, add a class-scoped
+fixture that opens a step with `report_context.new_step(...)`:
+
+```python title="test_charging.py"
+import pytest
+
+
+class TestCharging:
+    @pytest.fixture(scope="class", autouse=True)
+    def class_step(self, report_context):
+        with report_context.new_step(
+            name="TestCharging",
+            description="Charging subsystem",
+        ) as parent:
+            yield parent
+
+    def test_starts_at_zero(self, step): ...
+    def test_reaches_full(self, step): ...
+    def test_thermal_throttle(self, step): ...
+```
+
+```text title="Sift report"
+TestReport
+└── TestCharging
+    ├── test_starts_at_zero
+    ├── test_reaches_full
+    └── test_thermal_throttle
+```
+
+!!! note "Combining with `module_substep`"
+    `module_substep` and a class-scoped step both open at module/class scope,
+    so they each grab the next sibling slot under the report and the inner
+    one nests under the outer. If you want both layers (file → class →
+    method), make the class step itself open via the active outer step
+    rather than the report root.
+
+### Parametrized tests
+
+Each parametrize case is a distinct pytest node, so each gets its own step.
+The step name includes the parameter id pytest generates.
+
+```python
+@pytest.mark.parametrize("voltage", [3.3, 5.0, 12.0])
+def test_rail(step, voltage):
+    step.measure(name="rail_v", value=voltage, bounds={"min": 0.0})
+```
+
+```text title="Sift report"
+TestReport
+├── test_rail[3.3]
+├── test_rail[5.0]
+└── test_rail[12.0]
+```
+
+### Helper functions
+
+Helpers called from a test do not auto-create a step. The plugin only sees
+pytest-collected nodes. To represent helper work in the report, open a
+substep at the call site and pass it into the helper:
+
+```python
+def measure_rail(step, name, value, bounds):
+    return step.measure(name=name, value=value, bounds=bounds, unit="V")
+
+
+def test_power_rails(step):
+    with step.substep(name="3.3V rail") as rail_3v3:
+        measure_rail(rail_3v3, "rail_v", 3.31, {"min": 3.2, "max": 3.4})
+
+    with step.substep(name="5V rail") as rail_5v:
+        measure_rail(rail_5v, "rail_v", 5.02, {"min": 4.9, "max": 5.1})
+```
+
+```text title="Sift report"
+TestReport
+└── test_power_rails
+    ├── 3.3V rail
+    │   └── rail_v        (measurement)
+    └── 5V rail
+        └── rail_v        (measurement)
+```
+
+!!! tip "Docstring-as-description is top-level only"
+    The plugin reads the test function's docstring and uses it as the step
+    description. Docstrings on helper functions are not picked up. Pass
+    `description="..."` explicitly on `substep(...)` if you want one.
+
+### Fixtures that contribute steps
+
+A fixture can open its own substep around setup/teardown by using `step` (for
+function-scope) or `report_context.new_step(...)` (for any scope). The substep
+ends when the fixture's `yield` returns, which makes the report tree mirror
+the lifecycle.
+
+```python
+@pytest.fixture
+def warmed_up_dut(step):
+    with step.substep(name="warmup", description="Bring DUT to operating temp"):
+        # ... do warmup work ...
+        yield "dut-handle"
+
+
+def test_steady_state(step, warmed_up_dut):
+    step.measure(name="temp_c", value=37.2, bounds={"min": 35.0, "max": 40.0})
+```
+
+```text title="Sift report"
+TestReport
+└── test_steady_state
+    ├── warmup        (from fixture)
+    └── temp_c        (measurement)
+```
+
+## Measurement variants
+
+`step.measure(...)` records exactly one measurement. For datasets coming off a
+sensor or calculated channel, use one of the bulk variants.
+
+### `measure_avg`: one row, the mean
+
+`measure_avg` accepts a Python list, a NumPy array, or a pandas `Series`,
+takes the mean, and evaluates it against bounds.
+
+```python
+import numpy as np
+import pandas as pd
+
+
+def test_avg_with_list(step):
+    samples = [4.97, 5.01, 5.03, 4.99, 5.02]
+    step.measure_avg(
+        name="bus_voltage_avg",
+        values=samples,
+        bounds={"min": 4.9, "max": 5.1},
+        unit="V",
+    )
+
+
+def test_avg_with_numpy(step):
+    samples = np.linspace(99.5, 100.5, num=50)
+    step.measure_avg(
+        name="cpu_temp_avg",
+        values=samples,
+        bounds={"min": 95.0, "max": 105.0},
+        unit="C",
+    )
+
+
+def test_avg_with_pandas(step):
+    series = pd.Series([0.998, 1.001, 0.999, 1.002, 1.000])
+    step.measure_avg(
+        name="reference_clock_ratio",
+        values=series,
+        bounds={"min": 0.99, "max": 1.01},
+    )
+```
+
+### `measure_all`: only out-of-bounds rows
+
+Records measurements only for samples that fail bounds, so an all-pass
+dataset of N samples doesn't add N rows to the report. Returns `True` when
+every sample is in bounds.
+
+```python
+def test_only_outliers_recorded(step):
+    samples = [10.1, 10.2, 10.3, 99.9, 10.0, 10.1]  # 99.9 is the outlier
+    all_in_bounds = step.measure_all(
+        name="pressure_psi",
+        values=samples,
+        bounds={"min": 9.0, "max": 11.0},
+        unit="psi",
+    )
+    # Returns False because 99.9 is out of bounds. The step is already
+    # marked failed; raise here only if you also want pytest to fail.
+    assert all_in_bounds
+```
+
+!!! note "`measure_all` requires at least one bound"
+    Passing `bounds={}` raises `ValueError("No bounds provided")`. At
+    least one of `min` or `max` must be set.
+
+### `report_outcome`: externally computed pass/fail
+
+When the decision is computed elsewhere, drop it onto the report as a
+named substep with an optional reason. Returns the result you passed in,
+so you can use it inline.
+
+```python
+def test_external_checks(step):
+    step.report_outcome(
+        name="config_loaded",
+        result=True,
+        reason="loaded /etc/dut/config.yaml",
+    )
+
+    # Failures show up as a failed substep without raising.
+    rare_warning_seen = False
+    step.report_outcome(
+        name="no_rare_warning",
+        result=not rare_warning_seen,
+        reason="grep'd dmesg for the known-flaky warning",
+    )
+```
+
+### Bounds reference
+
+| Pass to `bounds=` | Value type | Effect |
+|---|---|---|
+| `{"min": x, "max": y}` (either key optional) | `int` / `float` | Numeric window. One-sided is fine. |
+| `NumericBounds(min=x, max=y)` | `int` / `float` | Same as the dict form, explicit. |
+| `"expected-string"` | `str` (or `bool`) | Exact equality. For `bool` values, compares lowercased string (`"true"`/`"false"`). |
+| `True` or `False` | `bool` (or `str`) | Exact equality. For `str` values, compares lowercased strings. |
+| `None` | any | Records the value but does not evaluate it; measurement is recorded as `passed=True`. |
+
+The `unit` argument is a free-form string label (e.g. `"V"`, `"C"`, `"psi"`).
+
+## Skip handling
+
+- `@pytest.mark.skip` and `@pytest.mark.skipif`: the plugin's
+  `pytest_runtest_makereport` hook sees the skipped outcome and creates a
+  step with `TestStatus.SKIPPED`.
+- Inside a test function, you can mark just one substep as skipped without
+  aborting the whole test:
+
+  ```python
+  from sift_client.sift_types.test_report import TestStatus
+
+
+  def test_runtime_skip(step):
+      with step.substep(name="optional_calibration") as cal:
+          if not precondition_met():
+              cal.current_step.update(
+                  {"status": TestStatus.SKIPPED},
+                  log_file=step.report_context.log_file,
+              )
+  ```
+
+  A manually-resolved status is honored by the step's exit handler. No
+  further bookkeeping required. `SKIPPED` does not propagate as a failure.
+
+## Running the suite
+
+```bash
+# Full run against your Sift tenant
+pytest
+
+# Pin the log file so you can replay it later if the import worker dies
+pytest --sift-test-results-log-file=./sift-results.jsonl
+```
+
+See [Running offline](#running-offline) for the same suite running with or
+without a reachable Sift server.
+
+## Running offline
+
+The plugin supports two offline workflows, depending on whether you want a
+Sift report at all when the test environment can't reach Sift. The first
+turns the plugin into a no-op when the server is unreachable. The second
+keeps the plugin running normally and writes every create/update to a local
+JSONL file that you upload from a connected machine afterward.
+
+| Pattern | Flag | Runtime behavior | Follow-up |
+|---|---|---|---|
+| Skip when offline | `--sift-test-results-check-connection` | Fixtures yield `None`, no log file, no report. Pytest still reports pass/fail. | None. |
+| Capture locally, upload later | `--sift-test-results-log-file=<path>` | Plugin writes every create/update to the JSONL file. | `import-test-result-log <path>` from a connected machine. |
+
+Pattern 1 suits laptop dev and CI without Sift secrets. Pattern 2 suits
+field tests, vehicles on remote sites, and air-gapped labs.
+
+### Pattern 1: skip when offline
+
+`--sift-test-results-check-connection` makes the plugin ping Sift once at
+session start through the `client_has_connection` fixture (which by default
+calls `sift_client.ping.ping()`). On a failed ping, `report_context`,
+`step`, and `module_substep` yield `None` for the rest of the session.
+Pytest still runs the tests and still reports pass/fail.
+
+```bash
+pytest --sift-test-results-check-connection
+```
+
+```ini title="pytest.ini"
+[pytest]
+addopts = --sift-test-results-check-connection
+```
+
+#### Handling `None` in tests
+
+Calls on `step` raise `AttributeError` when it's `None`, so tests that take
+`step` as a parameter need a guard. The cleanest fix is to shadow the
+plugin's `step` fixture in your conftest and turn the `None` case into an
+automatic skip.
+
+```python title="conftest.py"
+import pytest
+
+from sift_client.util.test_results import *
+
+
+@pytest.fixture(autouse=True)
+def step(step):
+    if step is None:
+        pytest.skip("Sift unavailable")
+    yield step
+```
+
+The `step` parameter on the override resolves to the plugin's fixture, not
+to the override itself. `autouse=True` is required so the skip applies to
+tests that don't request `step` directly. The same shadowing trick works
+for `module_substep` and `report_context`.
+
+For one-off tests that don't share a conftest, an inline guard works just
+as well:
+
+```python
+def test_battery_voltage(step):
+    if step is None:
+        pytest.skip("Sift unavailable")
+    step.measure(name="battery_voltage", value=4.97, bounds={"min": 4.8, "max": 5.2})
+```
+
+If you'd rather have tests pass through silently than skip them, wrap the
+calls in a helper that no-ops on `None`:
+
+```python
+def safe_measure(step, **kwargs):
+    if step is None:
+        return True
+    return step.measure(**kwargs)
+```
+
+#### Overriding the connection check
+
+The default `client_has_connection` fixture calls `sift_client.ping.ping()`.
+Override it in your conftest if pinging is the wrong signal for your
+environment, for example a token cache that's only warm when authenticated:
+
+```python title="conftest.py"
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture(scope="session")
+def client_has_connection(sift_client) -> bool:
+    return Path("~/.sift-token-cache").expanduser().is_file()
+```
+
+The plugin only consults this fixture when `--sift-test-results-check-connection`
+is set, so an unused override has no effect on a normal run.
+
+### Pattern 2: capture locally, upload later
+
+This pattern keeps the plugin running normally even when Sift is
+unreachable. The plugin writes to the log file, the worker dies on connect,
+and the file is left on disk for you to upload later. Pin the log file path
+so you can find it afterward, and don't pass
+`--sift-test-results-check-connection`, which would suppress the logging
+this pattern relies on.
+
+```bash
+pytest --sift-test-results-log-file=./run.jsonl
+```
+
+What happens during the run:
+
+- Every report, step, and measurement create/update is written to
+  `run.jsonl`. The plugin doesn't contact the Sift API for any of these
+  calls; they return simulated responses keyed by UUIDs that the replay
+  later maps to real IDs.
+- The `import-test-result-log --incremental` worker subprocess starts and
+  exits early when it can't reach Sift. The session does not fail when the
+  worker exits before the run ends.
+- Tests run against a real `step` fixture, so `step.measure(...)`,
+  substeps, parametrize, fixtures, and `module_substep` behave exactly as
+  they do online. No conftest changes are needed.
+
+Once you have connectivity, replay the file:
+
+```bash
+import-test-result-log ./run.jsonl
+```
+
+The replay creates the report, steps, and measurements against Sift in one
+batch. See [Replaying a saved log file](#replaying-a-saved-log-file) for
+details on cleanup and the incremental flag.
+
+!!! warning "Pin the log path for Pattern 2"
+    Without `--sift-test-results-log-file=<path>`, the plugin writes to a
+    `tempfile.NamedTemporaryFile` and only surfaces the path via a
+    `logger.info` line. Always pin a known path when you intend to replay
+    the file later.
+
+## Replaying a saved log file
+
+When the worker doesn't finish cleanly the plugin will print a hint mentioning
+`import-test-result-log`. To import:
+
+```bash
+import-test-result-log <path-to-log.jsonl>
+```
+
+That replays the saved JSONL log as a single batch (no `--incremental`) and
+deletes the file when it lives under the system temp dir.
\ No newline at end of file
diff --git a/python/lib/sift_client/_tests/util/step_status_states.md b/python/lib/sift_client/_tests/util/step_status_states.md
index 6d48f7c75..9288a85d3 100644
--- a/python/lib/sift_client/_tests/util/step_status_states.md
+++ b/python/lib/sift_client/_tests/util/step_status_states.md
@@ -1,66 +1,94 @@
 # Pytest-plugin step-status: observed vs. target
 
 Companion document to `test_step_status_states.py`. Each row corresponds to
-one scenario in that suite. The **observed** column is the status the Sift
-pytest plugin records for the test's outer step today; the **target** column
-is what the audit recommends. Rows where the two differ are the work items
-for the fix.
+one scenario in that suite. The **target** column is the contract the suite
+asserts (sourced from
+[`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md));
+the **observed today** column records what the plugin actually produces
+right now. Rows marked `Gap` are scenarios where the test fails today and
+the plugin needs to be fixed to match the contract.
 
 `TestStatus` values referenced below come from
 `sift_client.sift_types.test_report.TestStatus`: `PASSED`, `FAILED`, `ERROR`,
-`SKIPPED`, plus the proposed `XFAILED` / `XPASSED` / `ABORTED` additions
-called out in the audit.
+`SKIPPED`. The targets below map every scenario onto these four existing
+statuses. An `ABORTED` status for hard process exits (`SystemExit`,
+`KeyboardInterrupt`, signals) is a planned future addition; until it lands
+those cases baseline against `ERROR`. The user-facing contract these
+targets describe is documented in
+[`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md).
+
+## Case ID scheme
+
+Each scenario has a stable case ID of the form `PREFIX-NN`, where the
+prefix names its section. Tests in `test_step_status_states.py` reference
+their case ID in a leading comment so a failing test can be traced back to
+this table without rereading the scenario:
+
+| Prefix  | Section                                  |
+| ------- | ---------------------------------------- |
+| `CALL`  | Call-phase exit paths                    |
+| `SKIP`  | Skip paths                               |
+| `XFAIL` | xfail / xpass                            |
+| `PHASE` | Setup / teardown phases                  |
+| `COLL`  | Collection / fixture-resolution failures |
+| `API`   | Plugin-API exit paths                    |
+
+IDs are stable: a new scenario in a section takes the next free number for
+that prefix; numbers are never reused or shifted when other sections grow.
 
 ## Call-phase exit paths
 
-| Scenario                                | Trigger                                       | Observed today              | Target                                     | Status |
-| --------------------------------------- | --------------------------------------------- | --------------------------- | ------------------------------------------ | ------ |
-| Test passes                             | function body returns cleanly                 | `PASSED`                    | `PASSED`                                   | OK     |
-| Assert failure in call phase            | `assert 1 == 2`                               | `FAILED`                    | `FAILED`                                   | OK     |
-| Generic exception in call phase         | `raise ValueError("boom")`                    | `ERROR`                     | `ERROR`                                    | OK     |
-| `pytest.fail("...")` from body          | `pytest.fail("intentional failure")`          | `ERROR`                     | `FAILED`                                   | Gap    |
-| `SystemExit` from the test body         | `sys.exit(1)`                                 | `ERROR`                     | `ABORTED` (proposed) or documented `ERROR` | Gap    |
-| `KeyboardInterrupt` in body             | `raise KeyboardInterrupt`                     | `PASSED` (session aborts before the plugin sees the interrupt) | `ABORTED` (proposed) | Gap |
+| Case      | Scenario                                | Trigger                                       | Observed today              | Target                                     | Status |
+| --------- | --------------------------------------- | --------------------------------------------- | --------------------------- | ------------------------------------------ | ------ |
+| `CALL-01` | Test passes                             | function body returns cleanly                 | `PASSED`                    | `PASSED`                                   | OK     |
+| `CALL-02` | Assert failure in call phase            | `assert 1 == 2`                               | `FAILED`                    | `FAILED`                                   | OK     |
+| `CALL-03` | Generic exception in call phase         | `raise ValueError("boom")`                    | `ERROR`                     | `ERROR`                                    | OK     |
+| `CALL-04` | `pytest.fail("...")` from body          | `pytest.fail("intentional failure")`          | `ERROR`                     | `FAILED`                                   | Gap    |
+| `CALL-05` | `SystemExit` from the test body         | `sys.exit(1)`                                 | `ERROR`                     | `ERROR` (baseline; `ABORTED` planned later) | OK    |
+| `CALL-06` | `KeyboardInterrupt` in body             | `raise KeyboardInterrupt`                     | `PASSED` (session aborts before the plugin sees the interrupt) | `ERROR` when the plugin sees the interrupt; document that a session-aborting interrupt may leave the step in `IN_PROGRESS` | Gap |
 
 ## Skip paths
 
-| Scenario                                | Trigger                                       | Observed today                                                              | Target                                                          | Status |
-| --------------------------------------- | --------------------------------------------- | --------------------------------------------------------------------------- | --------------------------------------------------------------- | ------ |
-| Collection-time skip                    | `@pytest.mark.skip(reason=...)`               | `SKIPPED` (only the makereport hook records a step; no autouse step ran)    | `SKIPPED`                                                       | OK     |
-| Runtime skip in body                    | `pytest.skip("...")`                          | Outer step `ERROR`; a nested step with the same name records `SKIPPED`      | Outer step `SKIPPED`; no duplicate nested step                  | Gap    |
-| Skip raised inside a fixture            | `@pytest.fixture` calls `pytest.skip("...")`  | Outer step `PASSED`; a nested `SKIPPED` step is created by the makereport hook | Outer step `SKIPPED` with `phase=setup`; no duplicate nested step | Gap |
+| Case      | Scenario                                | Trigger                                       | Observed today                                                              | Target                                                          | Status |
+| --------- | --------------------------------------- | --------------------------------------------- | --------------------------------------------------------------------------- | --------------------------------------------------------------- | ------ |
+| `SKIP-01` | Collection-time skip                    | `@pytest.mark.skip(reason=...)`               | `SKIPPED` (only the makereport hook records a step; no autouse step ran)    | `SKIPPED`                                                       | OK     |
+| `SKIP-02` | Conditional collection-time skip        | `@pytest.mark.skipif(True, reason=...)`       | `SKIPPED` (same route as `@pytest.mark.skip`)                               | `SKIPPED`                                                       | OK     |
+| `SKIP-03` | Runtime skip in body                    | `pytest.skip("...")`                          | Outer step `ERROR`; a nested step with the same name records `SKIPPED`      | Outer step `SKIPPED`; no duplicate nested step                  | Gap    |
+| `SKIP-04` | Skip raised inside a fixture            | `@pytest.fixture` calls `pytest.skip("...")`  | Outer step `PASSED`; a nested `SKIPPED` step is created by the makereport hook | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step | Gap |
 
 ## xfail / xpass
 
-| Scenario                                | Trigger                                                | Observed today                                                                                  | Target                                                | Status |
-| --------------------------------------- | ------------------------------------------------------ | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------ |
-| xfail-marked test that fails            | `@pytest.mark.xfail` + `assert 1 == 2`                 | Outer step `FAILED`; nested `SKIPPED` substep from the makereport hook                          | Outer step `XFAILED`; no duplicate nested step        | Gap    |
-| Strict xfail that unexpectedly passes   | `@pytest.mark.xfail(strict=True)` + `assert True`      | Outer step `PASSED` (plugin never sees pytest's "strict xpass" failure attached to the report)  | Outer step `XPASSED`                                  | Gap    |
-| Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`               | Outer step `PASSED` (pytest reports outcome="passed" with `wasxfail` set; plugin ignores it)    | Outer step `XPASSED`                                  | Gap    |
-| `xfail(raises=...)` with wrong exception | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | Outer step `ERROR` (treated as a generic non-assertion exception)                           | `FAILED` (the `raises=` mismatch is a real test failure) | Gap |
-| `xfail(run=False)`                      | `@pytest.mark.xfail(run=False)` (body never executed)  | `SKIPPED` (only the makereport hook records a step)                                              | `XFAILED`                                            | Gap    |
+| Case       | Scenario                                  | Trigger                                                | Observed today                                                                                  | Target                                                | Status |
+| ---------- | ----------------------------------------- | ------------------------------------------------------ | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------ |
+| `XFAIL-01` | xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                 | Outer step `FAILED`; nested `SKIPPED` substep from the makereport hook                          | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step | Gap    |
+| `XFAIL-02` | Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`      | Outer step `PASSED` (plugin never sees pytest's "strict xpass" failure attached to the report)  | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | Gap    |
+| `XFAIL-03` | Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                 | Outer step `PASSED` (pytest reports outcome="passed" with `wasxfail` set; plugin ignores it)    | Outer step `PASSED` (`strict=False` doesn't insist on the failure) | OK    |
+| `XFAIL-04` | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | Outer step `ERROR` (treated as a generic non-assertion exception)                           | `FAILED` (the `raises=` mismatch is a real test failure) | Gap |
+| `XFAIL-05` | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)` (body never executed)  | `SKIPPED` (only the makereport hook records a step)                                              | `SKIPPED` (the test never ran)                      | OK    |
 
 ## Setup / teardown phases
 
-| Scenario                                | Trigger                                                              | Observed today                                                                                                                            | Target                                                  | Status |
-| --------------------------------------- | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
-| Setup-phase fixture failure (RuntimeError) | `@pytest.fixture` raises before `yield`; test body never runs    | Outer step does not exist or lands `PASSED`; the plugin does not consult `report.when`                                                    | `ERROR` with `phase=setup` annotation                   | Gap    |
-| Teardown-phase fixture failure          | `@pytest.fixture` raises after `yield`; test body passed             | Outer step `PASSED` — it closes before the failing teardown runs, so the error is invisible                                              | `FAILED` with `phase=teardown` annotation               | Gap    |
-| Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | Outer step `FAILED` (the call-phase failure dominates); the teardown error is silently lost                                              | `FAILED` with a `phase=teardown` annotation so the teardown error is also visible | Gap |
+| Case       | Scenario                                   | Trigger                                                              | Observed today                                                                                                                            | Target                                                  | Status |
+| ---------- | ------------------------------------------ | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
+| `PHASE-01` | Setup-phase fixture failure (RuntimeError) | `@pytest.fixture` raises before `yield`; test body never runs        | Outer step does not exist or lands `PASSED`; the plugin does not consult `report.when`                                                    | `ERROR` with `phase=setup` annotation                   | Gap    |
+| `PHASE-02` | Teardown-phase fixture failure             | `@pytest.fixture` raises after `yield`; test body passed             | Outer step `PASSED` — it closes before the failing teardown runs, so the error is invisible                                              | `FAILED` with `phase=teardown` annotation               | Gap    |
+| `PHASE-03` | Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | Outer step `FAILED` (the call-phase failure dominates); the teardown error is silently lost                                              | `FAILED` with a `phase=teardown` annotation so the teardown error is also visible | Gap |
 
 ## Collection / fixture-resolution failures
 
-| Scenario                                | Trigger                                       | Observed today                                                                                                                                  | Target                                                  | Status |
-| --------------------------------------- | --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
-| Missing fixture                         | `def test_x(nonexistent_fixture):`            | Outer step `PASSED` — the autouse `step` fixture's setup still runs before pytest detects the missing fixture; the user sees a green step for a test that never executed | `ERROR` with `phase=setup`                              | Gap    |
+| Case      | Scenario                                | Trigger                                       | Observed today                                                                                                                                  | Target                                                  | Status |
+| --------- | --------------------------------------- | --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
+| `COLL-01` | Missing fixture                         | `def test_x(nonexistent_fixture):`            | Outer step `PASSED` — the autouse `step` fixture's setup still runs before pytest detects the missing fixture; the user sees a green step for a test that never executed | `ERROR` with `phase=setup`                              | Gap    |
 
 ## Plugin-API exit paths (in-test mutations)
 
-| Scenario                                | Trigger                                                                | Observed today | Target   | Status |
-| --------------------------------------- | ---------------------------------------------------------------------- | -------------- | -------- | ------ |
-| Manual status override                  | `step.current_step.update({"status": TestStatus.FAILED})`              | `FAILED`       | `FAILED` | OK     |
-| `report_outcome(result=False)`          | `step.report_outcome("the_check", False, "did not match")`             | `FAILED`       | `FAILED` | OK     |
-| `measure(...)` out-of-bounds            | `step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})`  | `FAILED`       | `FAILED` | OK     |
+| Case     | Scenario                                | Trigger                                                                | Observed today                                              | Target   | Status |
+| -------- | --------------------------------------- | ---------------------------------------------------------------------- | ----------------------------------------------------------- | -------- | ------ |
+| `API-01` | Manual status override                  | `step.current_step.update({"status": TestStatus.FAILED})`              | `FAILED`                                                    | `FAILED` | OK     |
+| `API-02` | `report_outcome(result=False)`          | `step.report_outcome("the_check", False, "did not match")`             | `FAILED`                                                    | `FAILED` | OK     |
+| `API-03` | `measure(...)` out-of-bounds            | `step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})`  | `FAILED`                                                    | `FAILED` | OK     |
+| `API-04` | Failed measurement on a substep         | `with step.substep(...) as s: s.measure(... out-of-bounds)`            | `FAILED` (propagates from substep to parent)                | `FAILED` | OK     |
+| `API-05` | Manually-skipped substep                | `with step.substep(...) as s: s.current_step.update({"status": SKIPPED})` | Parent step `PASSED` (skip does not propagate as a failure) | `PASSED` | OK     |
 
 ## Out of scope for this characterization run
 
@@ -93,7 +121,6 @@ Run the suite locally:
 pytest lib/sift_client/_tests/util/test_step_status_states.py -v
 ```
 
-Every "Gap" row corresponds to a `# AUDIT:` comment in the test file naming
-the target status. When the plugin fix lands, the regression edit is
-mechanical: flip the assertion in each gap row to its target, then update
-the **Observed today** column here to match.
+Gap rows fail today. When the plugin fix for a row lands, the matching
+test turns green; update the **Observed today** column here to match the
+target and flip the row's status to **OK**.
diff --git a/python/lib/sift_client/_tests/util/test_step_status_states.py b/python/lib/sift_client/_tests/util/test_step_status_states.py
index 42c245457..197fe3298 100644
--- a/python/lib/sift_client/_tests/util/test_step_status_states.py
+++ b/python/lib/sift_client/_tests/util/test_step_status_states.py
@@ -1,15 +1,17 @@
-"""Characterization suite: maps each pytest exit path to the ``TestStatus``
-the plugin currently records on the step.
+"""Contract suite: maps each pytest exit path to the ``TestStatus`` the
+Sift pytest plugin is required to record on the outer step.
 
 Each scenario writes a tiny inner test file and runs it through pytester
 with a fake ``sift_client`` injected via a generated conftest. The fake
 records every step status write into ``_step_status_capture.CAPTURED_STEPS``
 so this outer test can assert on what the plugin produced.
 
-The expected statuses below reflect **current** behavior. Where current
-behavior contradicts the audit target (setup-phase, teardown-phase, xfail),
-the assertion is paired with an ``AUDIT:`` comment naming the target.
-Updating these tests is the regression check once the fix lands.
+Assertions encode the contract from
+``docs/guides/pytest_plugin/pass_fail_behavior.md``. Tests for scenarios the
+plugin does not yet handle correctly are expected to **fail today** — they
+are the punch list. ``lib/sift_client/_tests/util/step_status_states.md``
+tracks each scenario's observed-today behavior next to the target so the
+remaining gaps are visible without running the suite.
 """
 
 from __future__ import annotations
@@ -180,6 +182,7 @@ def _run(pytester, body: str) -> None:
 
 
 def test_pass_maps_to_passed(inner):
+    # Case: CALL-01
     _run(
         inner,
         """
@@ -191,6 +194,7 @@ def test_x():
 
 
 def test_assert_failure_maps_to_failed(inner):
+    # Case: CALL-02
     _run(
         inner,
         """
@@ -202,6 +206,7 @@ def test_x():
 
 
 def test_generic_exception_maps_to_error(inner):
+    # Case: CALL-03
     _run(
         inner,
         """
@@ -213,6 +218,7 @@ def test_x():
 
 
 def test_system_exit_maps_to_error(inner):
+    # Case: CALL-05
     _run(
         inner,
         """
@@ -221,14 +227,11 @@ def test_x():
             sys.exit(1)
         """,
     )
-    # AUDIT: SystemExit is currently routed through the generic-exception
-    # path because it isn't an AssertionError. The audit may want a
-    # dedicated bucket (e.g., ABORTED) since the test didn't really "error"
-    # so much as exit deliberately.
     assert capture.final_status("test_x") == TestStatus.ERROR
 
 
-def test_pytest_fail_maps_to_error(inner):
+def test_pytest_fail_maps_to_failed(inner):
+    # Case: CALL-04
     _run(
         inner,
         """
@@ -237,19 +240,16 @@ def test_x():
             pytest.fail("intentional failure")
         """,
     )
-    # AUDIT: target is FAILED. pytest.fail raises a Failed OutcomeException,
-    # which the plugin treats as a generic exception (not AssertionError)
-    # and routes to ERROR. Users expect pytest.fail and assert-fail to land
-    # in the same bucket.
-    assert capture.final_status("test_x") == TestStatus.ERROR
+    assert capture.final_status("test_x") == TestStatus.FAILED
 
 
-def test_keyboard_interrupt_aborts_session(inner):
-    # KeyboardInterrupt propagates out of pytester's inprocess runner, so
-    # we catch it here. Pytest aborts the session before the call phase's
-    # makereport fires, so the plugin never sees the interrupt; the step
-    # fixture's teardown runs with no rep_call.excinfo and resolves the
-    # step to PASSED.
+def test_keyboard_interrupt_leaves_step_in_progress(inner):
+    # Case: CALL-06
+    # KeyboardInterrupt aborts the session before the call-phase makereport
+    # fires; the plugin can't observe the interrupt. The contract is that
+    # the step is left in IN_PROGRESS rather than being silently resolved
+    # to PASSED — a session-aborting interrupt should not look like a clean
+    # pass in the report.
     try:
         _run(
             inner,
@@ -260,11 +260,9 @@ def test_x():
         )
     except KeyboardInterrupt:
         pass
-    # AUDIT: target is ABORTED (or similar). The step is recorded as PASSED
-    # despite the test having been aborted, which is misleading.
     outer = capture.test_step("test_x")
     assert outer is not None
-    assert outer.statuses[-1] == TestStatus.PASSED
+    assert outer.statuses[-1] == TestStatus.IN_PROGRESS
 
 
 # ---------------------------------------------------------------------------
@@ -272,7 +270,8 @@ def test_x():
 # ---------------------------------------------------------------------------
 
 
-def test_pytest_skip_in_body_records_skipped_substep(inner):
+def test_pytest_skip_in_body_maps_to_skipped(inner):
+    # Case: SKIP-03
     _run(
         inner,
         """
@@ -281,24 +280,19 @@ def test_x():
             pytest.skip("not today")
         """,
     )
-    # The plugin's makereport hook creates a separate SKIPPED step nested
-    # under the autouse outer step. The outer step itself is driven by the
-    # call-phase excinfo (a Skipped exception), which the plugin treats as
-    # a generic exception -> ERROR.
-    skipped_steps = [
-        s
-        for s in capture.steps_by_name("test_x")
-        if s.statuses and s.statuses[-1] == TestStatus.SKIPPED
-    ]
-    assert skipped_steps, "expected at least one SKIPPED step for a skipped test"
-    # AUDIT: the outer step records ERROR for an in-body pytest.skip().
-    # Target: outer step should be SKIPPED; no nested duplicate.
+    # Runtime skip in the body resolves the outer step to SKIPPED. The
+    # makereport hook must not create a duplicate nested step.
     outer = capture.test_step("test_x")
     assert outer is not None
-    assert outer.statuses[-1] == TestStatus.ERROR
+    assert outer.statuses[-1] == TestStatus.SKIPPED
+    duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
+    assert not duplicates, (
+        f"expected no duplicate nested step; got {len(duplicates)}"
+    )
 
 
 def test_pytest_mark_skip_records_skipped(inner):
+    # Case: SKIP-01
     _run(
         inner,
         """
@@ -313,7 +307,24 @@ def test_x():
     assert capture.final_status("test_x") == TestStatus.SKIPPED
 
 
+def test_pytest_mark_skipif_records_skipped(inner):
+    # Case: SKIP-02
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.skipif(True, reason="conditional skip")
+        def test_x():
+            assert False
+        """,
+    )
+    # `skipif` with a truthy condition follows the same path as
+    # `@pytest.mark.skip`; only the makereport hook records a step.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
 def test_skip_inside_fixture_setup(inner):
+    # Case: SKIP-04
     _run(
         inner,
         """
@@ -327,19 +338,15 @@ def test_x(skipping_fixture):
             assert True
         """,
     )
-    # AUDIT: target is SKIPPED with phase=setup on the outer step. Today
-    # the autouse outer step lands in PASSED (its own setup ran, no failure
-    # was recorded), and a separate nested SKIPPED step is created by the
-    # makereport hook from the setup-phase skip report.
+    # A setup-phase skip resolves the outer step to SKIPPED. The makereport
+    # hook must not create a duplicate nested step.
     outer = capture.test_step("test_x")
     assert outer is not None
-    assert outer.statuses[-1] == TestStatus.PASSED
-    nested_skipped = [
-        s
-        for s in capture.steps_by_name("test_x")
-        if s.parent_step_id is not None and s.statuses[-1] == TestStatus.SKIPPED
-    ]
-    assert nested_skipped, "fixture-skip currently produces a nested SKIPPED step"
+    assert outer.statuses[-1] == TestStatus.SKIPPED
+    duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
+    assert not duplicates, (
+        f"expected no duplicate nested step; got {len(duplicates)}"
+    )
 
 
 # ---------------------------------------------------------------------------
@@ -348,6 +355,7 @@ def test_x(skipping_fixture):
 
 
 def test_xfail_marked_test_that_fails(inner):
+    # Case: XFAIL-01
     _run(
         inner,
         """
@@ -357,22 +365,19 @@ def test_x():
             assert 1 == 2
         """,
     )
-    # AUDIT: target is XFAILED (distinct from SKIPPED). Today, pytest reports
-    # outcome="skipped" for an xfailed test, so the makereport hook records
-    # a SKIPPED nested step. The outer autouse step records FAILED from the
-    # call-phase AssertionError.
+    # xfail + expected failure fulfills the contract; outer step resolves to
+    # PASSED. No duplicate nested step from the makereport hook.
     outer = capture.test_step("test_x")
     assert outer is not None
-    assert outer.statuses[-1] == TestStatus.FAILED
-    skipped_substeps = [
-        s
-        for s in capture.steps_by_name("test_x")
-        if s.parent_step_id is not None and s.statuses[-1] == TestStatus.SKIPPED
-    ]
-    assert skipped_substeps, "xfailed test currently produces a nested SKIPPED step"
+    assert outer.statuses[-1] == TestStatus.PASSED
+    duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
+    assert not duplicates, (
+        f"expected no duplicate nested step; got {len(duplicates)}"
+    )
 
 
 def test_xfail_strict_unexpected_pass(inner):
+    # Case: XFAIL-02
     _run(
         inner,
         """
@@ -382,15 +387,15 @@ def test_x():
             assert True
         """,
     )
-    # AUDIT: target is XPASSED. The test body raises no exception, so the
-    # plugin records PASSED and never sees pytest's later "strict xfail
-    # passed" failure attached to the report.
+    # strict xfail that passes must surface as FAILED: either the bug was
+    # fixed (remove the mark) or the test stopped exercising what it claimed.
     outer = capture.test_step("test_x")
     assert outer is not None
-    assert outer.statuses[-1] == TestStatus.PASSED
+    assert outer.statuses[-1] == TestStatus.FAILED
 
 
 def test_xfail_non_strict_unexpected_pass(inner):
+    # Case: XFAIL-03
     _run(
         inner,
         """
@@ -400,15 +405,15 @@ def test_x():
             assert True
         """,
     )
-    # AUDIT: target is XPASSED. Non-strict xfail that passes is reported by
-    # pytest as outcome="passed" with wasxfail set; the plugin ignores
-    # wasxfail and records PASSED.
+    # Non-strict xfail does not insist on the failure, so a passing run is
+    # PASSED.
     outer = capture.test_step("test_x")
     assert outer is not None
     assert outer.statuses[-1] == TestStatus.PASSED
 
 
 def test_xfail_raises_mismatch(inner):
+    # Case: XFAIL-04
     _run(
         inner,
         """
@@ -418,15 +423,15 @@ def test_x():
             raise KeyError("wrong exception")
         """,
     )
-    # AUDIT: target is FAILED. Pytest treats a `raises=` mismatch as a real
-    # call-phase failure; the plugin sees a non-assertion exception in
-    # excinfo and routes it to ERROR.
+    # `raises=` mismatch is a real test failure — the contract required a
+    # specific exception type and a different one was thrown.
     outer = capture.test_step("test_x")
     assert outer is not None
-    assert outer.statuses[-1] == TestStatus.ERROR
+    assert outer.statuses[-1] == TestStatus.FAILED
 
 
 def test_xfail_run_false(inner):
+    # Case: XFAIL-05
     _run(
         inner,
         """
@@ -436,9 +441,7 @@ def test_x():
             assert False
         """,
     )
-    # AUDIT: target is XFAILED. With run=False pytest reports the test as
-    # skipped/xfailed without executing it, so today only the makereport
-    # hook records a step, with status SKIPPED.
+    # The test never ran; outer step is SKIPPED.
     assert capture.final_status("test_x") == TestStatus.SKIPPED
 
 
@@ -448,6 +451,7 @@ def test_x():
 
 
 def test_setup_phase_fixture_failure(inner):
+    # Case: PHASE-01
     _run(
         inner,
         """
@@ -461,18 +465,16 @@ def test_x(bad_setup):
             assert True
         """,
     )
-    # AUDIT: target is ERROR with phase=setup. Today the plugin doesn't
-    # consult report.when, so the outer step (if it exists) lands in PASSED
-    # because the call phase never ran and no failure was recorded.
+    # A fixture that raises before `yield` fails the setup phase. The outer
+    # step must surface this as ERROR; the test body never executed and a
+    # silently green step would hide the failure.
     outer = capture.test_step("test_x")
-    if outer is not None:
-        assert outer.statuses[-1] == TestStatus.PASSED, (
-            f"setup-fail outer step status was {outer.statuses[-1]}; "
-            "audit target is ERROR with phase=setup"
-        )
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.ERROR
 
 
 def test_teardown_phase_fixture_failure(inner):
+    # Case: PHASE-02
     _run(
         inner,
         """
@@ -487,19 +489,16 @@ def test_x(bad_teardown):
             assert True
         """,
     )
-    # AUDIT: target is FAILED with phase=teardown. Today the outer autouse
-    # step closes BEFORE the failing fixture's teardown runs, so the test
-    # body's success is recorded as PASSED and the teardown error is
-    # invisible to the step.
+    # A fixture that raises after `yield` fails the teardown phase. The
+    # outer step's status reflects the teardown failure as FAILED rather
+    # than the call-phase pass.
     outer = capture.test_step("test_x")
     assert outer is not None
-    assert outer.statuses[-1] == TestStatus.PASSED, (
-        f"teardown-fail outer step status was {outer.statuses[-1]}; "
-        "audit target is FAILED with phase=teardown"
-    )
+    assert outer.statuses[-1] == TestStatus.FAILED
 
 
 def test_call_fail_plus_teardown_fail(inner):
+    # Case: PHASE-03
     _run(
         inner,
         """
@@ -514,10 +513,10 @@ def test_x(bad_teardown):
             assert 1 == 2
         """,
     )
-    # AUDIT: the call-phase failure dominates (status FAILED), and the
-    # teardown error is silently lost. Target: status should reflect both
-    # signals, e.g. FAILED with a teardown-phase annotation so the
-    # teardown error is not invisible.
+    # Call-phase failure dominates the outer step status; the contract also
+    # requires the teardown error to be surfaced somewhere on the step
+    # (mechanism TBD — see pass_fail_behavior.md). This test asserts the
+    # status today; tighten once a surfacing mechanism is chosen.
     outer = capture.test_step("test_x")
     assert outer is not None
     assert outer.statuses[-1] == TestStatus.FAILED
@@ -528,7 +527,8 @@ def test_x(bad_teardown):
 # ---------------------------------------------------------------------------
 
 
-def test_missing_fixture_records_passed_step(inner):
+def test_missing_fixture_maps_to_error(inner):
+    # Case: COLL-01
     _run(
         inner,
         """
@@ -536,15 +536,12 @@ def test_x(nonexistent_fixture):
             assert True
         """,
     )
-    # AUDIT: target is ERROR with phase=setup. Today the autouse `step`
-    # fixture's setup still runs (because it has no dependency on the
-    # missing fixture), creates an outer step, then the missing-fixture
-    # error aborts setup. The autouse step's teardown runs with no
-    # rep_call.excinfo and resolves to PASSED -- so the user sees a
-    # green step in Sift for a test that never executed.
+    # An unresolved fixture is a setup-phase failure. The outer step
+    # surfaces as ERROR rather than a misleading green pass for a test
+    # that never executed.
     outer = capture.test_step("test_x")
     assert outer is not None
-    assert outer.statuses[-1] == TestStatus.PASSED
+    assert outer.statuses[-1] == TestStatus.ERROR
 
 
 # ---------------------------------------------------------------------------
@@ -553,6 +550,7 @@ def test_x(nonexistent_fixture):
 
 
 def test_manual_status_update_to_failed(inner):
+    # Case: API-01
     _run(
         inner,
         """
@@ -565,6 +563,7 @@ def test_x(step):
 
 
 def test_report_outcome_false_maps_to_failed(inner):
+    # Case: API-02
     _run(
         inner,
         """
@@ -577,6 +576,7 @@ def test_x(step):
 
 
 def test_measure_out_of_bounds_maps_to_failed(inner):
+    # Case: API-03
     _run(
         inner,
         """
@@ -585,3 +585,43 @@ def test_x(step):
         """,
     )
     assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_substep_failure_propagates_to_parent(inner):
+    # Case: API-04
+    _run(
+        inner,
+        """
+        def test_x(step):
+            with step.substep(name="inner") as inner_step:
+                inner_step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})
+        """,
+    )
+    # `test_measure_out_of_bounds_maps_to_failed` exercises a failed
+    # measurement on the function step itself; this one verifies the same
+    # failure on a nested substep propagates up to the parent.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+def test_skipped_substep_does_not_fail_parent(inner):
+    # Case: API-05
+    _run(
+        inner,
+        """
+        from sift_client.sift_types.test_report import TestStatus
+        def test_x(step):
+            with step.substep(name="optional_check") as cal:
+                cal.current_step.update(
+                    {"status": TestStatus.SKIPPED},
+                    log_file=step.report_context.log_file,
+                )
+        """,
+    )
+    # A manually-resolved SKIPPED on a substep must not propagate as a failure
+    # to the parent. The outer step has no measurements of its own and resolves
+    # to PASSED.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
diff --git a/python/mkdocs.yml b/python/mkdocs.yml
index 90bfd10ed..9a45d48de 100644
--- a/python/mkdocs.yml
+++ b/python/mkdocs.yml
@@ -59,10 +59,13 @@ nav:
       - examples/index.md
       - Basic Usage: examples/basic.ipynb
       - Data Ingestion: examples/ingestion.ipynb
-      - Pytest Plugin: examples/pytest_plugin.md
-#  - Guides:
-#      - Logging
-#      - Error Handling
+  - Guides:
+      - Pytest Plugin:
+          - Usage: guides/pytest_plugin/usage.md
+          - Pass/Fail Behavior: guides/pytest_plugin/pass_fail_behavior.md
+
+not_in_nav: |
+  /examples/pytest_plugin.md
 
 plugins:
   - search

From ffd8f4d160ae6150dd29fe2c7453a9ab63948fa4 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Fri, 15 May 2026 15:39:56 -0700
Subject: [PATCH 12/15] format

---
 .../_tests/util/test_step_status_states.py           | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/python/lib/sift_client/_tests/util/test_step_status_states.py b/python/lib/sift_client/_tests/util/test_step_status_states.py
index 197fe3298..03fe0943f 100644
--- a/python/lib/sift_client/_tests/util/test_step_status_states.py
+++ b/python/lib/sift_client/_tests/util/test_step_status_states.py
@@ -286,9 +286,7 @@ def test_x():
     assert outer is not None
     assert outer.statuses[-1] == TestStatus.SKIPPED
     duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
-    assert not duplicates, (
-        f"expected no duplicate nested step; got {len(duplicates)}"
-    )
+    assert not duplicates, f"expected no duplicate nested step; got {len(duplicates)}"
 
 
 def test_pytest_mark_skip_records_skipped(inner):
@@ -344,9 +342,7 @@ def test_x(skipping_fixture):
     assert outer is not None
     assert outer.statuses[-1] == TestStatus.SKIPPED
     duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
-    assert not duplicates, (
-        f"expected no duplicate nested step; got {len(duplicates)}"
-    )
+    assert not duplicates, f"expected no duplicate nested step; got {len(duplicates)}"
 
 
 # ---------------------------------------------------------------------------
@@ -371,9 +367,7 @@ def test_x():
     assert outer is not None
     assert outer.statuses[-1] == TestStatus.PASSED
     duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
-    assert not duplicates, (
-        f"expected no duplicate nested step; got {len(duplicates)}"
-    )
+    assert not duplicates, f"expected no duplicate nested step; got {len(duplicates)}"
 
 
 def test_xfail_strict_unexpected_pass(inner):

From fdaf832a81201d4de283348f79de530dbc1500f0 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Sun, 17 May 2026 17:13:11 -0700
Subject: [PATCH 13/15] clean up

---
 .../_tests/util/step_status_states.md         |  44 ++---
 .../util/test_results/context_manager.py      |  51 +++++-
 .../util/test_results/pytest_util.py          | 170 ++++++++++++++++--
 3 files changed, 219 insertions(+), 46 deletions(-)

diff --git a/python/lib/sift_client/_tests/util/step_status_states.md b/python/lib/sift_client/_tests/util/step_status_states.md
index 9288a85d3..83b64783d 100644
--- a/python/lib/sift_client/_tests/util/step_status_states.md
+++ b/python/lib/sift_client/_tests/util/step_status_states.md
@@ -5,16 +5,16 @@ one scenario in that suite. The **target** column is the contract the suite
 asserts (sourced from
 [`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md));
 the **observed today** column records what the plugin actually produces
-right now. Rows marked `Gap` are scenarios where the test fails today and
-the plugin needs to be fixed to match the contract.
+right now. Every row should be marked `OK`; a `Gap` indicates the plugin has
+regressed against the contract.
 
 `TestStatus` values referenced below come from
 `sift_client.sift_types.test_report.TestStatus`: `PASSED`, `FAILED`, `ERROR`,
-`SKIPPED`. The targets below map every scenario onto these four existing
-statuses. An `ABORTED` status for hard process exits (`SystemExit`,
+`SKIPPED`, `IN_PROGRESS`. The targets below map every scenario onto these
+existing statuses. An `ABORTED` status for hard process exits (`SystemExit`,
 `KeyboardInterrupt`, signals) is a planned future addition; until it lands
-those cases baseline against `ERROR`. The user-facing contract these
-targets describe is documented in
+those cases baseline against `ERROR` or `IN_PROGRESS`. The user-facing
+contract these targets describe is documented in
 [`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md).
 
 ## Case ID scheme
@@ -43,9 +43,9 @@ that prefix; numbers are never reused or shifted when other sections grow.
 | `CALL-01` | Test passes                             | function body returns cleanly                 | `PASSED`                    | `PASSED`                                   | OK     |
 | `CALL-02` | Assert failure in call phase            | `assert 1 == 2`                               | `FAILED`                    | `FAILED`                                   | OK     |
 | `CALL-03` | Generic exception in call phase         | `raise ValueError("boom")`                    | `ERROR`                     | `ERROR`                                    | OK     |
-| `CALL-04` | `pytest.fail("...")` from body          | `pytest.fail("intentional failure")`          | `ERROR`                     | `FAILED`                                   | Gap    |
+| `CALL-04` | `pytest.fail("...")` from body          | `pytest.fail("intentional failure")`          | `FAILED`                    | `FAILED`                                   | OK     |
 | `CALL-05` | `SystemExit` from the test body         | `sys.exit(1)`                                 | `ERROR`                     | `ERROR` (baseline; `ABORTED` planned later) | OK    |
-| `CALL-06` | `KeyboardInterrupt` in body             | `raise KeyboardInterrupt`                     | `PASSED` (session aborts before the plugin sees the interrupt) | `ERROR` when the plugin sees the interrupt; document that a session-aborting interrupt may leave the step in `IN_PROGRESS` | Gap |
+| `CALL-06` | `KeyboardInterrupt` in body             | `raise KeyboardInterrupt`                     | `IN_PROGRESS` (session aborts before the plugin sees the interrupt) | `ERROR` when the plugin sees the interrupt; a session-aborting interrupt leaves the step in `IN_PROGRESS` | OK |
 
 ## Skip paths
 
@@ -53,32 +53,32 @@ that prefix; numbers are never reused or shifted when other sections grow.
 | --------- | --------------------------------------- | --------------------------------------------- | --------------------------------------------------------------------------- | --------------------------------------------------------------- | ------ |
 | `SKIP-01` | Collection-time skip                    | `@pytest.mark.skip(reason=...)`               | `SKIPPED` (only the makereport hook records a step; no autouse step ran)    | `SKIPPED`                                                       | OK     |
 | `SKIP-02` | Conditional collection-time skip        | `@pytest.mark.skipif(True, reason=...)`       | `SKIPPED` (same route as `@pytest.mark.skip`)                               | `SKIPPED`                                                       | OK     |
-| `SKIP-03` | Runtime skip in body                    | `pytest.skip("...")`                          | Outer step `ERROR`; a nested step with the same name records `SKIPPED`      | Outer step `SKIPPED`; no duplicate nested step                  | Gap    |
-| `SKIP-04` | Skip raised inside a fixture            | `@pytest.fixture` calls `pytest.skip("...")`  | Outer step `PASSED`; a nested `SKIPPED` step is created by the makereport hook | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step | Gap |
+| `SKIP-03` | Runtime skip in body                    | `pytest.skip("...")`                          | Outer step `SKIPPED`; no duplicate nested step                              | Outer step `SKIPPED`; no duplicate nested step                  | OK     |
+| `SKIP-04` | Skip raised inside a fixture            | `@pytest.fixture` calls `pytest.skip("...")`  | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step           | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step | OK   |
 
 ## xfail / xpass
 
 | Case       | Scenario                                  | Trigger                                                | Observed today                                                                                  | Target                                                | Status |
 | ---------- | ----------------------------------------- | ------------------------------------------------------ | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------ |
-| `XFAIL-01` | xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                 | Outer step `FAILED`; nested `SKIPPED` substep from the makereport hook                          | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step | Gap    |
-| `XFAIL-02` | Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`      | Outer step `PASSED` (plugin never sees pytest's "strict xpass" failure attached to the report)  | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | Gap    |
-| `XFAIL-03` | Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                 | Outer step `PASSED` (pytest reports outcome="passed" with `wasxfail` set; plugin ignores it)    | Outer step `PASSED` (`strict=False` doesn't insist on the failure) | OK    |
-| `XFAIL-04` | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | Outer step `ERROR` (treated as a generic non-assertion exception)                           | `FAILED` (the `raises=` mismatch is a real test failure) | Gap |
-| `XFAIL-05` | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)` (body never executed)  | `SKIPPED` (only the makereport hook records a step)                                              | `SKIPPED` (the test never ran)                      | OK    |
+| `XFAIL-01` | xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                 | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step             | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step | OK     |
+| `XFAIL-02` | Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`      | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | OK     |
+| `XFAIL-03` | Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                 | Outer step `PASSED`                                                                              | Outer step `PASSED` (`strict=False` doesn't insist on the failure) | OK    |
+| `XFAIL-04` | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` (the `raises=` mismatch is a real test failure)                                     | `FAILED` (the `raises=` mismatch is a real test failure) | OK |
+| `XFAIL-05` | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)` (body never executed)  | `SKIPPED` (the test never ran)                                                                   | `SKIPPED` (the test never ran)                      | OK    |
 
 ## Setup / teardown phases
 
 | Case       | Scenario                                   | Trigger                                                              | Observed today                                                                                                                            | Target                                                  | Status |
 | ---------- | ------------------------------------------ | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
-| `PHASE-01` | Setup-phase fixture failure (RuntimeError) | `@pytest.fixture` raises before `yield`; test body never runs        | Outer step does not exist or lands `PASSED`; the plugin does not consult `report.when`                                                    | `ERROR` with `phase=setup` annotation                   | Gap    |
-| `PHASE-02` | Teardown-phase fixture failure             | `@pytest.fixture` raises after `yield`; test body passed             | Outer step `PASSED` — it closes before the failing teardown runs, so the error is invisible                                              | `FAILED` with `phase=teardown` annotation               | Gap    |
-| `PHASE-03` | Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | Outer step `FAILED` (the call-phase failure dominates); the teardown error is silently lost                                              | `FAILED` with a `phase=teardown` annotation so the teardown error is also visible | Gap |
+| `PHASE-01` | Setup-phase fixture failure (RuntimeError) | `@pytest.fixture` raises before `yield`; test body never runs        | Outer step `ERROR`; the plugin reads the setup-phase report and maps `failed` → `ERROR`                                                   | `ERROR` (a `phase=setup` annotation is a planned follow-up) | OK     |
+| `PHASE-02` | Teardown-phase fixture failure             | `@pytest.fixture` raises after `yield`; test body passed             | Outer step `FAILED`; after teardown the plugin upgrades a passed step when the teardown report shows `failed`                              | `FAILED` (a `phase=teardown` annotation is a planned follow-up) | OK     |
+| `PHASE-03` | Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | Outer step `FAILED` (the call-phase failure dominates); the teardown error is not yet surfaced separately                                  | `FAILED`; surfacing the teardown error alongside is a planned follow-up | OK |
 
 ## Collection / fixture-resolution failures
 
 | Case      | Scenario                                | Trigger                                       | Observed today                                                                                                                                  | Target                                                  | Status |
 | --------- | --------------------------------------- | --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
-| `COLL-01` | Missing fixture                         | `def test_x(nonexistent_fixture):`            | Outer step `PASSED` — the autouse `step` fixture's setup still runs before pytest detects the missing fixture; the user sees a green step for a test that never executed | `ERROR` with `phase=setup`                              | Gap    |
+| `COLL-01` | Missing fixture                         | `def test_x(nonexistent_fixture):`            | Outer step `ERROR` — the missing fixture surfaces as a setup-phase failure, which the plugin now maps to `ERROR`                                  | `ERROR` (a `phase=setup` annotation is a planned follow-up) | OK     |
 
 ## Plugin-API exit paths (in-test mutations)
 
@@ -121,6 +121,6 @@ Run the suite locally:
 pytest lib/sift_client/_tests/util/test_step_status_states.py -v
 ```
 
-Gap rows fail today. When the plugin fix for a row lands, the matching
-test turns green; update the **Observed today** column here to match the
-target and flip the row's status to **OK**.
+Every row should be `OK`. If a row regresses to `Gap`, the matching test
+fails; update the **Observed today** column here to describe the
+regression and flip the row's status to **Gap** until the plugin is fixed.
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 354f8564d..355d59e0f 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -39,6 +39,17 @@
 logger = logging.getLogger(__name__)
 
 
+def format_truncated_traceback(
+    exc: type[BaseException] | None,
+    exc_value: BaseException | None,
+    tb: object | None,
+) -> ErrorInfo:
+    """Format an ErrorInfo from a traceback, keeping the first frame and the last 10."""
+    stack = traceback.format_exception(exc, exc_value, tb)  # type: ignore[arg-type]
+    stack = [stack[0], *stack[-10:]] if len(stack) > 10 else stack
+    return ErrorInfo(error_code=1, error_message="".join(stack))
+
+
 def log_replay_instructions(log_file: str | Path | None) -> None:
     """Log instructions for manually replaying a test result log file.
 
@@ -287,6 +298,17 @@ def record_step_outcome(self, outcome: bool, step: TestStep):
             self.open_step_results[step.step_path] = False
             self.any_failures = True
 
+    def mark_step_failed_after_close(self, step: TestStep):
+        """Mark a step's parent as failed after the step has already been popped from the stack.
+
+        Used by the pytest plugin when a teardown-phase report fires after the
+        fixture's `__exit__` has already resolved and exited the step.
+        """
+        self.any_failures = True
+        path_parts = step.step_path.split(".")
+        if len(path_parts) > 1:
+            self.open_step_results[".".join(path_parts[:-1])] = False
+
     def resolve_and_propagate_step_result(
         self,
         step: TestStep,
@@ -383,13 +405,7 @@ def update_step_from_result(
                 # If we're not showing assertion errors (i.e. pytest), mark step as failed but don't set error info.
                 self.report_context.record_step_outcome(False, self.current_step)
             else:
-                stack = traceback.format_exception(exc, exc_value, tb)  # type: ignore
-                stack = [stack[0], *stack[-10:]] if len(stack) > 10 else stack
-                trace = "".join(stack)
-                error_info = ErrorInfo(
-                    error_code=1,
-                    error_message=trace,
-                )
+                error_info = format_truncated_traceback(exc, exc_value, tb)
 
         # Resolve the status of this step (i.e. fail if children failed) and propagate the result to the parent step.
         result = self.report_context.resolve_and_propagate_step_result(
@@ -414,6 +430,27 @@ def update_step_from_result(
         return result
 
     def __exit__(self, exc, exc_value, tb):
+        if getattr(self, "_sift_managed_externally", False):
+            # The pytest fixture already resolved status from phase reports.
+            # Run the standard propagation so the parent step sees this step's
+            # pass/fail, emit one update_step with the resolved values, and pop
+            # from the stack without re-classifying.
+            assert self.current_step is not None
+            result = self.report_context.resolve_and_propagate_step_result(
+                self.current_step, self.current_step.error_info
+            )
+            self.current_step.update(
+                {
+                    "status": self.current_step.status,
+                    "end_time": datetime.now(timezone.utc),
+                    "error_info": self.current_step.error_info,
+                },
+            )
+            self.report_context.exit_step(self.current_step)
+            if hasattr(self, "force_result"):
+                result = self.force_result
+            return result
+
         result = self.update_step_from_result(exc, exc_value, tb)
 
         # Now that the step is updated. Let the report context handle removing it from the stack and updating the report context.
diff --git a/python/lib/sift_client/util/test_results/pytest_util.py b/python/lib/sift_client/util/test_results/pytest_util.py
index a96a47fb3..62ec5bba0 100644
--- a/python/lib/sift_client/util/test_results/pytest_util.py
+++ b/python/lib/sift_client/util/test_results/pytest_util.py
@@ -2,12 +2,14 @@
 
 from datetime import datetime, timezone
 from pathlib import Path
+from types import SimpleNamespace
 from typing import TYPE_CHECKING, Any, Generator
 
 import pytest
 
-from sift_client.sift_types.test_report import TestStatus
+from sift_client.sift_types.test_report import ErrorInfo, TestStatus
 from sift_client.util.test_results import ReportContext
+from sift_client.util.test_results.context_manager import format_truncated_traceback
 
 if TYPE_CHECKING:
     from sift_client.client import SiftClient
@@ -59,17 +61,151 @@ def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool |
     return Path(raw)
 
 
+def _error_info_from_longrepr(longrepr: Any) -> ErrorInfo:
+    """Fall back to the report's longrepr when no Python exception is available."""
+    return ErrorInfo(error_code=1, error_message=str(longrepr) if longrepr is not None else "")
+
+
+def _resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
+    """Resolve the function step's status from pytest's per-phase reports.
+
+    Reads ``_sift_phase_setup`` / ``_sift_phase_call`` and the test's xfail marker,
+    then mutates ``new_step.current_step`` in place and flips
+    ``new_step._sift_managed_externally`` so ``NewStep.__exit__`` emits the
+    resolved status without re-classifying.
+
+    When the call phase reports ``passed`` and no override is needed (i.e. the
+    test's own status or substep failures should drive the result), this leaves
+    the step alone so the default ``__exit__`` resolution stays in charge.
+    """
+    setup_phase = getattr(item, "_sift_phase_setup", None)
+    call_phase = getattr(item, "_sift_phase_call", None)
+    xfail_marker = item.get_closest_marker("xfail")
+    xfail_runs = xfail_marker.kwargs.get("run", True) if xfail_marker is not None else True
+
+    status: TestStatus | None = None
+    error_info: ErrorInfo | None = None
+    keep_managed = False
+
+    if setup_phase is not None and setup_phase.report.outcome == "failed":
+        status = TestStatus.ERROR
+        excinfo = setup_phase.call.excinfo
+        if excinfo is not None:
+            error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+        else:
+            error_info = _error_info_from_longrepr(setup_phase.report.longrepr)
+    elif setup_phase is not None and setup_phase.report.outcome == "skipped":
+        status = TestStatus.SKIPPED
+    elif call_phase is None:
+        # Setup completed but the call-phase report never fired — the inner
+        # pytester session was aborted (e.g. by KeyboardInterrupt) before the
+        # plugin could observe the outcome. Leave the step at IN_PROGRESS so
+        # the report does not lie about a clean pass.
+        keep_managed = True
+    else:
+        wasxfail = getattr(call_phase.report, "wasxfail", None)
+        if wasxfail is not None:
+            if call_phase.report.outcome == "failed":
+                # Strict xpass: pytest synthesizes a failure when an xfail(strict=True)
+                # test unexpectedly passes. The xfail mark no longer matches reality.
+                status = TestStatus.FAILED
+            elif call_phase.report.outcome == "skipped":
+                if xfail_marker is not None and xfail_runs is False:
+                    # xfail(run=False): the test body never executed.
+                    status = TestStatus.SKIPPED
+                else:
+                    # xfail + expected failure: the test fulfilled its xfail expectation.
+                    status = TestStatus.PASSED
+            else:
+                # Non-strict xpass: passes that weren't required to fail.
+                status = TestStatus.PASSED
+        elif call_phase.report.outcome == "passed":
+            # Default __exit__ resolves PASSED/FAILED from open_step_results and any
+            # status the test code may have set. Don't override it here.
+            return
+        elif call_phase.report.outcome == "skipped":
+            status = TestStatus.SKIPPED
+        elif call_phase.report.outcome == "failed":
+            excinfo = call_phase.call.excinfo
+            if excinfo is None:
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, AssertionError):
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, pytest.fail.Exception):
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, KeyboardInterrupt):
+                # Hold status at IN_PROGRESS; a session-aborting interrupt should
+                # not look like a clean pass. Keep the managed flag so the default
+                # exit path does not coerce IN_PROGRESS to PASSED.
+                keep_managed = True
+            elif isinstance(excinfo.value, SystemExit):
+                status = TestStatus.ERROR
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            elif xfail_marker is not None:
+                # xfail(raises=X) with a non-matching exception: the contract failed.
+                status = TestStatus.FAILED
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            else:
+                status = TestStatus.ERROR
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+
+    if status is None and not keep_managed:
+        return
+
+    assert new_step.current_step is not None
+    if status is not None:
+        # BaseType is frozen; mutate via __dict__ the same way _apply_client_to_instance does.
+        new_step.current_step.__dict__["status"] = status
+        if error_info is not None:
+            new_step.current_step.__dict__["error_info"] = error_info
+    new_step._sift_managed_externally = True
+
+
+def _finalize_after_teardown(item: pytest.Item, teardown_report: pytest.TestReport) -> None:
+    """Upgrade a closed step to FAILED when the teardown phase failed.
+
+    The autouse step fixture has already exited by the time the teardown
+    makereport hook fires, so call ``step.update`` again to override the status
+    server-side and propagate the failure to the still-open parent step.
+    """
+    step: NewStep | None = getattr(item, "_sift_step", None)
+    if step is None:
+        return
+    assert step.current_step is not None
+    if (
+        teardown_report.outcome == "failed"
+        and step.current_step.status == TestStatus.PASSED
+    ):
+        step.current_step.update({"status": TestStatus.FAILED})
+        step.report_context.mark_step_failed_after_close(step.current_step)
+
+
 @pytest.hookimpl(tryfirst=True, hookwrapper=True)
 def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
-    """You should import this hook to capture any AssertionErrors that occur during the test. If not included, any assert failures in a test will not automatically fail the step."""
+    """Capture per-phase reports and finalize step status after teardown.
+
+    Importing this hook is required so the Sift plugin can map pytest's
+    setup/call/teardown outcomes onto the step's ``TestStatus``.
+    """
     outcome = yield
     report = outcome.get_result()
-    if report.outcome == "skipped":
-        # Skipped steps won't invoke the method/fixtures at all, so we need to manually record a step.
-        if REPORT_CONTEXT:
-            with REPORT_CONTEXT.new_step(name=item.name) as new_step:
-                new_step.current_step.update({"status": TestStatus.SKIPPED})
-    setattr(item, "rep_" + report.when, call)
+    setattr(item, "_sift_phase_" + report.when, SimpleNamespace(call=call, report=report))
+
+    # Collection-time skip (``@pytest.mark.skip`` / ``skipif``): the autouse
+    # ``step`` fixture never runs, so the hook is the only place that can
+    # record a step. Strictly gated so it does not duplicate steps the fixture
+    # already created (presence of ``_sift_step`` is the "fixture ran" signal).
+    if (
+        REPORT_CONTEXT
+        and report.when == "setup"
+        and report.outcome == "skipped"
+        and getattr(item, "_sift_step", None) is None
+    ):
+        with REPORT_CONTEXT.new_step(name=item.name) as inline_step:
+            inline_step.current_step.update({"status": TestStatus.SKIPPED})
+
+    if report.when == "teardown":
+        _finalize_after_teardown(item, report)
 
 
 def _report_context_impl(
@@ -136,20 +272,20 @@ def report_context(
 
 
 def _step_impl(
-    report_context: ReportContext, request: pytest.FixtureRequest
+    report_context: ReportContext,
+    request: pytest.FixtureRequest,
+    phase_aware: bool = True,
 ) -> Generator[NewStep | None, None, None]:
     name = str(request.node.name)
     existing_docstring = request.node.obj.__doc__ or None
     with report_context.new_step(
         name=name, description=existing_docstring, assertion_as_fail_not_error=False
     ) as new_step:
+        if phase_aware:
+            setattr(request.node, "_sift_step", new_step)
         yield new_step
-        if hasattr(request.node, "rep_call") and request.node.rep_call.excinfo:
-            new_step.update_step_from_result(
-                request.node.rep_call.excinfo,
-                request.node.rep_call.excinfo.value,
-                request.node.rep_call.excinfo.tb,
-            )
+        if phase_aware:
+            _resolve_initial_status(new_step, request.node)
 
 
 @pytest.fixture(autouse=True)
@@ -168,7 +304,7 @@ def step(
     ):
         yield None
         return
-    yield from _step_impl(report_context, request)
+    yield from _step_impl(report_context, request, phase_aware=True)
 
 
 @pytest.fixture(scope="module", autouse=True)
@@ -187,7 +323,7 @@ def module_substep(
     ):
         yield None
         return
-    yield from _step_impl(report_context, request)
+    yield from _step_impl(report_context, request, phase_aware=False)
 
 
 @pytest.fixture(scope="session")

From dc1c9e63ac9703fb7dcb5abd61ebcaf4aae21541 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Sun, 17 May 2026 17:13:34 -0700
Subject: [PATCH 14/15] format

---
 python/lib/sift_client/util/test_results/pytest_util.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/python/lib/sift_client/util/test_results/pytest_util.py b/python/lib/sift_client/util/test_results/pytest_util.py
index 62ec5bba0..2399471e9 100644
--- a/python/lib/sift_client/util/test_results/pytest_util.py
+++ b/python/lib/sift_client/util/test_results/pytest_util.py
@@ -172,10 +172,7 @@ def _finalize_after_teardown(item: pytest.Item, teardown_report: pytest.TestRepo
     if step is None:
         return
     assert step.current_step is not None
-    if (
-        teardown_report.outcome == "failed"
-        and step.current_step.status == TestStatus.PASSED
-    ):
+    if teardown_report.outcome == "failed" and step.current_step.status == TestStatus.PASSED:
         step.current_step.update({"status": TestStatus.FAILED})
         step.report_context.mark_step_failed_after_close(step.current_step)
 
@@ -282,7 +279,7 @@ def _step_impl(
         name=name, description=existing_docstring, assertion_as_fail_not_error=False
     ) as new_step:
         if phase_aware:
-            setattr(request.node, "_sift_step", new_step)
+            request.node._sift_step = new_step
         yield new_step
         if phase_aware:
             _resolve_initial_status(new_step, request.node)

From 362989fad2520910eb553bb5397c63b4181b212c Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Sun, 17 May 2026 17:58:34 -0700
Subject: [PATCH 15/15] reorganize tests

---
 .../_tests/{util => pytest_plugin}/_step_status_capture.py  | 0
 .../_tests/{util => pytest_plugin}/step_status_states.md    | 0
 .../test_pass_fail.py}                                      | 6 +++---
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename python/lib/sift_client/_tests/{util => pytest_plugin}/_step_status_capture.py (100%)
 rename python/lib/sift_client/_tests/{util => pytest_plugin}/step_status_states.md (100%)
 rename python/lib/sift_client/_tests/{util/test_step_status_states.py => pytest_plugin/test_pass_fail.py} (98%)

diff --git a/python/lib/sift_client/_tests/util/_step_status_capture.py b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
similarity index 100%
rename from python/lib/sift_client/_tests/util/_step_status_capture.py
rename to python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
diff --git a/python/lib/sift_client/_tests/util/step_status_states.md b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
similarity index 100%
rename from python/lib/sift_client/_tests/util/step_status_states.md
rename to python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
diff --git a/python/lib/sift_client/_tests/util/test_step_status_states.py b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
similarity index 98%
rename from python/lib/sift_client/_tests/util/test_step_status_states.py
rename to python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
index 03fe0943f..930a48833 100644
--- a/python/lib/sift_client/_tests/util/test_step_status_states.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
@@ -9,7 +9,7 @@
 Assertions encode the contract from
 ``docs/guides/pytest_plugin/pass_fail_behavior.md``. Tests for scenarios the
 plugin does not yet handle correctly are expected to **fail today** — they
-are the punch list. ``lib/sift_client/_tests/util/step_status_states.md``
+are the punch list. ``lib/sift_client/_tests/pytest_plugin/step_status_states.md``
 tracks each scenario's observed-today behavior next to the target so the
 remaining gaps are visible without running the suite.
 """
@@ -20,7 +20,7 @@
 
 import pytest
 
-from sift_client._tests.util import _step_status_capture as capture
+from sift_client._tests.pytest_plugin import _step_status_capture as capture
 from sift_client.sift_types.test_report import TestStatus
 
 pytest_plugins = ["pytester"]
@@ -42,7 +42,7 @@
 # Bring the Sift fixtures + the makereport hook into this inner session.
 from sift_client.util.test_results import *  # noqa: F401,F403
 
-from sift_client._tests.util._step_status_capture import CAPTURED_STEPS, CapturedStep
+from sift_client._tests.pytest_plugin._step_status_capture import CAPTURED_STEPS, CapturedStep
 from sift_client.sift_types.test_report import (
     TestMeasurement,
     TestReport,