diff --git a/.githooks/pre-push-python/extras.sh b/.githooks/pre-push-python/extras.sh
index 56cb00620..515c5614e 100755
--- a/.githooks/pre-push-python/extras.sh
+++ b/.githooks/pre-push-python/extras.sh
@@ -1,5 +1,8 @@
 # ensure generated pyproject.toml extras are up-to-date
 
+# Clear git env vars set by the parent hook so git commands resolve the work tree normally
+unset GIT_DIR GIT_WORK_TREE GIT_INDEX_FILE GIT_PREFIX
+
 # Store the root directory of the repository
 REPO_ROOT="$(git rev-parse --show-toplevel)"
 PYTHON_DIR="$REPO_ROOT/python"
diff --git a/.githooks/pre-push-python/fmt-lint.sh b/.githooks/pre-push-python/fmt-lint.sh
index f112c7f09..f5261fbdb 100644
--- a/.githooks/pre-push-python/fmt-lint.sh
+++ b/.githooks/pre-push-python/fmt-lint.sh
@@ -2,6 +2,9 @@
 
 set -e
 
+# Clear git env vars set by the parent hook so git commands resolve the work tree normally
+unset GIT_DIR GIT_WORK_TREE GIT_INDEX_FILE GIT_PREFIX
+
 # Store the root directory of the repository
 REPO_ROOT="$(git rev-parse --show-toplevel)"
 PYTHON_DIR="$REPO_ROOT/python"
diff --git a/.githooks/pre-push-python/stubs.sh b/.githooks/pre-push-python/stubs.sh
index 9ee86fb42..8e72fd426 100644
--- a/.githooks/pre-push-python/stubs.sh
+++ b/.githooks/pre-push-python/stubs.sh
@@ -1,5 +1,8 @@
 # ensure generated python stubs are up-to-date, from sync clients
 
+# Clear git env vars set by the parent hook so git commands resolve the work tree normally
+unset GIT_DIR GIT_WORK_TREE GIT_INDEX_FILE GIT_PREFIX
+
 # Store the root directory of the repository
 REPO_ROOT="$(git rev-parse --show-toplevel)"
 PYTHON_DIR="$REPO_ROOT/python"
diff --git a/python/docs/examples/index.md b/python/docs/examples/index.md
index b6a964b35..936a35cfd 100644
--- a/python/docs/examples/index.md
+++ b/python/docs/examples/index.md
@@ -7,6 +7,7 @@ This section contains interactive Jupyter notebook examples demonstrating how to
 - **[Basic Usage](basic.ipynb)** - Introduction to the Sift Python client, covering basic operations and API usage
 - **[Data Ingestion](ingestion.ipynb)** - Learn how to ingest telemetry data into Sift using various methods
 - **[Pytest Plugin](pytest_plugin.md)** - Turn a pytest run into a Sift TestReport with measurements, nested steps, and pass/fail outcomes
+- **[Pytest Plugin Quickstart](pytest_plugin_quickstart.md)** - Guided tour of the runnable demo project under `python/examples/pytest_plugin/`
 
 ## Running Examples Locally
 
diff --git a/python/docs/examples/pytest_plugin.md b/python/docs/examples/pytest_plugin.md
index cf56dd75e..c464e564e 100644
--- a/python/docs/examples/pytest_plugin.md
+++ b/python/docs/examples/pytest_plugin.md
@@ -9,11 +9,13 @@ This page walks through wiring the plugin into a project, the fixtures and
 hooks it provides, and the patterns you'll use day-to-day.
 
 !!! info "Where the plugin lives"
-    The plugin is part of `sift_client.util.test_results`. It is **not**
-    registered as a `pytest11` entry point. Projects opt in with a
-    `from sift_client.util.test_results import *` in their `conftest.py`.
-    That import is what wires up the fixtures, the CLI options, and the
-    `pytest_runtest_makereport` hook.
+    The plugin lives at `sift_client.pytest_plugin`. It is
+    **not** registered as a `pytest11` entry point. Projects opt in with a
+    `pytest_plugins` declaration in their top-level `conftest.py`. Pytest
+    then loads the module as a real plugin: the fixtures, CLI options, and
+    `pytest_runtest_makereport` hook all register through standard pytest
+    machinery, so `pytest --trace-config` lists it and
+    `pytest -p no:sift_client.pytest_plugin` disables it.
 
 ## Install
 
@@ -33,9 +35,26 @@ The `SIFT_GRPC_URI` and `SIFT_REST_URI` are the gRPC and REST endpoints for your
 
 ## Wire the plugin into `conftest.py`
 
-Two things are required: a session-scoped `sift_client` fixture (the plugin's
-`report_context` fixture resolves it by name), and a star-import that registers
-the plugin's fixtures into the conftest's namespace.
+A single `pytest_plugins` declaration in your top-level `conftest.py` is all
+that's required. The plugin ships a default `sift_client` fixture that reads
+`SIFT_API_KEY`, `SIFT_GRPC_URI`, and `SIFT_REST_URI` from the environment.
+
+```python title="conftest.py"
+from dotenv import load_dotenv
+
+load_dotenv()
+
+pytest_plugins = ["sift_client.pytest_plugin"]
+```
+
+That's the whole setup. Every test in the session will now create a step on a
+single shared `TestReport`.
+
+### Customizing the `SiftClient`
+
+To construct the client differently (custom TLS, timeouts, alternate
+credentials, etc.), override the `sift_client` fixture in your conftest. The
+plugin's default falls away in favor of your definition.
 
 ```python title="conftest.py"
 import os
@@ -45,58 +64,121 @@ from dotenv import load_dotenv
 
 from sift_client import SiftClient, SiftConnectionConfig
 
-# Star-import wires fixtures + hooks + CLI options into pytest collection.
-from sift_client.util.test_results import *
-
 load_dotenv()
 
+pytest_plugins = ["sift_client.pytest_plugin"]
+
 
 @pytest.fixture(scope="session")
 def sift_client() -> SiftClient:
-    grpc_url = os.getenv("SIFT_GRPC_URI")
-    rest_url = os.getenv("SIFT_REST_URI")
-    api_key = os.getenv("SIFT_API_KEY")
-    
     return SiftClient(
         connection_config=SiftConnectionConfig(
-            api_key=api_key,
-            grpc_url=grpc_url,
-            rest_url=rest_url,
+            api_key=os.getenv("SIFT_API_KEY"),
+            grpc_url=os.getenv("SIFT_GRPC_URI"),
+            rest_url=os.getenv("SIFT_REST_URI"),
+            use_ssl=False,
         )
     )
 ```
 
-That's the whole setup. Every test in the session will now create a step on a
-single shared `TestReport`.
-
 ## Plugin provided fixtures
 
 | Name | Kind | Scope | Purpose |
 |---|---|---|---|
 | `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
 | `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, and `current_step`. |
-| `module_substep` | fixture (autouse) | module | One step per test file with each function nested as a substep. |
-| `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted only when `--sift-test-results-check-connection` is set. |
+| `_hierarchy_parents` | internal fixture (autouse) | function | Opens a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor of the current test. Each layer is gated independently by its own ini key — see the ini key table under [CLI options](#cli-options). |
+| `_parametrize_parents` | internal fixture (autouse) | function | Opens a parent step for each `@pytest.mark.parametrize` axis (and fixture parametrization), nested inside the hierarchy parents. |
+| `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted by `report_context` at session start in online mode (the default). Override to skip the ping or use a different reachability signal. |
 
 ### CLI options
 
 | Flag | Default | Effect |
 |---|---|---|
-| `--sift-test-results-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. |
-| `--no-sift-test-results-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
-| `--sift-test-results-check-connection` | off | Make `report_context`, `step`, and `module_substep` no-op (yield `None`) when `client_has_connection` is `False`. Lets the same suite run locally without a Sift backend. |
+| `--sift-offline` | off (online) | Skip the session-start ping and don't contact Sift. All create/update calls go to the JSONL log file for later replay via `import-test-result-log`. Missing `SIFT_*` env vars are tolerated; placeholders are filled. |
+| `--sift-disabled` | off | Skip Sift entirely. Nothing contacts the API and no log file is written; `step.measure(...)` still evaluates bounds and returns a real pass/fail boolean. Also honored via `SIFT_DISABLED=1`. Supersedes every other flag (disabled wins over offline). |
+| `--sift-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. Incompatible with `--sift-offline` since offline mode needs the log file as its sole sink. |
+| `--no-sift-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
+
+These can be passed permanently via `addopts`:
+
+```ini title="pytest.ini"
+[pytest]
+addopts = --sift-offline
+```
+
+Or set the matching ini key directly (recommended for stable per-project
+configuration). Each CLI flag has a corresponding key under
+`[tool.pytest.ini_options]` in `pyproject.toml` or `[pytest]` in `pytest.ini`.
+CLI flags, when passed, override the ini values.
+
+| Ini key | Type | Equivalent CLI flag |
+|---|---|---|
+| `sift_log_file` | string (`true` / `false` / `none` / path) | `--sift-log-file=<value>` |
+| `sift_git_metadata` | bool (default `true`) | `--no-sift-git-metadata` (sets to `false`) |
+| `sift_offline` | bool (default `false`) | `--sift-offline` |
+| `sift_disabled` | bool (default `false`) | `--sift-disabled` (also honors `SIFT_DISABLED` env var) |
+| `sift_autouse` | bool (default `true`) | _(no CLI flag; controls the marker gate below)_ |
+| `sift_package_step` | bool (default `true`) | _(ini-only)_ — open a parent step for each Python package (directory with `__init__.py`) in the test path. |
+| `sift_module_step` | bool (default `true`) | _(ini-only)_ — open a parent step for each test module (file). |
+| `sift_class_step` | bool (default `true`) | _(ini-only)_ — open a parent step for each test class, including nested classes. |
+| `sift_parametrize_nesting` | bool (default `true`) | _(ini-only)_ — cluster parametrized tests under shared parents (`test_x → axis=value`) instead of flat leaves (`test_x[value]`). |
+
+The default `sift_client` fixture reads its two URIs from the environment first
+and falls back to ini keys when the env vars are unset. `SIFT_API_KEY` is
+intentionally env-only — keep it out of source control and supply it through
+`pytest-dotenv` (see [API key handling](#api-key-handling) below). The env
+var wins when both are set, so secrets injected into a CI environment
+continue to override values committed to `pyproject.toml`. There are no CLI
+flags for credentials.
+
+| Ini key | Environment variable | Notes |
+|---|---|---|
+| _(none)_ | `SIFT_API_KEY` | Env-only. Use `.env` + `pytest-dotenv` locally; inject from your secret store in CI. |
+| `sift_grpc_uri` | `SIFT_GRPC_URI` | Stable per-org gRPC endpoint; safe to commit. |
+| `sift_rest_uri` | `SIFT_REST_URI` | Stable per-org REST endpoint; safe to commit. |
 
-These can be set permanently in `pytest.ini`:
+```toml title="pyproject.toml"
+[tool.pytest.ini_options]
+sift_offline = true
+sift_git_metadata = false
+sift_grpc_uri = "your-org.sift.example:443"
+sift_rest_uri = "https://your-org.sift.example"
+```
 
 ```ini title="pytest.ini"
 [pytest]
-addopts = --sift-test-results-check-connection
+sift_offline = true
+sift_git_metadata = false
+sift_grpc_uri = your-org.sift.example:443
+sift_rest_uri = https://your-org.sift.example
 ```
 
+#### API key handling
+
+`SIFT_API_KEY` is deliberately read from the process environment only. The
+recommended workflow uses the
+[`pytest-dotenv`](https://pypi.org/project/pytest-dotenv/) plugin (already a
+dependency of `sift-stack-py`), which loads variables from a `.env` file
+into `os.environ` before tests run.
+
+1. Add `.env` to `.gitignore`.
+2. Drop your key into `.env` at the project root:
+
+    ```bash title=".env"
+    SIFT_API_KEY=sk-...your-key...
+    ```
+
+3. In CI, set `SIFT_API_KEY` directly via your provider's secret manager
+   instead of committing a `.env` file.
+
+`pytest-dotenv` picks the file up automatically; no `pytest_configure`
+glue is needed.
+
 !!! warning "FedRAMP / shared environments"
-    Pass `--sift-test-results-log-file=false` to skip the temp file + worker
-    pipeline. Create/update calls then run inline against the API instead of
-    being deferred through a subprocess.
+    Pass `--sift-log-file=false` (or set the `sift_log_file` ini key to
+    `"false"`) to skip the temp file + worker pipeline. Create/update calls then run
+    inline against the API instead of being deferred through a subprocess.
 
 ### Report metadata captured automatically
 
@@ -107,7 +189,7 @@ Every report the plugin creates includes:
 - `system_operator`: `getpass.getuser()`.
 - `start_time` / `end_time`: set on session enter/exit.
 - `status`: starts at `IN_PROGRESS`, finalized to `PASSED` or `FAILED` on session exit (failure if any step failed or an exception escaped the session).
-- `metadata.git_repo`, `metadata.git_branch`, `metadata.git_commit`: captured via `git remote get-url origin` / `git rev-parse --abbrev-ref HEAD` / `git describe --always --dirty --exclude '*'`. Suppressed by `--no-sift-test-results-git-metadata` or when not in a git repo.
+- `metadata.git_repo`, `metadata.git_branch`, `metadata.git_commit`: captured via `git remote get-url origin` / `git rev-parse --abbrev-ref HEAD` / `git describe --always --dirty --exclude '*'`. Suppressed by `--no-sift-git-metadata` or when not in a git repo.
 
 Example invocations:
 
@@ -122,6 +204,50 @@ metadata), call `report_context.report.update({...})` from any test or
 fixture. See [Linking a Run](#linking-a-run-to-the-report) for the same
 pattern applied to `run_id`.
 
+## Controlling which tests produce reports
+
+By default every test in the session produces a Sift step. Two markers
+and one ini key let you narrow that to a specific set of tests, which is
+useful when a repo holds tests that you don't want included in the Sift test report.
+
+| Setting                                                 | Effect                                                                                       |
+|---------------------------------------------------------|----------------------------------------------------------------------------------------------|
+| `sift_autouse = false` in `pyproject.toml` | Flip the project-wide default off. Tests no longer produce steps unless explicitly opted in. |
+| `@pytest.mark.sift_include` on a test, class, or module | Force reporting on for that scope, regardless of the project default.                        |
+| `@pytest.mark.sift_exclude` on a test, class, or module | Force reporting off for that scope, regardless of the project default.                       |
+
+The closest marker determines the setting. `sift_exclude` beats `sift_include` when both apply.
+A `pytestmark` at the class or module level applies to every test in that scope.
+
+### Bulk-applying a marker to a directory
+
+To opt an entire directory in (or out) without editing each file, hook
+`pytest_collection_modifyitems` in the directory's `conftest.py`:
+
+```python title="tests/example/conftest.py"
+from pathlib import Path
+
+import pytest
+
+_HERE = Path(__file__).parent
+
+
+def pytest_collection_modifyitems(config, items):
+    for item in items:
+        try:
+            item.path.relative_to(_HERE)
+        except ValueError:
+            continue
+        item.add_marker(pytest.mark.sift_include)
+```
+
+This applies `sift_include` to every test collected under `tests/example/`.
+Combine it with `sift_autouse = false` in `pyproject.toml` to
+opt in only specific directories.
+
+`pytest_collection_modifyitems` receives every item in the session, not just
+this directory's, so the `relative_to` filter is what scopes the marker.
+
 ## Basic usage
 
 With the conftest in place, the simplest test needs nothing extra. The `step`
@@ -181,8 +307,8 @@ outcomes into `TestStatus`:
 | Manual `step.current_step.update({"status": ...})` | Whatever you set; the step exit handler honors a manually-resolved status |
 
 A failure or error at any depth propagates upward: the parent substep, the
-function step, the module step (if `module_substep` is active), and the
-session report all get marked failed.
+function step, the class/module/package steps above it, and the session
+report all get marked failed.
 
 ## Nested steps
 
@@ -218,12 +344,14 @@ Each step gets a hierarchical `step_path` (`1`, `1.1`, `1.1.2`, `2`, …)
 assigned by `ReportContext`. Sibling substeps within the same parent
 auto-increment; opening a new top-level step starts a new branch.
 
-### One step per file
+### Mirroring the test layout
 
-`module_substep` is autouse and module-scoped. When it's active (it's pulled
-in by the star-import in `conftest.py`), each file becomes a parent step and
-every function in it nests one level down. Its name is the test file's
-basename and its description is the module's docstring (if any).
+The plugin opens a parent step for each Python package (`__init__.py`
+directory), test file, and test class above every test, plus a parent step
+for each `@pytest.mark.parametrize` axis. Every layer is on by default and
+can be turned off individually via ini keys (`sift_package_step`, `sift_module_step`,
+`sift_class_step`, `sift_parametrize_nesting`). Class/module/package
+docstrings become the matching step's description.
 
 ### Linking a Run to the report
 
@@ -263,50 +391,43 @@ TestReport
 └── test_temperature
 ```
 
-### One step per file with `module_substep`
+### Modules nested under a package
 
-`module_substep` is autouse and module-scoped. Every file becomes a parent
-step and every function in it nests one level down.
+Two test files under the same Python package (directory with `__init__.py`)
+share that package step as their parent.
 
-```python title="test_battery.py"
+```python title="suites/__init__.py"
+```
+
+```python title="suites/test_battery.py"
 def test_voltage(step): ...
 def test_current(step): ...
 ```
 
-```python title="test_thermal.py"
+```python title="suites/test_thermal.py"
 def test_idle_temp(step): ...
 def test_load_temp(step): ...
 ```
 
 ```text title="Sift report"
 TestReport
-├── test_battery.py
-│   ├── test_voltage
-│   └── test_current
-└── test_thermal.py
-    ├── test_idle_temp
-    └── test_load_temp
+└── suites
+    ├── test_battery.py
+    │   ├── test_voltage
+    │   └── test_current
+    └── test_thermal.py
+        ├── test_idle_temp
+        └── test_load_temp
 ```
 
-### Test classes
+### Test classes (and nested classes)
 
-Pytest classes (`class TestFoo: ...`) do not create a parent step on their
-own. The plugin keys off the test node's `name`, which is just the method
-name. To group a class's methods under a class-level step, add a class-scoped
-fixture that opens a step with `report_context.new_step(...)`:
+`class TestFoo:` and `class TestOuter: class TestInner:` produce class and
+nested class steps automatically — no manual fixture needed.
 
 ```python title="test_charging.py"
-import pytest
-
-
 class TestCharging:
-    @pytest.fixture(scope="class", autouse=True)
-    def class_step(self, report_context):
-        with report_context.new_step(
-            name="TestCharging",
-            description="Charging subsystem",
-        ) as parent:
-            yield parent
+    """Charging subsystem."""
 
     def test_starts_at_zero(self, step): ...
     def test_reaches_full(self, step): ...
@@ -315,23 +436,20 @@ class TestCharging:
 
 ```text title="Sift report"
 TestReport
-└── TestCharging
-    ├── test_starts_at_zero
-    ├── test_reaches_full
-    └── test_thermal_throttle
+└── test_charging.py
+    └── TestCharging
+        ├── test_starts_at_zero
+        ├── test_reaches_full
+        └── test_thermal_throttle
 ```
 
-!!! note "Combining with `module_substep`"
-    `module_substep` and a class-scoped step both open at module/class scope,
-    so they each grab the next sibling slot under the report and the inner
-    one nests under the outer. If you want both layers (file → class →
-    method), make the class step itself open via the active outer step
-    rather than the report root.
+The class's docstring becomes the step description.
 
 ### Parametrized tests
 
-Each parametrize case is a distinct pytest node, so each gets its own step.
-The step name includes the parameter id pytest generates.
+Parametrized tests cluster under a parent step named after the test function,
+with one nested level per parametrize axis (outermost first, following the
+decorators from top to bottom). Stacked parametrize produces nested step levels.
 
 ```python
 @pytest.mark.parametrize("voltage", [3.3, 5.0, 12.0])
@@ -341,11 +459,36 @@ def test_rail(step, voltage):
 
 ```text title="Sift report"
 TestReport
-├── test_rail[3.3]
-├── test_rail[5.0]
-└── test_rail[12.0]
+└── test_module.py
+    └── test_rail
+        ├── voltage=3.3
+        ├── voltage=5.0
+        └── voltage=12.0
+```
+
+Stacked parametrize:
+
+```python
+@pytest.mark.parametrize("voltage", ["high", "low"])
+@pytest.mark.parametrize("component", ["motor", "valve"])
+def test_iso(step, voltage, component): ...
+```
+
+```text title="Sift report"
+TestReport
+└── test_module.py
+    └── test_iso
+        ├── voltage='high'
+        │   ├── component='motor'
+        │   └── component='valve'
+        └── voltage='low'
+            ├── component='motor'
+            └── component='valve'
 ```
 
+Set `sift_parametrize_nesting = false` in `pytest.ini` to fall back to flat
+leaf names (`test_rail[3.3]`).
+
 ### Helper functions
 
 Helpers called from a test do not auto-create a step. The plugin only sees
@@ -536,151 +679,129 @@ The `unit` argument is a free-form string label (e.g. `"V"`, `"C"`, `"psi"`).
 pytest
 
 # Pin the log file so you can replay it later if the import worker dies
-pytest --sift-test-results-log-file=./sift-results.jsonl
+pytest --sift-log-file=./sift-results.jsonl
 ```
 
-See [Running offline](#running-offline) for the same suite running with or
-without a reachable Sift server.
+See [Running modes](#running-modes) for the offline and disabled flags,
+which let the same suite run without contacting Sift or without Sift at all.
 
-## Running offline
+## Running modes
 
-The plugin supports two offline workflows, depending on whether you want a
-Sift report at all when the test environment can't reach Sift. The first
-turns the plugin into a no-op when the server is unreachable. The second
-keeps the plugin running normally and writes every create/update to a local
-JSONL file that you upload from a connected machine afterward.
+The plugin runs in one of three modes, picked at invocation:
 
-| Pattern | Flag | Runtime behavior | Follow-up |
-|---|---|---|---|
-| Skip when offline | `--sift-test-results-check-connection` | Fixtures yield `None`, no log file, no report. Pytest still reports pass/fail. | None. |
-| Capture locally, upload later | `--sift-test-results-log-file=<path>` | Plugin writes every create/update to the JSONL file. | `import-test-result-log <path>` from a connected machine. |
+| Mode | Flag | Network | Log file | `step.measure(...)` | When to use |
+|---|---|---|---|---|---|
+| Online (default) | _(none)_ | yes (pings at session start, aborts if it fails) | optional write-through backup | real measurement against Sift | CI with Sift credentials, local dev hitting your tenant |
+| Offline | `--sift-offline` | none | required (the sole sink) | real measurement queued to log | field tests, air-gapped labs, CI without network |
+| Disabled | `--sift-disabled` | none | none | bounds eval; returns a real bool | local dev or CI that doesn't have (or want) Sift |
 
-Pattern 1 suits laptop dev and CI without Sift secrets. Pattern 2 suits
-field tests, vehicles on remote sites, and air-gapped labs.
+Pass both flags? Disabled wins. It's the "skip Sift entirely" hammer and
+supersedes everything else.
 
-### Pattern 1: skip when offline
+### Online mode (default)
 
-`--sift-test-results-check-connection` makes the plugin ping Sift once at
-session start through the `client_has_connection` fixture (which by default
-calls `sift_client.ping.ping()`). On a failed ping, `report_context`,
-`step`, and `module_substep` yield `None` for the rest of the session.
-Pytest still runs the tests and still reports pass/fail.
+`report_context` resolves `client_has_connection` at session start. The
+default implementation calls `sift_client.ping.ping()`. A failed ping
+aborts the whole session with `pytest.UsageError` and points at
+`--sift-offline` and `--sift-disabled` as escape hatches.
 
-```bash
-pytest --sift-test-results-check-connection
-```
+This is loud on purpose. A CI run that silently no-ops on a flaky network
+won't get noticed until somebody goes looking for the report, which is
+usually weeks later, which is usually too late.
 
-```ini title="pytest.ini"
-[pytest]
-addopts = --sift-test-results-check-connection
-```
+With the default `--sift-log-file` setting on, create/update calls are
+written to a JSONL log file during the run and an
+`import-test-result-log --incremental` worker replays them against Sift
+in the background. If the worker crashes mid-session (connection failure,
+API error) or is still draining its backlog at session end, the failure
+is logged at session end with an `import-test-result-log` command for
+manual recovery — test outcomes are unaffected and the local log file is
+preserved. Pass `--sift-log-file=false` to make every create/update
+synchronous against the API instead.
 
-#### Handling `None` in tests
+#### Overriding the connection check
 
-Calls on `step` raise `AttributeError` when it's `None`, so tests that take
-`step` as a parameter need a guard. The cleanest fix is to shadow the
-plugin's `step` fixture in your conftest and turn the `None` case into an
-automatic skip.
+Override `client_has_connection` when ping isn't the right signal, for
+example a token cache that's only warm when authenticated:
 
 ```python title="conftest.py"
-import pytest
+from pathlib import Path
 
-from sift_client.util.test_results import *
+import pytest
 
 
-@pytest.fixture(autouse=True)
-def step(step):
-    if step is None:
-        pytest.skip("Sift unavailable")
-    yield step
+@pytest.fixture(scope="session")
+def client_has_connection(sift_client) -> bool:
+    return Path("~/.sift-token-cache").expanduser().is_file()
 ```
 
-The `step` parameter on the override resolves to the plugin's fixture, not
-to the override itself. `autouse=True` is required so the skip applies to
-tests that don't request `step` directly. The same shadowing trick works
-for `module_substep` and `report_context`.
-
-For one-off tests that don't share a conftest, an inline guard works just
-as well:
+The override is ignored under `--sift-offline` and `--sift-disabled`.
 
-```python
-def test_battery_voltage(step):
-    if step is None:
-        pytest.skip("Sift unavailable")
-    step.measure(name="battery_voltage", value=4.97, bounds={"min": 4.8, "max": 5.2})
-```
+### Offline mode (`--sift-offline`)
 
-If you'd rather have tests pass through silently than skip them, wrap the
-calls in a helper that no-ops on `None`:
+Same fixtures, same `step.measure(...)` semantics as online. The
+difference is where the writes go: every create/update lands in a JSONL
+log file instead of hitting the Sift API. The session-start ping is
+skipped, missing `SIFT_*` env vars are tolerated (placeholders are
+filled in), and the replay worker (`import-test-result-log --incremental`)
+is not spawned.
 
-```python
-def safe_measure(step, **kwargs):
-    if step is None:
-        return True
-    return step.measure(**kwargs)
+```bash
+pytest --sift-offline --sift-log-file=./run.jsonl
 ```
 
-#### Overriding the connection check
+Once you have connectivity, replay it:
 
-The default `client_has_connection` fixture calls `sift_client.ping.ping()`.
-Override it in your conftest if pinging is the wrong signal for your
-environment, for example a token cache that's only warm when authenticated:
+```bash
+import-test-result-log ./run.jsonl
+```
 
-```python title="conftest.py"
-from pathlib import Path
+That replay creates the report, steps, and measurements against Sift.
+See [Replaying a saved log file](#replaying-a-saved-log-file) for cleanup
+and the incremental flag.
 
-import pytest
+Disabling the log file (`--sift-log-file=false` or `none`) is rejected when
+offline is set. The log file is the only sink in offline mode, so without it
+the results would be lost.
 
+!!! warning "Pin the log path"
+    Without `--sift-log-file=<path>`, offline mode writes to
+    a `tempfile.NamedTemporaryFile` and only surfaces the path via a
+    `logger.info` line. Pin a known path when you intend to replay later.
 
-@pytest.fixture(scope="session")
-def client_has_connection(sift_client) -> bool:
-    return Path("~/.sift-token-cache").expanduser().is_file()
-```
+### Disabled mode (`--sift-disabled`)
 
-The plugin only consults this fixture when `--sift-test-results-check-connection`
-is set, so an unused override has no effect on a normal run.
+The plugin stays loaded with the same fixtures and markers as the other
+modes. Nothing contacts Sift, no log file is written, and no `SIFT_*`
+env vars are required. `step.measure(...)`, `step.measure_avg(...)`,
+`step.measure_all(...)`, `step.substep(...)`, and
+`report_context.report.update({...})` all behave normally — bounds
+evaluate and you get a real pass/fail boolean back.
 
-### Pattern 2: capture locally, upload later
+Entities returned in disabled mode report `is_simulated == True` (on
+`TestReport`, `TestStep`, `TestMeasurement`, and `ReportContext`) so
+consumers and tests can branch on provenance. Offline-mode entities
+also report `is_simulated == True`.
 
-This pattern keeps the plugin running normally even when Sift is
-unreachable. The plugin writes to the log file, the worker dies on connect,
-and the file is left on disk for you to upload later. Pin the log file path
-so you can find it afterward, and don't pass
-`--sift-test-results-check-connection`, which would suppress the logging
-this pattern relies on.
+How to turn it on, in the order most projects pick:
 
 ```bash
-pytest --sift-test-results-log-file=./run.jsonl
-```
-
-What happens during the run:
-
-- Every report, step, and measurement create/update is written to
-  `run.jsonl`. The plugin doesn't contact the Sift API for any of these
-  calls; they return simulated responses keyed by UUIDs that the replay
-  later maps to real IDs.
-- The `import-test-result-log --incremental` worker subprocess starts and
-  exits early when it can't reach Sift. The session does not fail when the
-  worker exits before the run ends.
-- Tests run against a real `step` fixture, so `step.measure(...)`,
-  substeps, parametrize, fixtures, and `module_substep` behave exactly as
-  they do online. No conftest changes are needed.
+# In an .envrc, devcontainer, or CI job config
+export SIFT_DISABLED=1
 
-Once you have connectivity, replay the file:
+# Per-invocation kill-switch
+pytest --sift-disabled
 
-```bash
-import-test-result-log ./run.jsonl
+# Per-project default (uncommon; online is usually the right default)
+# pyproject.toml:
+#   [tool.pytest.ini_options]
+#   sift_disabled = true
 ```
 
-The replay creates the report, steps, and measurements against Sift in one
-batch. See [Replaying a saved log file](#replaying-a-saved-log-file) for
-details on cleanup and the incremental flag.
-
-!!! warning "Pin the log path for Pattern 2"
-    Without `--sift-test-results-log-file=<path>`, the plugin writes to a
-    `tempfile.NamedTemporaryFile` and only surfaces the path via a
-    `logger.info` line. Always pin a known path when you intend to replay
-    the file later.
+Disabled mode is a good fit for local dev without Sift credentials and for
+library consumers who don't have a Sift tenant. It is also useful in CI for
+runs that shouldn't add noise to the report stream, like a PR job re-running
+the same suite five times in a row.
 
 ## Replaying a saved log file
 
diff --git a/python/docs/examples/pytest_plugin_quickstart.md b/python/docs/examples/pytest_plugin_quickstart.md
new file mode 100644
index 000000000..54328c707
--- /dev/null
+++ b/python/docs/examples/pytest_plugin_quickstart.md
@@ -0,0 +1,177 @@
+# Pytest Plugin Quickstart
+
+A walkthrough of the runnable demo at
+[`python/examples/pytest_plugin/`](https://github.com/sift-stack/sift/tree/main/python/examples/pytest_plugin).
+The demo is a self-contained pytest project that exercises every layer of the
+plugin's step tree: packages, modules, classes (including nested), parametrize
+axes, manual substeps, and gate markers. It also includes a tests directory
+that uses no Sift APIs at all, to show how the autouse fixtures capture plain
+pytest tests for free.
+
+For a conceptual reference (fixtures, ini flags, status semantics), see
+[Pytest Plugin](pytest_plugin.md).
+
+## Project layout
+
+```
+examples/pytest_plugin/
+├── conftest.py                            # registers the plugin
+├── pytest.ini                             # available ini knobs (all commented at defaults)
+├── .env.example                           # credential template
+└── tests/
+    ├── pytest_only/                       # subpackage step
+    │   ├── __init__.py
+    │   └── test_pytest_only_demo.py       # plain pytest, no Sift APIs
+    └── with_sift/                         # subpackage step
+        ├── __init__.py
+        └── test_with_sift_demo.py         # measurements, substeps, classes, parametrize, gates
+```
+
+Every Python package (directory with `__init__.py`), test file, and test class
+that contains a test becomes its own parent step in the report tree.
+
+## `conftest.py`
+
+A single `pytest_plugins` declaration loads the plugin; `load_dotenv()` is
+optional and just lets the default `sift_client` fixture pick up
+`SIFT_API_KEY` / `SIFT_GRPC_URI` / `SIFT_REST_URI` from a local `.env`.
+
+```python title="conftest.py"
+--8<-- "examples/pytest_plugin/conftest.py"
+```
+
+## `pytest.ini`
+
+Every knob is commented at its default value. Uncomment any line to opt out of
+a layer of the step tree.
+
+```ini title="pytest.ini"
+--8<-- "examples/pytest_plugin/pytest.ini"
+```
+
+## `.env.example`
+
+```bash title=".env.example"
+--8<-- "examples/pytest_plugin/.env.example"
+```
+
+## The pytest_only module
+
+Plain pytest tests with no `sift_client` imports, no `step` fixture, no
+markers. Each one still becomes a leaf step in the report tree. The plugin's
+autouse fixtures capture pass/fail automatically.
+
+```python title="tests/pytest_only/test_pytest_only_demo.py"
+--8<-- "examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py"
+```
+
+## The with_sift module
+
+Exercises the plugin's full surface: numeric / string / bool bounds, nested
+`step.substep`, `@pytest.mark.sift_exclude`, class steps with docstring
+descriptions, nested classes, stacked `@pytest.mark.parametrize`, and
+`step.report_outcome`.
+
+```python title="tests/with_sift/test_with_sift_demo.py"
+--8<-- "examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py"
+```
+
+## Run it
+
+### Without Sift credentials
+
+```bash
+cd python/examples/pytest_plugin
+pytest --sift-disabled -v
+```
+
+`--sift-disabled` makes the plugin a no-op transport: `step.measure(...)`
+still evaluates bounds and returns a real pass/fail boolean, but nothing
+contacts Sift and no log file is written. Useful for previewing the report
+tree or unit-testing measurement logic.
+
+### Against a real Sift org
+
+```bash
+cp .env.example .env
+# Fill in SIFT_API_KEY / SIFT_GRPC_URI / SIFT_REST_URI
+pytest -v
+```
+
+A `TestReport` shows up in Sift once the session finishes.
+
+### Offline (record now, replay later)
+
+```bash
+pytest --sift-offline --sift-log-file=/tmp/sift-demo.jsonl -v
+# Later, from anywhere with credentials:
+import-test-result-log /tmp/sift-demo.jsonl
+```
+
+## Expected report tree
+
+With the plugin's defaults (every layer enabled), the demo produces:
+
+```
+TestReport (FAILED, since failures propagate up from leaves)
+├── pytest_only                         ← package step (FAILED)
+│   └── test_pytest_only_demo.py        ← module step (FAILED)
+│       ├── test_passes                                              PASSED
+│       ├── test_uses_a_pytest_fixture                               PASSED
+│       ├── test_assertion_failure_marks_step_failed                 FAILED
+│       ├── test_skipped                                             SKIPPED
+│       ├── test_unexpected_exception_marks_step_errored             ERROR
+│       ├── test_parametrize_without_step
+│       │   ├── value='v1'                                           PASSED
+│       │   └── value='v2'                                           PASSED
+│       └── TestPytestClass
+│           └── test_method                                          PASSED
+└── with_sift                           ← package step (FAILED)
+    └── test_with_sift_demo.py          ← module step (FAILED)
+        ├── test_measurements                                        PASSED
+        ├── test_substeps                                            PASSED
+        │   ├── phase_1
+        │   └── phase_2
+        │       └── phase_2a
+        │   (test_excluded: @sift_exclude, runs in pytest, NOT in tree)
+        ├── test_measure_series                                      PASSED
+        ├── test_failed_measurement_marks_sift_step_failed           FAILED  (pytest PASSED)
+        ├── test_assert_measurements_passed_at_end                   FAILED  (pytest FAILED)
+        ├── test_report_level_metadata                               PASSED
+        └── TestClassStep
+            ├── test_parametrize
+            │   ├── axis_a='a1'
+            │   │   ├── axis_b='b1'                                  PASSED
+            │   │   └── axis_b='b2'                                  PASSED
+            │   └── axis_a='a2'
+            │       ├── axis_b='b1'                                  PASSED
+            │       └── axis_b='b2'                                  PASSED
+            └── TestNested
+                └── test_report_outcome
+                    └── check                                        PASSED
+```
+
+The `pytest_only` module deliberately includes one failing, one skipped, and
+one erroring test so the demo shows every `TestStatus` mapping (`FAILED` for
+assertions, `SKIPPED` for `pytest.skip`, `ERROR` for any other exception).
+The `with_sift` module shows two patterns for handling measurement results:
+`test_failed_measurement_marks_sift_step_failed` lets the test keep passing
+in pytest while the Sift step is `FAILED` (useful when measurements are
+diagnostic data you want to collect regardless of outcome); and
+`test_assert_measurements_passed_at_end` takes every measurement first and
+then asserts `step.measurements_passed` once at the end, so every
+measurement still lands in the report even when one fails. The end-of-test
+assertion is the recommended pattern: asserting on an individual
+`step.measure(...)` call short-circuits on the first failure and skips
+every measurement that follows. Expected
+pytest output is `16 passed, 3 failed, 1 skipped`.
+
+Flip any of the `sift_*_step` / `sift_parametrize_nesting` flags in
+`pytest.ini` to `false` to collapse a layer.
+
+## Next steps
+
+- [Pytest Plugin](pytest_plugin.md): conceptual reference covering fixtures,
+  ini flags, status semantics, and layout-mapping examples.
+- The demo's [README](https://github.com/sift-stack/sift/blob/main/python/examples/pytest_plugin/README.md)
+  on GitHub mirrors this page and is the canonical source.
diff --git a/python/examples/pytest_plugin/.env.example b/python/examples/pytest_plugin/.env.example
new file mode 100644
index 000000000..a8c028598
--- /dev/null
+++ b/python/examples/pytest_plugin/.env.example
@@ -0,0 +1,3 @@
+SIFT_API_KEY=your-api-key
+SIFT_GRPC_URI=your-org.grpc.example.com
+SIFT_REST_URI=https://your-org.rest.example.com
diff --git a/python/examples/pytest_plugin/README.md b/python/examples/pytest_plugin/README.md
new file mode 100644
index 000000000..c74a9c939
--- /dev/null
+++ b/python/examples/pytest_plugin/README.md
@@ -0,0 +1,119 @@
+# Pytest plugin demo
+
+A self-contained pytest project that exercises every feature of
+`sift_client.pytest_plugin`: package / module / class / parametrize step
+nesting, nested classes, manual substeps, `step.measure(...)` against
+numeric / string / bool bounds, gate markers, and the ini opt-outs.
+
+```
+examples/pytest_plugin/
+├── conftest.py                            # registers the plugin
+├── pytest.ini                             # available ini knobs (all commented at defaults)
+├── .env.example                           # credential template (copy to .env for local runs)
+└── tests/
+    ├── pytest_only/                       # subpackage step: `pytest_only` opens a parent step
+    │   ├── __init__.py
+    │   └── test_pytest_only_demo.py       # plain pytest tests with no Sift APIs
+    └── with_sift/                         # subpackage step: `with_sift` opens a parent step
+        ├── __init__.py
+        └── test_with_sift_demo.py         # measurements, substeps, classes, nested classes,
+                                            # stacked parametrize, sift_exclude marker
+```
+
+Every layer of organization shows up in the report tree: Python packages
+(directories with `__init__.py`), modules (test files), classes (including
+nested classes), and parametrize axes each open a parent step. Flip
+`sift_package_step`, `sift_module_step`, `sift_class_step`, or
+`sift_parametrize_nesting` to `false` in `pytest.ini` to disable this behavior.
+
+## Run it
+
+**Against a real Sift org**:
+
+```bash
+cp .env.example .env
+# Fill in SIFT_API_KEY / SIFT_GRPC_URI / SIFT_REST_URI
+pytest -v
+```
+
+A `TestReport` shows up in Sift once the session finishes.
+
+**Offline (record now, replay later - intended for offline environments)**:
+
+```bash
+pytest --sift-offline --sift-log-file=/tmp/sift-demo.jsonl -v
+# Later, from anywhere with credentials:
+import-test-result-log /tmp/sift-demo.jsonl
+```
+
+## What the report tree looks like
+
+With the plugin's defaults (everything in `pytest.ini` left commented), running
+this demo produces a tree like:
+
+```
+TestReport (FAILED, since failures propagate up from leaves)
+├── pytest_only                         ← package step (FAILED)
+│   └── test_pytest_only_demo.py        ← module step (FAILED)
+│       ├── test_passes                                              PASSED
+│       ├── test_uses_a_pytest_fixture                               PASSED
+│       ├── test_assertion_failure_marks_step_failed                 FAILED
+│       ├── test_skipped                                             SKIPPED
+│       ├── test_unexpected_exception_marks_step_errored             ERROR
+│       ├── test_parametrize_without_step
+│       │   ├── value='v1'                                           PASSED
+│       │   └── value='v2'                                           PASSED
+│       └── TestPytestClass
+│           └── test_method                                          PASSED
+└── with_sift                           ← package step (FAILED)
+    └── test_with_sift_demo.py          ← module step (FAILED)
+        ├── test_measurements                                        PASSED
+        ├── test_substeps                                            PASSED
+        │   ├── phase_1
+        │   └── phase_2
+        │       └── phase_2a
+        │   (test_excluded: @sift_exclude, runs in pytest, NOT in tree)
+        ├── test_measure_series                                      PASSED
+        ├── test_failed_measurement_marks_sift_step_failed           FAILED  (pytest PASSED)
+        ├── test_assert_measurements_passed_at_end                   FAILED  (pytest FAILED)
+        ├── test_report_level_metadata                               PASSED
+        └── TestClassStep
+            ├── test_parametrize
+            │   ├── axis_a='a1'
+            │   │   ├── axis_b='b1'                                  PASSED
+            │   │   └── axis_b='b2'                                  PASSED
+            │   └── axis_a='a2'
+            │       ├── axis_b='b1'                                  PASSED
+            │       └── axis_b='b2'                                  PASSED
+            └── TestNested
+                └── test_report_outcome
+                    └── check                                        PASSED
+```
+
+The `pytest_only` module deliberately includes one failing, one skipped, and
+one erroring test so the demo shows every `TestStatus` mapping (`FAILED` for
+assertions, `SKIPPED` for `pytest.skip`, `ERROR` for any other exception).
+The `with_sift` module shows two patterns for handling measurement results:
+`test_failed_measurement_marks_sift_step_failed` lets the test keep passing
+in pytest while the Sift step is `FAILED` (useful when measurements are
+diagnostic data you want to collect regardless of outcome); and
+`test_assert_measurements_passed_at_end` takes every measurement first and
+then asserts `step.measurements_passed` once at the end, so every
+measurement still lands in the report even when one fails. The end-of-test
+assertion is the recommended pattern: asserting on an individual
+`step.measure(...)` call short-circuits on the first failure and skips
+every measurement that follows. Expected
+pytest output is `16 passed, 3 failed, 1 skipped`.
+
+Toggle any of the `sift_*_step` / `sift_parametrize_nesting` flags in
+`pytest.ini` to `false` to collapse a layer.
+
+## What each file demonstrates
+
+| File | Feature |
+|---|---|
+| `conftest.py` | Plugin registration via `pytest_plugins`; optional `load_dotenv()` |
+| `pytest.ini` | The `sift_autouse` flag, the four nesting flags, and the git metadata flag at their defaults |
+| `tests/pytest_only/test_pytest_only_demo.py` | Plain pytest tests with no Sift APIs. The plugin captures pass/fail automatically; covers functions, fixtures, parametrize, classes, plus one each of `AssertionError` (FAILED), `pytest.skip` (SKIPPED), and a raised `ValueError` (ERROR) |
+| `tests/with_sift/test_with_sift_demo.py` | `step.measure` (numeric/string/bool bounds, units, description, metadata, `channel_names`), `step.measure_avg` and `step.measure_all` for series, an out-of-bounds measurement (pytest PASSED, Sift step FAILED), the recommended `assert step.measurements_passed` end-of-test pattern that fails pytest while still recording every measurement, nested `step.substep` (with step-level `metadata=...`), `@pytest.mark.sift_exclude`, class step + class docstring → description, nested classes, stacked `@pytest.mark.parametrize`, `step.report_outcome`, and session-level metadata via `report_context.report.update({...})` |
+| `tests/{pytest_only,with_sift}/__init__.py` | Each Python package (directory with `__init__.py`) becomes a parent step in the report tree |
diff --git a/python/examples/pytest_plugin/conftest.py b/python/examples/pytest_plugin/conftest.py
new file mode 100644
index 000000000..88253bd73
--- /dev/null
+++ b/python/examples/pytest_plugin/conftest.py
@@ -0,0 +1,15 @@
+"""Project-level conftest for the pytest plugin demo.
+
+A single ``pytest_plugins`` declaration is enough to load the plugin — its
+fixtures, hooks, and CLI options register through standard pytest machinery
+from there. ``load_dotenv()`` is optional; it just lets the default
+``sift_client`` fixture pick up ``SIFT_API_KEY`` / ``SIFT_GRPC_URI`` /
+``SIFT_REST_URI`` from a local ``.env`` when running against a real Sift org.
+These can also be set as environment variables using your preferred method.
+"""
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+pytest_plugins = ["sift_client.pytest_plugin"]
diff --git a/python/examples/pytest_plugin/pytest.ini b/python/examples/pytest_plugin/pytest.ini
new file mode 100644
index 000000000..90a1a824b
--- /dev/null
+++ b/python/examples/pytest_plugin/pytest.ini
@@ -0,0 +1,11 @@
+[pytest]
+# Defaults give you the full step tree: every package, module, class, and
+# parametrize axis becomes a parent step. These are the available ini options
+# and their defaults.
+#
+# sift_autouse = true              # autouse fixtures (default: true)
+# sift_package_step = true         # Python package (dir with __init__.py) parent step (default: true)
+# sift_module_step = true          # module (test file) parent step (default: true)
+# sift_class_step = true           # class parent step incl. nested (default: true)
+# sift_parametrize_nesting = true  # parametrize parent steps (default: true)
+# sift_git_metadata = true         # git repo/branch/commit included on the report (default: true)
diff --git a/python/examples/pytest_plugin/tests/pytest_only/__init__.py b/python/examples/pytest_plugin/tests/pytest_only/__init__.py
new file mode 100644
index 000000000..939562d5f
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/pytest_only/__init__.py
@@ -0,0 +1,7 @@
+"""Subpackage of plain pytest tests with no Sift awareness.
+
+Demonstrates that the plugin captures any test's pass/fail with no opt-in
+needed — no ``step`` fixture, no markers, no imports from ``sift_client``.
+The package directory itself becomes a parent step in the report tree (via
+``sift_package_step``, on by default).
+"""
diff --git a/python/examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py b/python/examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py
new file mode 100644
index 000000000..77790d301
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py
@@ -0,0 +1,49 @@
+"""Plain pytest tests are automatically captured by the plugin as steps.
+
+No imports from ``sift_client`` or fixture usage required. Each test
+becomes a step in the report tree: passing tests resolve to ``PASSED``,
+failing tests to ``FAILED``. This allows integrating existing tests
+with Sift Test Results without modification.
+"""
+
+import pytest
+
+
+def test_passes():
+    """Functions become steps in the report tree. The function docstring is used as the step description."""
+    assert 1 + 1 == 2
+
+
+@pytest.mark.parametrize("value", ["v1", "v2"])
+def test_parametrize_without_step(value):
+    """Parametrized tests are nested under a common step with sub steps for each permutation."""
+    assert value.startswith("v")
+
+
+class TestPytestClass:
+    """Test classes are turned into parent steps for their methods. Class docstrings are used as the step description."""
+
+    def test_method(self):
+        assert True
+
+
+def test_uses_a_pytest_fixture(tmp_path):
+    """Normal pytest fixtures keep working; the plugin doesn't intercept them."""
+    (tmp_path / "marker").write_text("ok")
+    assert (tmp_path / "marker").read_text() == "ok"
+
+
+def test_assertion_failure_marks_step_failed():
+    """An ``AssertionError`` resolves the Sift step as ``FAILED`` (no traceback attached)."""
+    assert 1 + 1 == 3
+
+
+@pytest.mark.skip(reason="Demonstrating the skip outcome")
+def test_skipped():
+    """Skipped tests resolve as ``SKIPPED`` in the Sift report."""
+    pass
+
+
+def test_unexpected_exception_marks_step_errored():
+    """Non-``AssertionError`` exceptions resolve the Sift step as ``ERROR`` with the traceback attached."""
+    raise ValueError("simulated environmental failure")
diff --git a/python/examples/pytest_plugin/tests/with_sift/__init__.py b/python/examples/pytest_plugin/tests/with_sift/__init__.py
new file mode 100644
index 000000000..6fd60c38d
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/with_sift/__init__.py
@@ -0,0 +1,8 @@
+"""Subpackage of tests that use the Sift plugin APIs explicitly.
+
+Demonstrates ``step.measure`` (numeric / string / bool bounds), nested
+``step.substep``, gate markers, class and nested-class step nesting, stacked
+parametrize, and ``step.report_outcome``. The package directory itself
+becomes a parent step in the report tree (via ``sift_package_step``, on by
+default).
+"""
diff --git a/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
new file mode 100644
index 000000000..34bf602b7
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
@@ -0,0 +1,159 @@
+"""End-to-end demo of the test-results features: measurements, substeps,
+exclusion, classes, nested classes, and stacked parametrize."""
+
+import pytest
+
+
+def test_measurements(step) -> None:
+    """Measurements are the first-class method for recording numeric, string, or bool bounds criteria and their outcomes. These show up in report steps.
+    ``step.measure`` accepts numeric (min/max), string, or bool bounds.
+    Choose names that provide sufficient context but are general enough that similar/identical measurements
+    across steps or reports can be compared.
+    """
+    step.measure(name="numeric_value", value=1.5, bounds={"min": 0.0, "max": 2.0})
+    step.measure(name="string_label", value="ok", bounds="ok")
+    step.measure(name="bool_flag", value=True, bounds=True)
+
+    # Descriptions and metadata can also be provided to measurements.
+    step.measure(
+        name="numeric_value_2",
+        value=1.5,
+        bounds={"min": 0.0, "max": 2.0},
+        description="Numeric that represents X, Y, Z",
+        metadata={"subsystem": "A"},
+    )
+
+    # If you plan to link the pytest report to a Sift Run, you can also assign related channels for easy plotting in the app
+    step.measure(
+        name="numeric_value",
+        value=1.5,
+        bounds={"min": 0.0, "max": 2.0},
+        channel_names=["channel_1", "channel_2"],
+    )
+
+
+def test_substeps(step) -> None:
+    """``step.substep(...)`` opens child steps inside one test; substeps nest arbitrarily.
+    This can be useful for grouping related measurements or for creating a more natural report structure
+    without the need to create a new test, class, etc.
+
+    Metadata can be attached at the step level by passing ``metadata=...`` to
+    ``substep``; the same keyword is accepted by ``report_context.new_step``
+    and propagates to the resulting ``TestStep``.
+    """
+    with step.substep(name="phase_1", metadata={"phase_index": 1}) as s1:
+        s1.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+    with step.substep(name="phase_2", metadata={"phase_index": 2}) as s2:
+        with s2.substep(name="phase_2a") as s2a:
+            s2a.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+
+def test_measure_series(step) -> None:
+    """``measure_avg`` and ``measure_all`` are the series variants of ``measure``.
+
+    Both accept a list, numpy array, or pandas series of numeric values.
+    ``measure_avg`` records one row holding the mean of the series and
+    bounds-checks it. ``measure_all`` evaluates every value individually and
+    records one row per out-of-bounds element (in-bounds values are NOT
+    recorded, keeping the report compact).
+    """
+    voltages = [4.95, 5.02, 5.01, 4.98, 5.00]
+    step.measure_avg(
+        name="voltage_mean",
+        values=voltages,
+        bounds={"min": 4.9, "max": 5.1},
+        unit="V",
+    )
+    # All values are in-bounds here, so measure_all records nothing extra;
+    # change one to e.g. 6.0 to see an out-of-bounds row appear.
+    step.measure_all(
+        name="voltage_samples",
+        values=voltages,
+        bounds={"min": 4.9, "max": 5.1},
+        unit="V",
+    )
+
+
+def test_failed_measurement_marks_sift_step_failed(step) -> None:
+    """An out-of-bounds measurement marks the Sift step as ``FAILED``
+    without raising. The pytest test still passes (no assertion, no
+    exception); the Sift report records bounds compliance while pytest
+    records control flow.
+
+    Use this pattern when measurements are diagnostic data you want to
+    collect alongside the test result, even when some readings fall outside
+    spec. See ``test_assert_measurements_passed_at_end`` below for the
+    recommended way to also fail pytest when any measurement is out of bounds.
+    """
+    step.measure(
+        name="voltage",
+        value=99.0,  # outside the bounds below; marks the step FAILED in Sift
+        bounds={"min": 0.0, "max": 10.0},
+        unit="V",
+    )
+
+
+def test_assert_measurements_passed_at_end(step) -> None:
+    """Recommended pattern: take every measurement first, then assert
+    ``step.measurements_passed`` once at the end.
+
+    Asserting on individual ``step.measure(...)`` calls raises
+    ``AssertionError`` on the first failure, so any measurements after the
+    failing one never run and never land in the Sift report. The end-of-test
+    assertion is strictly better for diagnostic completeness: every
+    measurement is recorded, including the failures, and the aggregate
+    result is then folded into the pytest outcome.
+
+    The ``b`` measurement below is deliberately out of bounds. ``c`` still
+    runs and is recorded; only the final ``assert`` fires.
+    """
+    step.measure(name="a", value=1.0, bounds={"min": 0.0, "max": 2.0})
+    step.measure(name="b", value=99.0, bounds={"min": 0.0, "max": 2.0})  # out of bounds
+    step.measure(name="c", value=1.5, bounds={"min": 0.0, "max": 2.0})  # still recorded
+    assert step.measurements_passed, "one or more measurements out of bounds"
+
+
+def test_report_level_metadata(step, report_context) -> None:
+    """Attach metadata to the run-wide ``TestReport`` via ``report_context.report.update(...)``.
+
+    The same ``update({...})`` pattern works for any field on
+    ``TestReportUpdate`` (``run_id``, ``serial_number``, ``part_number``,
+    ``system_operator``, ``metadata``, ...). Useful for linking a session
+    to a Sift Run or tagging the report with build / operator info.
+    """
+    report_context.report.update(
+        {
+            "metadata": {
+                "build_id": "v1.2.3",
+                "operator": "ci",
+            }
+        }
+    )
+    step.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+
+@pytest.mark.sift_exclude
+def test_excluded() -> None:
+    """``sift_exclude`` runs the test in pytest but produces no Sift step."""
+    assert True
+
+
+class TestClassStep:
+    """A test class becomes its own step in the report tree.
+
+    This docstring becomes the description of the ``TestClassStep`` step.
+    """
+
+    @pytest.mark.parametrize("axis_a", ["a1", "a2"])
+    @pytest.mark.parametrize("axis_b", ["b1", "b2"])
+    def test_parametrize(self, step, axis_a: str, axis_b: str) -> None:
+        """Stacked parametrize nests outer-to-inner in decorator-on-page order."""
+        step.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+    class TestNested:
+        """Nested classes produce nested class steps."""
+
+        def test_report_outcome(self, step) -> None:
+            """``step.report_outcome`` records a non-numeric pass/fail substep."""
+            step.report_outcome(name="check", result=True, reason="value matched")
diff --git a/python/lib/sift_client/_internal/low_level_wrappers/test_results.py b/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
index d15f86c48..ff0c2b515 100644
--- a/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
+++ b/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
@@ -3,7 +3,7 @@
 import logging
 import uuid
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, Any, TypeVar, cast
 
 from google.protobuf import json_format
 from sift.test_reports.v1.test_reports_pb2 import (
@@ -68,6 +68,9 @@
 logger = logging.getLogger(__name__)
 
 
+_EntityT = TypeVar("_EntityT", TestReport, TestStep, TestMeasurement)
+
+
 class TestResultsLowLevelClient(LowLevelClientBase, WithGrpcClient):
     """Low-level client for the TestResultsAPI.
 
@@ -82,6 +85,16 @@ def __init__(self, grpc_client: GrpcClient):
         """
         super().__init__(grpc_client)
 
+    @staticmethod
+    def _mark_simulated(instance: _EntityT) -> _EntityT:
+        """Stamp an entity as having been produced by the simulate path.
+
+        Mirrors the ``__dict__`` write used by ``BaseType._apply_client_to_instance``
+        to bypass pydantic's frozen-model guard.
+        """
+        instance.__dict__["_simulated"] = True
+        return instance
+
     @staticmethod
     def simulate_create_test_report_response(
         request: CreateTestReportRequest,
@@ -387,7 +400,7 @@ async def create_test_report(
                     request,
                     response_id=simulated_proto.test_report_id,
                 )
-            return TestReport._from_proto(simulated_proto)
+            return self._mark_simulated(TestReport._from_proto(simulated_proto))
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).CreateTestReport(request)
         grpc_test_report = cast("CreateTestReportResponse", response).test_report
@@ -505,7 +518,9 @@ async def update_test_report(
         if log_file is not None or simulate:
             if log_file is not None:
                 log_request_to_file(log_file, "UpdateTestReport", request)
-            return self.simulate_update_test_report_response(request, existing=existing)
+            return self._mark_simulated(
+                self.simulate_update_test_report_response(request, existing=existing)
+            )
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).UpdateTestReport(request)
         grpc_test_report = cast("UpdateTestReportResponse", response).test_report
@@ -560,7 +575,7 @@ async def create_test_step(
                     request,
                     response_id=simulated_proto.test_step_id,
                 )
-            return TestStep._from_proto(simulated_proto)
+            return self._mark_simulated(TestStep._from_proto(simulated_proto))
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).CreateTestStep(request)
         grpc_test_step = cast("CreateTestStepResponse", response).test_step
@@ -661,7 +676,9 @@ async def update_test_step(
         if log_file is not None or simulate:
             if log_file is not None:
                 log_request_to_file(log_file, "UpdateTestStep", request)
-            return self.simulate_update_test_step_response(request, existing=existing)
+            return self._mark_simulated(
+                self.simulate_update_test_step_response(request, existing=existing)
+            )
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).UpdateTestStep(request)
         grpc_test_step = cast("UpdateTestStepResponse", response).test_step
@@ -716,7 +733,7 @@ async def create_test_measurement(
                     request,
                     response_id=simulated_proto.measurement_id,
                 )
-            return TestMeasurement._from_proto(simulated_proto)
+            return self._mark_simulated(TestMeasurement._from_proto(simulated_proto))
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).CreateTestMeasurement(
             request
@@ -861,7 +878,9 @@ async def update_test_measurement(
         if log_file is not None or simulate:
             if log_file is not None:
                 log_request_to_file(log_file, "UpdateTestMeasurement", request)
-            return self.simulate_update_test_measurement_response(request, existing=existing)
+            return self._mark_simulated(
+                self.simulate_update_test_measurement_response(request, existing=existing)
+            )
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).UpdateTestMeasurement(
             request
diff --git a/python/lib/sift_client/_tests/conftest.py b/python/lib/sift_client/_tests/conftest.py
index 5683182e5..0b939ae39 100644
--- a/python/lib/sift_client/_tests/conftest.py
+++ b/python/lib/sift_client/_tests/conftest.py
@@ -78,10 +78,23 @@ def ci_pytest_tag(sift_client):
     return tag
 
 
-# Import the Sift test results fixtures the way we recommend to users.
-from sift_client.util.test_results import *  # noqa: F403
-
-
 def pytest_configure(config: pytest.Config) -> None:
-    """Enable the Sift connection-check mode for the fixtures used in this test suite since we run w/ mock client in non-integration tests."""
-    config.option.sift_test_results_check_connection = True
+    """Pick a Sift plugin mode based on whether integration tests are running.
+
+    Integration runs (``-m integration``) stay online with the default
+    log-file pipeline enabled so CI exercises the JSONL write + import
+    worker replay path that production users hit. Every other run defaults
+    to ``--sift-disabled`` so unit tests don't need credentials. That includes
+    runs that only deselect the marker, e.g. ``-m "not integration"``.
+    """
+    import re
+
+    markexpr = config.option.markexpr or ""
+    # A bare substring test would classify ``-m "not integration"`` as an
+    # integration run, so strip directly negated mentions before checking.
+    # NOTE(review): parenthesized negations such as ``not (integration or x)``
+    # are still treated as integration runs.
+    selected_expr = re.sub(r"\bnot\s+integration", "", markexpr)
+    is_integration_run = "integration" in selected_expr
+    if not is_integration_run:
+        config.option.sift_disabled = True
diff --git a/python/lib/sift_client/_tests/pytest_plugin/__init__.py b/python/lib/sift_client/_tests/pytest_plugin/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/lib/sift_client/_tests/pytest_plugin/_fakes.py b/python/lib/sift_client/_tests/pytest_plugin/_fakes.py
new file mode 100644
index 000000000..460100daa
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/_fakes.py
@@ -0,0 +1,155 @@
+"""Test doubles for the pytester-driven pytest-plugin tests.
+
+The fake ``ReportContext`` is a drop-in for the real one that records every
+step creation to a JSON file at session exit. Used by ``test_parametrize.py``
+to assert the step tree produced by an inner pytester pytest run.
+"""
+
+from __future__ import annotations
+
+import itertools
+import json
+from typing import TYPE_CHECKING, Any
+from unittest.mock import MagicMock
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+class FakeStep:
+    """Minimal stand-in for a test step that just holds the attributes set on it."""
+
+    def __init__(self, id_: str, name: str, parent_step_id: str | None, step_path: str) -> None:
+        self.id_ = id_
+        self.name = name
+        self.parent_step_id = parent_step_id
+        self.step_path = step_path
+        self.status: Any = None
+        self.description: Any = None
+        self.error_info: Any = None
+
+    def update(self, fields: dict[str, Any]) -> None:
+        """Copy every entry of ``fields`` onto this object as an attribute."""
+        for k, v in fields.items():
+            setattr(self, k, v)
+
+
+class FakeReport:
+    """Stand-in for a report: a fixed id and an ``update`` that discards its input."""
+
+    def __init__(self) -> None:
+        self.id_ = "report-id"
+
+    def update(self, fields: dict[str, Any]) -> None:
+        pass
+
+
+class FakeReportContext:
+    """Records created steps in memory and writes them to ``steps_file`` on exit."""
+
+    def __init__(self, steps_file: Path) -> None:
+        self.steps_file = steps_file
+        self.report = FakeReport()
+        self.client = MagicMock()
+        self.step_stack: list[FakeStep] = []
+        self.step_number_at_depth: dict[int, int] = {}
+        self.open_step_results: dict[str, bool] = {}
+        self.any_failures = False
+        self.log_file: Path | None = None
+        self.steps: list[dict[str, Any]] = []
+        self._ids = itertools.count(1)
+
+    def __enter__(self) -> FakeReportContext:
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        # Persist the recorded steps as JSON for the outer test to read back.
+        self.steps_file.write_text(json.dumps(self.steps))
+
+    def new_step(
+        self,
+        name: str,
+        description: str | None = None,
+        assertion_as_fail_not_error: bool = True,
+        metadata: dict[str, Any] | None = None,
+    ) -> Any:
+        """Return a real ``NewStep`` that is driven by this fake context."""
+        # Reuse the real NewStep machinery — it talks to this fake via the
+        # methods below.
+        from sift_client.util.test_results.context_manager import NewStep
+
+        return NewStep(
+            self,  # type: ignore[arg-type]
+            name=name,
+            description=description,
+            assertion_as_fail_not_error=assertion_as_fail_not_error,
+            metadata=metadata,
+        )
+
+    def get_next_step_path(self) -> str:
+        """Return the dotted path for the next step under the current stack top."""
+        top = self.step_stack[-1] if self.step_stack else None
+        path = top.step_path if top else ""
+        next_n = self.step_number_at_depth.get(len(self.step_stack), 0) + 1
+        prefix = f"{path}." if path else ""
+        return f"{prefix}{next_n}"
+
+    def create_step(
+        self,
+        name: str,
+        description: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> FakeStep:
+        """Create a step under the current stack top, push it, and record it.
+
+        ``description`` and ``metadata`` are accepted but not stored.
+        """
+        step_path = self.get_next_step_path()
+        parent = self.step_stack[-1] if self.step_stack else None
+        step = FakeStep(
+            id_=f"step-{next(self._ids)}",
+            name=name,
+            parent_step_id=parent.id_ if parent else None,
+            step_path=step_path,
+        )
+        self.step_number_at_depth[len(self.step_stack)] = (
+            self.step_number_at_depth.get(len(self.step_stack), 0) + 1
+        )
+        self.step_stack.append(step)
+        self.open_step_results[step.step_path] = True
+        self.steps.append(
+            {
+                "id": step.id_,
+                "name": name,
+                "parent_step_id": step.parent_step_id,
+                "step_path": step_path,
+            }
+        )
+        return step
+
+    def record_step_outcome(self, outcome: bool, step: FakeStep) -> None:
+        """Mark ``step`` (and the run as a whole) failed when ``outcome`` is falsy."""
+        if not outcome:
+            self.open_step_results[step.step_path] = False
+            self.any_failures = True
+
+    def resolve_and_propagate_step_result(self, step: FakeStep, error_info: Any = None) -> bool:
+        """Return ``step``'s recorded result; truthy ``error_info`` forces ``False``."""
+        result = self.open_step_results.get(step.step_path, True)
+        if error_info:
+            result = False
+        return result
+
+    def exit_step(self, step: FakeStep) -> None:
+        """Pop ``step`` off the stack and drop its open result.
+
+        Raises:
+            ValueError: If ``step`` is not the current top of the stack. The check
+                runs before any state changes, so a bad exit leaves the context intact.
+        """
+        if self.step_stack[-1].id_ != step.id_:
+            raise ValueError("popped step was not the top of the stack")
+        # Numbering for this step's children starts over the next time that depth is used.
+        self.step_number_at_depth[len(self.step_stack)] = 0
+        self.step_stack.pop()
+        self.open_step_results.pop(step.step_path)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/conftest.py b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
new file mode 100644
index 000000000..783a12bf4
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
@@ -0,0 +1,68 @@
+"""Shared helpers for the pytest-plugin test suite.
+
+The tests in this directory drive inner pytester sessions to exercise the
+plugin's behavior in isolation. The fixtures below produce the boilerplate
+conftests those inner sessions need:
+
+- ``write_plugin_conftest``: minimal conftest that loads the plugin
+- ``write_probe_conftest``: conftest that loads the plugin and runs a probe
+  block inside ``pytest_configure``, useful for inspecting internal state
+  without running tests against a real backend
+
+Every test in this suite invokes the inner session via
+``pytester.runpytest_subprocess(...)`` rather than ``pytester.runpytest(...)``.
+``runpytest`` runs the inner pytest in-process, which re-imports the Sift
+plugin on each test; the plugin transitively imports numpy, whose C
+extensions refuse to initialize twice in one process and raise
+``cannot load module more than once per process``. Spawning a subprocess
+gives each inner session a fresh interpreter and sidesteps that guard.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from typing import Callable
+
+import pytest
+
+_SIFT_ENV_VARS = ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI", "SIFT_DISABLED")
+
+
+@pytest.fixture
+def clear_sift_env(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Unset every variable named in ``_SIFT_ENV_VARS`` for the duration of the test."""
+    for name in _SIFT_ENV_VARS:
+        monkeypatch.delenv(name, raising=False)
+
+
+@pytest.fixture
+def write_plugin_conftest(pytester: pytest.Pytester) -> Callable[[], None]:
+    """Return a callable that writes a minimal conftest loading the plugin."""
+
+    def _write() -> None:
+        pytester.makeconftest('pytest_plugins = ["sift_client.pytest_plugin"]')
+
+    return _write
+
+
+@pytest.fixture
+def write_probe_conftest(pytester: pytest.Pytester) -> Callable[[str], None]:
+    """Return a callable that writes a conftest running ``probe_body`` in ``pytest_configure``.
+
+    ``probe_body`` is python source that runs at config time with ``config``
+    in scope; use ``print(...)`` calls and capture them with
+    ``result.stdout.fnmatch_lines``.
+    A blank ``probe_body`` produces a hook whose body is just ``pass``.
+    """
+
+    def _write(probe_body: str) -> None:
+        body = textwrap.dedent(probe_body)
+        if not body.strip():
+            # Without a statement the generated ``def`` would be a SyntaxError.
+            body = "pass\n"
+        pytester.makeconftest(
+            'pytest_plugins = ["sift_client.pytest_plugin"]\n\n'
+            "def pytest_configure(config):\n" + textwrap.indent(body, "    ")
+        )
+
+    return _write
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py b/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
new file mode 100644
index 000000000..4efb9f554
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
@@ -0,0 +1,442 @@
+"""Tests for the plugin's CLI/ini configuration surface.
+
+Covers flag parsing, ini-key resolution, CLI-over-ini precedence, the
+defaults that apply when nothing is set, and the marker-based gate that
+governs the autouse fixtures. Credentials are tested in
+``test_credentials.py``.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestIniConfiguration:
+    """`addini` keys configure the plugin via pyproject.toml / pytest.ini."""
+
+    def test_ini_log_file_none(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """``sift_log_file = "none"`` in ini resolves the log file to ``None``."""
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _resolve_log_file
+            print("RESOLVED:", _resolve_log_file(config))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_log_file = "none"
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(["RESOLVED: None"])
+
+    def test_python_false_disables_log_file(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """`config.option.sift_log_file = False` disables logging.
+
+        Conftests use this pattern (see lib/sift_client/_tests/util/conftest.py)
+        to opt their subtree out of log-file mode. Regression test for the
+        resolver case where Python `False` was previously confused with `None`
+        and silently kept the temp-file default.
+        """
+        write_probe_conftest(
+            """
+            config.option.sift_log_file = False
+            from sift_client.pytest_plugin import _resolve_log_file
+            print("RESOLVED:", _resolve_log_file(config))
+            """,
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(["RESOLVED: None"])
+
+    def test_ini_log_file_path(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """A path in ``sift_log_file`` resolves to that same path.
+
+        The value is written as a TOML literal (single-quoted) string so the
+        backslashes in a Windows path are not parsed as escape sequences.
+        """
+        log_path = tmp_path / "sift-run.jsonl"
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _resolve_log_file
+            print("RESOLVED:", _resolve_log_file(config))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            f"""
+            [tool.pytest.ini_options]
+            sift_log_file = '{log_path}'
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines([f"RESOLVED: {log_path}"])
+
+    def test_ini_offline_true(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """``sift_offline = true`` in ini makes ``_is_offline`` report ``True``."""
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _is_offline
+            print("OFFLINE:", _is_offline(config))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_offline = true
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(["OFFLINE: True"])
+
+    def test_ini_disabled_true(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """``sift_disabled = true`` in ini makes ``_is_disabled`` report ``True``."""
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _is_disabled
+            print("DISABLED:", _is_disabled(config))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_disabled = true
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(["DISABLED: True"])
+
+    def test_ini_git_metadata_false(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """``sift_git_metadata = false`` in ini is read back as ``False`` via ``getini``."""
+        write_probe_conftest(
+            """
+            print("INI_GIT:", config.getini("sift_git_metadata"))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_git_metadata = false
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(["INI_GIT: False"])
+
+    def test_cli_overrides_ini(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """A CLI flag takes precedence over the matching ini key."""
+        cli_path = tmp_path / "cli-wins.jsonl"
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _resolve_log_file
+            print("RESOLVED:", _resolve_log_file(config))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_log_file = "none"
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co", f"--sift-log-file={cli_path}")
+        result.stdout.fnmatch_lines([f"RESOLVED: {cli_path}"])
+
+    def test_cli_offline_flag(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """The ``--sift-offline`` CLI flag flips the resolver to True."""
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _is_offline
+            print("OFFLINE:", _is_offline(config))
+            """,
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co", "--sift-offline")
+        result.stdout.fnmatch_lines(["OFFLINE: True"])
+
+    def test_cli_disabled_flag(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """The ``--sift-disabled`` CLI flag flips the resolver to True."""
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _is_disabled
+            print("DISABLED:", _is_disabled(config))
+            """,
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co", "--sift-disabled")
+        result.stdout.fnmatch_lines(["DISABLED: True"])
+
+    def test_cli_no_git_metadata_flag(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """The ``--no-sift-git-metadata`` CLI flag flips git_metadata to False.
+
+        Guards the negation flag's ``dest`` binding: the flag name doesn't match
+        the ini key, so a broken ``dest`` would silently fall back to the ini
+        default and pass every other test in this file.
+        """
+        write_probe_conftest(
+            """
+            print("CLI_GIT:", config.getoption("sift_git_metadata"))
+            """,
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co", "--no-sift-git-metadata")
+        result.stdout.fnmatch_lines(["CLI_GIT: False"])
+
+    def test_defaults_when_neither_set(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """With no CLI flag and no ini key, each probe prints its default value."""
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import (
+                _is_disabled,
+                _is_offline,
+                _resolve_log_file,
+            )
+            print("RESOLVED:", _resolve_log_file(config))
+            print("OFFLINE:", _is_offline(config))
+            print("DISABLED:", _is_disabled(config))
+            print("INI_GIT:", config.getini("sift_git_metadata"))
+            """,
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(
+            [
+                "RESOLVED: True",
+                "OFFLINE: False",
+                "DISABLED: False",
+                "INI_GIT: True",
+            ]
+        )
+
+
+# A session-scoped `report_context` stub for the autouse-gate tests. Overrides
+# the plugin's real `report_context` so the inner pytest sessions don't try to
+# talk to a Sift backend; the gate tests only need to observe whether `step`
+# resolves to a real value or to None.
+_GATE_INNER_CONFTEST = textwrap.dedent(
+    """
+    from unittest.mock import MagicMock
+
+    import pytest
+
+    pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+    @pytest.fixture(scope="session")
+    def report_context():
+        yield MagicMock()
+    """
+)
+
+
+class TestAutouseGate:
+    """`sift_include` / `sift_exclude` markers and the `sift_autouse` ini gate."""
+
+    def test_default_ini_true_activates(self, pytester: pytest.Pytester) -> None:
+        """Plugin default (ini absent) keeps the autouse fixtures active."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyfile(
+            """
+            def test_inner(step):
+                assert step is not None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_default_ini_false_skips(self, pytester: pytest.Pytester) -> None:
+        """`sift_autouse = false` makes the autouse fixtures no-op by default."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_autouse = false
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_inner(step):
+                assert step is None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_sift_include_marker_forces_on(self, pytester: pytest.Pytester) -> None:
+        """`@pytest.mark.sift_include` overrides ini-false to enable the gate."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_autouse = false
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_inner(step):
+                assert step is not None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_sift_exclude_marker_forces_off(self, pytester: pytest.Pytester) -> None:
+        """`@pytest.mark.sift_exclude` overrides ini-true to disable the gate."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_exclude
+            def test_inner(step):
+                assert step is None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_exclude_beats_include(self, pytester: pytest.Pytester) -> None:
+        """When both markers are present, `sift_exclude` wins (safer default)."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            @pytest.mark.sift_exclude
+            def test_inner(step):
+                assert step is None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_module_pytestmark_inherits(self, pytester: pytest.Pytester) -> None:
+        """Module-level `pytestmark = pytest.mark.sift_include` covers every test in the module."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_autouse = false
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            pytestmark = pytest.mark.sift_include
+
+            def test_inner_a(step):
+                assert step is not None
+
+            def test_inner_b(step):
+                assert step is not None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=2)
+
+    def test_bulk_apply_via_conftest_hook(self, pytester: pytest.Pytester) -> None:
+        """A subtree opts in via `pytest_collection_modifyitems`; siblings stay off.
+
+        Regression test for this repo's wiring pattern: the project default is
+        autouse-off, the integration subtree's conftest bulk-applies
+        `sift_include`, and sibling subtrees remain disabled. Verifies the
+        per-directory mechanism works in a single pytest invocation.
+        """
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_autouse = false
+            """
+        )
+        included = pytester.mkdir("included_subtree")
+        (included / "conftest.py").write_text(
+            textwrap.dedent(
+                """
+                from pathlib import Path
+
+                import pytest
+
+                _HERE = Path(__file__).parent
+
+
+                def pytest_collection_modifyitems(config, items):
+                    for item in items:
+                        try:
+                            item.path.relative_to(_HERE)
+                        except ValueError:
+                            continue
+                        item.add_marker(pytest.mark.sift_include)
+                """
+            )
+        )
+        (included / "test_included.py").write_text(
+            "def test_included(step):\n    assert step is not None\n"
+        )
+        untouched = pytester.mkdir("untouched_subtree")
+        (untouched / "test_untouched.py").write_text(
+            "def test_untouched(step):\n    assert step is None\n"
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=2)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py b/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py
new file mode 100644
index 000000000..3f6d22a6e
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py
@@ -0,0 +1,114 @@
+"""Tests for the default ``sift_client`` fixture's credential resolution.
+
+Covers the env-var-then-ini fallback for URIs, the env-only handling of
+``SIFT_API_KEY``, and the error path that names missing credentials.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    import pytest
+
+
+class TestCredentials:
+    """The default ``sift_client`` fixture's resolution of env vars and ini keys."""
+
+    def test_uris_from_ini(
+        self,
+        pytester: pytest.Pytester,
+        monkeypatch: pytest.MonkeyPatch,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """The default sift_client fixture reads URI credentials from ini when env vars are unset."""
+        # ``clear_sift_env`` has already unset every name in ``_SIFT_ENV_VARS``
+        # (``SIFT_DISABLED`` included), so only the API key is put back here.
+        monkeypatch.setenv("SIFT_API_KEY", "env-key")
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_grpc_uri = "ini-grpc:1234"
+            sift_rest_uri = "https://ini-rest"
+            sift_offline = true
+
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_credentials_loaded(sift_client):
+                cfg = sift_client.grpc_client._config
+                assert cfg.api_key == "env-key"
+                assert "ini-grpc:1234" in cfg.uri
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_env_var_overrides_ini_uri(
+        self,
+        pytester: pytest.Pytester,
+        monkeypatch: pytest.MonkeyPatch,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """When both env var and ini set a URI, the env var wins."""
+        monkeypatch.setenv("SIFT_API_KEY", "env-key")
+        monkeypatch.setenv("SIFT_GRPC_URI", "env-grpc:9999")
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_grpc_uri = "ini-grpc:1234"
+            sift_rest_uri = "https://ini-rest"
+            sift_offline = true
+
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_env_wins(sift_client):
+                assert "env-grpc:9999" in sift_client.grpc_client._config.uri
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_api_key_ignored_from_ini(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """`sift_api_key` is not registered as an ini key; the fixture refuses to use it."""
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_api_key = "should-be-ignored"
+            sift_grpc_uri = "ini-grpc:1234"
+            sift_rest_uri = "https://ini-rest"
+            """
+        )
+        pytester.makepyfile("def test_should_not_run(): pass")
+        result = pytester.runpytest_subprocess()
+        assert result.ret != 0
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "SIFT_API_KEY" in combined, combined
+
+    def test_missing_credentials_named_in_error(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """A missing credential aborts with all missing names listed."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_should_not_run(): pass")
+        result = pytester.runpytest_subprocess()
+        assert result.ret != 0
+        combined = "\n".join(result.outlines + result.errlines)
+        for name in ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI"):
+            assert name in combined, combined
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py b/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
new file mode 100644
index 000000000..90a5fcb56
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
@@ -0,0 +1,182 @@
+"""Tests for ``--sift-disabled`` mode.
+
+Disabled mode skips Sift entirely. Autouse fixtures yield stub objects so
+test code that calls ``step.measure(...)`` keeps working without any Sift
+configuration; ``measure*`` evaluates bounds locally and returns the real
+pass/fail boolean. Nothing reaches Sift and no log file is written.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestDisabledMode:
+    def test_in_bounds_passes_out_of_bounds_fails(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Under ``--sift-disabled``, ``measure*`` still reports in/out-of-bounds results."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_passes_in_bounds(step):
+                assert step.measure(name="v", value=5.0, bounds={"min": 4.8, "max": 5.2})
+
+            def test_fails_out_of_bounds(step):
+                assert step.measure(name="v", value=99.0, bounds={"max": 5.2}) is False
+
+            def test_substep_and_report_outcome(step):
+                with step.substep(name="inner") as inner:
+                    assert inner.report_outcome(name="ok", result=True) is True
+
+            def test_string_bounds(step):
+                assert step.measure(name="fw", value="1.0", bounds="1.0") is True
+                assert step.measure(name="fw", value="1.0", bounds="2.0") is False
+
+            def test_measure_avg(step):
+                assert step.measure_avg(
+                    name="bus", values=[4.97, 5.01, 5.03], bounds={"min": 4.9, "max": 5.1}
+                ) is True
+
+            def test_measure_all_outlier(step):
+                assert step.measure_all(
+                    name="p", values=[10.1, 10.2, 99.9], bounds={"max": 11.0}
+                ) is False
+            """
+        )
+        run = pytester.runpytest_subprocess("--sift-disabled")
+        run.assert_outcomes(passed=6)
+
+    def test_disabled_does_not_require_credentials(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """With ``clear_sift_env`` applied, a ``--sift-disabled`` run still passes."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        run = pytester.runpytest_subprocess("--sift-disabled")
+        run.assert_outcomes(passed=1)
+
+    def test_disabled_via_env_var(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+        monkeypatch: pytest.MonkeyPatch,
+    ) -> None:
+        """Setting ``SIFT_DISABLED=1`` lets the run pass with no ``--sift-disabled`` flag."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        monkeypatch.setenv("SIFT_DISABLED", "1")
+        run = pytester.runpytest_subprocess()
+        run.assert_outcomes(passed=1)
+
+    def test_disabled_supersedes_offline(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Combining ``--sift-disabled`` with ``--sift-offline`` still runs the test.
+
+        Passing both flags must not error: with the Sift environment cleared,
+        the inner test passes. Disabled is intended to take precedence, so no
+        credentials, log file, or offline replay should be needed.
+        """
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_runs(step):
+                assert step.measure(name="v", value=5.0, bounds={"max": 10.0}) is True
+            """
+        )
+        run = pytester.runpytest_subprocess("--sift-disabled", "--sift-offline")
+        run.assert_outcomes(passed=1)
+
+    def test_disabled_yields_stub_fixtures(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Fixtures are genuine ``ReportContext`` / ``NewStep`` objects flagged as simulated."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            from sift_client.util.test_results import ReportContext
+            from sift_client.util.test_results.context_manager import NewStep
+
+            def test_types(step, report_context):
+                assert isinstance(report_context, ReportContext)
+                assert report_context.is_simulated is True
+                assert report_context.report.is_simulated is True
+                assert step.current_step.is_simulated is True
+                assert isinstance(step, NewStep)
+            """
+        )
+        run = pytester.runpytest_subprocess("--sift-disabled")
+        run.assert_outcomes(passed=1)
+
+    def test_disabled_writes_no_log_file_even_when_path_pinned(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """No file appears at the ``--sift-log-file`` path when ``--sift-disabled`` is set."""
+        pinned = tmp_path / "should-not-exist.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        run = pytester.runpytest_subprocess("--sift-disabled", f"--sift-log-file={pinned}")
+        run.assert_outcomes(passed=1)
+        assert not pinned.exists(), f"log file unexpectedly created at {pinned}"
+
+    def test_disabled_skips_client_has_connection_and_sift_client(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+    ) -> None:
+        """Neither ``sift_client`` nor ``client_has_connection`` is resolved when disabled.
+
+        The conftest overrides both fixtures with versions that raise
+        ``AssertionError`` if resolved. The inner test passing therefore shows
+        that the ``--sift-disabled`` run never requested either of them.
+        """
+        pytester.makeconftest(
+            """
+            import pytest
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+            @pytest.fixture(scope="session")
+            def sift_client():
+                raise AssertionError("sift_client should not resolve in disabled mode")
+
+
+            @pytest.fixture(scope="session")
+            def client_has_connection():
+                raise AssertionError(
+                    "client_has_connection should not resolve in disabled mode"
+                )
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_runs(step):
+                assert step.measure(name="v", value=5.0, bounds={"max": 10.0}) is True
+            """
+        )
+        run = pytester.runpytest_subprocess("--sift-disabled")
+        run.assert_outcomes(passed=1)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
new file mode 100644
index 000000000..cecad2df8
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
@@ -0,0 +1,889 @@
+"""Tests for the plugin's hierarchy-step nesting behavior.
+
+Covers every layer the plugin opens parent steps for — packages, modules,
+classes (including nested), parametrize axes — plus the ini opt-out flags,
+failure-cleanup semantics, and the drain helper.
+
+Each test spins up an inner pytest run via ``pytester`` whose conftest swaps
+in a ``FakeReportContext`` (from ``_fakes.py``) that records every step
+creation to a JSON file. The outer test reads that file and asserts the
+resulting step tree.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path as _Path
+from textwrap import dedent
+from typing import TYPE_CHECKING
+
+import pytest
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+_STEPS_FILE_ENV = "SIFT_FAKE_STEPS_FILE"
+
+# ``_fakes.py`` is excluded from the wheel by ``pyproject.toml``'s
+# ``packages.find`` rule that strips ``sift_client._tests``, so the inner
+# pytester subprocess (which uses the installed package) cannot import it.
+# Embed the fake source directly into the inner conftest instead; read it as
+# UTF-8 explicitly so the result does not depend on the platform locale.
+_FAKES_SOURCE = (_Path(__file__).parent / "_fakes.py").read_text(encoding="utf-8")
+
+_INNER_CONFTEST = f"""
+{_FAKES_SOURCE}
+
+import os
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+@pytest.fixture(scope="session")
+def sift_client():
+    return MagicMock()
+
+
+@pytest.fixture(scope="session", autouse=True)
+def report_context(sift_client):
+    import sift_client.pytest_plugin as plugin_module
+    steps_file = Path(os.environ[{_STEPS_FILE_ENV!r}])
+    with FakeReportContext(steps_file) as ctx:
+        plugin_module.REPORT_CONTEXT = ctx
+        yield ctx
+"""
+
+
+@pytest.fixture
+def steps_file(pytester: pytest.Pytester, monkeypatch: pytest.MonkeyPatch) -> Path:
+    pytester.makeconftest(_INNER_CONFTEST)  # inner run gets the fake report context
+    capture = pytester.path / "captured_steps.json"
+    monkeypatch.setenv(_STEPS_FILE_ENV, str(capture))  # inner conftest reads this variable
+    return capture
+
+
+def _by_name(steps: list[dict]) -> dict[str, list[dict]]:  # group records by "name"
+    grouped: dict[str, list[dict]] = {}
+    for record in steps:
+        grouped.setdefault(record["name"], []).append(record)
+    return grouped
+
+
+def _ancestor_names(steps: list[dict], leaf: dict) -> list[str]:
+    """Return step names from ``leaf`` upward, following ``parent_step_id`` links."""
+    index = {step["id"]: step for step in steps}
+    names: list[str] = []
+    node: dict | None = leaf
+    while node is not None:
+        names.append(node["name"])
+        parent = node["parent_step_id"]
+        node = index.get(parent) if parent else None  # falsy or unknown id ends the walk
+    return names
+
+
+def test_class_methods_cluster_under_class_step(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """Both methods of one class parent to a single ``TestFoo`` step."""
+    pytester.makepyfile(
+        test_klass=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(json.loads(steps_file.read_text()))
+    assert len(grouped["TestFoo"]) == 1
+    class_id = grouped["TestFoo"][0]["id"]
+    assert grouped["test_a"][0]["parent_step_id"] == class_id
+    assert grouped["test_b"][0]["parent_step_id"] == class_id
+
+
+def test_nested_classes_produce_nested_steps(pytester: pytest.Pytester, steps_file: Path) -> None:
+    """Nested test classes yield nested class steps.
+
+    ``test_a``'s ancestor chain runs inner class, outer class, then module.
+    """
+    pytester.makepyfile(
+        test_nested=dedent(
+            """
+            class TestOuter:
+                class TestInner:
+                    def test_a(self):
+                        pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=1)
+    recorded = json.loads(steps_file.read_text())
+    grouped = _by_name(recorded)
+    assert len(grouped["TestOuter"]) == 1
+    assert len(grouped["TestInner"]) == 1
+    leaf = grouped["test_a"][0]
+    expected_chain = ["test_a", "TestInner", "TestOuter", "test_nested.py"]
+    assert _ancestor_names(recorded, leaf) == expected_chain
+
+
+def test_class_parametrize_nests_under_class(pytester: pytest.Pytester, steps_file: Path) -> None:
+    """Parametrized cases nest under ``test_a``, which in turn nests under the class."""
+    pytester.makepyfile(
+        test_cp=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(json.loads(steps_file.read_text()))
+    class_id = grouped["TestFoo"][0]["id"]
+    method_id = grouped["test_a"][0]["id"]
+    assert grouped["test_a"][0]["parent_step_id"] == class_id
+    assert grouped["v=1"][0]["parent_step_id"] == method_id
+    assert grouped["v=2"][0]["parent_step_id"] == method_id
+
+
+def test_two_sibling_classes_in_module(pytester: pytest.Pytester, steps_file: Path) -> None:
+    """Two classes in one module each open exactly one step, both parented to the module."""
+    pytester.makepyfile(
+        test_sib=dedent(
+            """
+            class TestA:
+                def test_x(self):
+                    pass
+
+            class TestB:
+                def test_y(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(json.loads(steps_file.read_text()))
+    module_id = grouped["test_sib.py"][0]["id"]
+    assert grouped["TestA"][0]["parent_step_id"] == module_id
+    assert grouped["TestB"][0]["parent_step_id"] == module_id
+    # Neither class step may be recorded more than once.
+    assert len(grouped["TestA"]) == 1
+    assert len(grouped["TestB"]) == 1
+
+
+def test_mixed_class_and_free_function(pytester: pytest.Pytester, steps_file: Path) -> None:
+    """A class and a free function in one module both hang off the module step."""
+    pytester.makepyfile(
+        test_mix=dedent(
+            """
+            class TestA:
+                def test_x(self):
+                    pass
+
+            def test_free():
+                pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(json.loads(steps_file.read_text()))
+    module_id = grouped["test_mix.py"][0]["id"]
+    # The method sits under TestA; TestA and the free function sit under the module.
+    assert grouped["TestA"][0]["parent_step_id"] == module_id
+    assert grouped["test_x"][0]["parent_step_id"] == grouped["TestA"][0]["id"]
+    assert grouped["test_free"][0]["parent_step_id"] == module_id
+
+
+def test_class_with_all_excluded_methods_no_class_step(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """A class whose methods are all ``sift_exclude``-marked records no class or test step."""
+    pytester.makepyfile(
+        test_excl=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.sift_exclude
+                def test_a(self):
+                    pass
+
+                @pytest.mark.sift_exclude
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(json.loads(steps_file.read_text()))
+    assert "TestFoo" not in grouped
+    assert "test_a" not in grouped
+    assert "test_b" not in grouped
+
+
+def test_sift_exclude_on_class_propagates(pytester: pytest.Pytester, steps_file: Path) -> None:
+    """A class-level ``sift_exclude`` mark suppresses the class step and every method step."""
+    pytester.makepyfile(
+        test_clsexcl=dedent(
+            """
+            import pytest
+
+            @pytest.mark.sift_exclude
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    by_name = _by_name(json.loads(steps_file.read_text()))
+    assert "TestFoo" not in by_name
+    assert not ({"test_a", "test_b"} & by_name.keys()), sorted(by_name)  # both methods
+
+
+def test_class_docstring_becomes_step_description(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_doc=dedent(
+            '''
+            class TestFoo:
+                """Class docstring."""
+
+                def test_a(self):
+                    pass
+            '''
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=1)
+    recorded = json.loads(steps_file.read_text())
+    grouped = _by_name(recorded)
+    # NOTE: the fake recorder only captures name/parent/path/id, not the
+    # step description, so the docstring itself cannot be asserted from the
+    # recorded JSON. This test therefore only confirms that the documented
+    # class still opens a step between the module and the test leaf.
+    leaf = grouped["test_a"][0]
+    assert _ancestor_names(recorded, leaf)[:3] == ["test_a", "TestFoo", "test_doc.py"]
+
+
+def test_transition_between_class_chains_drains_parametrize(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """Parametrize steps of one class never leak into the chain of the next class."""
+    pytester.makepyfile(
+        test_trans=dedent(
+            """
+            import pytest
+
+            class TestA:
+                @pytest.mark.parametrize("v", [1])
+                def test_x(self, v):
+                    pass
+
+            class TestB:
+                @pytest.mark.parametrize("w", [2])
+                def test_y(self, w):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2)
+    recorded = json.loads(steps_file.read_text())
+    grouped = _by_name(recorded)
+    # Each class step is opened exactly once.
+    assert len(grouped["TestA"]) == 1
+    assert len(grouped["TestB"]) == 1
+    # Each parametrize leaf hangs off its own test function's step.
+    x_id = grouped["test_x"][0]["id"]
+    y_id = grouped["test_y"][0]["id"]
+    assert grouped["v=1"][0]["parent_step_id"] == x_id
+    assert grouped["w=2"][0]["parent_step_id"] == y_id
+    # Full chains: every leaf traces up through its own class and never
+    # through the other one.
+    chain_x = _ancestor_names(recorded, grouped["v=1"][0])
+    chain_y = _ancestor_names(recorded, grouped["w=2"][0])
+    assert "TestA" in chain_x and "TestB" not in chain_x
+    assert "TestB" in chain_y and "TestA" not in chain_y
+
+
+# ---------------------------------------------------------------------------
+# Failure-cleanup tests
+# ---------------------------------------------------------------------------
+
+
+def test_drain_step_stack_continues_past_failing_exit() -> None:
+    """Lenient mode: one frame raising in ``__exit__`` warns; the other frames still close."""
+    from sift_client.pytest_plugin import (
+        SiftPytestStepDrainWarning,
+        _drain_step_stack,
+    )
+
+    class _Good:
+        def __init__(self) -> None:
+            self.closed = False
+
+        def __exit__(self, *_: object) -> None:
+            self.closed = True
+
+    class _Bad:
+        def __exit__(self, *_: object) -> None:
+            raise RuntimeError("boom")
+
+    first, second, broken = _Good(), _Good(), _Bad()
+    frames: list[tuple[str, object]] = [("g1", first), ("bad", broken), ("g2", second)]
+    with pytest.warns(SiftPytestStepDrainWarning, match="boom"):
+        _drain_step_stack(frames)
+    assert frames == []
+    assert first.closed
+    assert second.closed
+
+
+def test_drain_step_stack_strict_drains_fully_then_raises() -> None:
+    """Strict mode: drain every frame, then raise with the first failure as ``__cause__``."""
+    from sift_client.pytest_plugin import (
+        SiftPytestStepDrainError,
+        _drain_step_stack,
+    )
+
+    class _Good:
+        def __init__(self) -> None:
+            self.closed = False
+
+        def __exit__(self, *_: object) -> None:
+            self.closed = True
+
+    class _Bad:
+        def __init__(self, label: str) -> None:
+            self.label = label
+
+        def __exit__(self, *_: object) -> None:
+            raise RuntimeError(f"boom-{self.label}")
+
+    ok, bad_one, bad_two = _Good(), _Bad("first"), _Bad("second")
+    # Frames pop LIFO (b2, b1, g), so b2's failure is collected first and is
+    # the one expected to surface in ``__cause__``.
+    frames: list[tuple[str, object]] = [("g", ok), ("b1", bad_one), ("b2", bad_two)]
+    with pytest.raises(SiftPytestStepDrainError, match="2 step.*'b2'") as raised:
+        _drain_step_stack(frames, swallow_errors=False)
+    # The stack must be empty even though draining raised.
+    assert frames == []
+    assert ok.closed
+    # The underlying exception stays chained for debuggability.
+    assert isinstance(raised.value.__cause__, RuntimeError)
+    assert "boom-second" in str(raised.value.__cause__)
+
+
+def test_failing_test_in_class_does_not_orphan_class_step(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """A failing method leaves its class step usable for later tests.
+
+    ``test_a`` fails, yet ``test_b`` still parents to the same ``TestFoo``
+    step, and the following ``TestBar`` opens beside ``TestFoo`` under the
+    module rather than nested inside it.
+    """
+    pytester.makepyfile(
+        test_fail=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    raise AssertionError("boom")
+
+                def test_b(self):
+                    pass
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2, failed=1)
+    recorded = json.loads(steps_file.read_text())
+    grouped = _by_name(recorded)
+    assert len(grouped["TestFoo"]) == 1
+    assert len(grouped["TestBar"]) == 1
+    foo_id = grouped["TestFoo"][0]["id"]
+    bar_id = grouped["TestBar"][0]["id"]
+    module_id = grouped["test_fail.py"][0]["id"]
+    assert grouped["test_a"][0]["parent_step_id"] == foo_id
+    assert grouped["test_b"][0]["parent_step_id"] == foo_id
+    assert grouped["test_c"][0]["parent_step_id"] == bar_id
+    # The two classes are siblings under the module: TestBar was not
+    # attached beneath a left-open TestFoo.
+    assert grouped["TestFoo"][0]["parent_step_id"] == module_id
+    assert grouped["TestBar"][0]["parent_step_id"] == module_id
+
+
+def test_failing_parametrized_method_in_class_closes_full_chain(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """A failing parametrized case does not leave its parametrize parent open for siblings."""
+    pytester.makepyfile(
+        test_pfail=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    if v == 1:
+                        raise AssertionError("boom")
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2, failed=1)
+    recorded = json.loads(steps_file.read_text())
+    grouped = _by_name(recorded)
+    class_id = grouped["TestFoo"][0]["id"]
+    method_id = grouped["test_a"][0]["id"]
+    # Both cases share the one test_a parent, while test_b attaches straight
+    # to TestFoo (the parametrize parent did not leak across methods).
+    assert grouped["v=1"][0]["parent_step_id"] == method_id
+    assert grouped["v=2"][0]["parent_step_id"] == method_id
+    assert grouped["test_b"][0]["parent_step_id"] == class_id
+
+
+# ---------------------------------------------------------------------------
+# Opt-out flag tests
+# ---------------------------------------------------------------------------
+
+
+def _write_ini(pytester: pytest.Pytester, **overrides: object) -> None:
+    """Create ``pytest.ini`` with one ``key = value`` line per override under ``[pytest]``."""
+    header = "[pytest]"
+    settings = [f"{key} = {value}" for key, value in overrides.items()]
+    content = "\n".join([header, *settings]) + "\n"
+    pytester.makefile(".ini", pytest=content)
+
+
+def test_sift_class_step_false_skips_class_steps(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """With ``sift_class_step=false`` no class step is recorded; methods parent to the module."""
+    _write_ini(pytester, sift_class_step="false")
+    pytester.makepyfile(
+        test_noclass=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(json.loads(steps_file.read_text()))
+    assert "TestFoo" not in grouped
+    module_id = grouped["test_noclass.py"][0]["id"]
+    assert grouped["test_a"][0]["parent_step_id"] == module_id
+    assert grouped["test_b"][0]["parent_step_id"] == module_id
+
+
+def test_sift_module_step_false_skips_module_step(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """With ``sift_module_step=false`` no module step is recorded; the class has no parent."""
+    _write_ini(pytester, sift_module_step="false")
+    pytester.makepyfile(
+        test_nomod=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=1)
+    grouped = _by_name(json.loads(steps_file.read_text()))
+    assert "test_nomod.py" not in grouped
+    # TestFoo hangs off the report root, which the fake records as no parent.
+    assert grouped["TestFoo"][0]["parent_step_id"] is None
+    assert grouped["test_a"][0]["parent_step_id"] == grouped["TestFoo"][0]["id"]
+
+
+def test_sift_parametrize_nesting_false_keeps_flat_leaves(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """With ``sift_parametrize_nesting=false`` each case is a flat ``test_a[...]`` leaf."""
+    _write_ini(pytester, sift_parametrize_nesting="false")
+    pytester.makepyfile(
+        test_flat=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("v", [1, 2])
+            def test_a(v):
+                pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(json.loads(steps_file.read_text()))
+    # Neither a shared parent step nor ``v=...`` style leaves are recorded.
+    assert "test_a" not in grouped
+    assert "v=1" not in grouped
+    # The leaves carry pytest's bracketed names instead.
+    assert "test_a[1]" in grouped
+    assert "test_a[2]" in grouped
+    module_id = grouped["test_flat.py"][0]["id"]
+    assert grouped["test_a[1]"][0]["parent_step_id"] == module_id
+    assert grouped["test_a[2]"][0]["parent_step_id"] == module_id
+
+
+def test_sift_module_step_false_still_drains_across_modules(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """With ``sift_module_step=false``, same-named classes in two modules stay separate.
+
+    Modules ``test_a`` and ``test_b`` each declare ``class TestFoo``. The
+    recorded tree must contain two ``TestFoo`` steps, with ``test_x`` and
+    ``test_y`` parented to different ones.
+    """
+    _write_ini(pytester, sift_module_step="false")
+    pytester.makepyfile(
+        test_a=dedent(
+            """
+            class TestFoo:
+                def test_x(self):
+                    pass
+            """
+        ),
+        test_b=dedent(
+            """
+            class TestFoo:
+                def test_y(self):
+                    pass
+            """
+        ),
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2)
+    recorded = json.loads(steps_file.read_text())
+    grouped = _by_name(recorded)
+    # One TestFoo step per module, never a single shared frame.
+    assert len(grouped["TestFoo"]) == 2
+    class_ids = {step["id"] for step in grouped["TestFoo"]}
+    # The two methods must point at different TestFoo steps.
+    parent_of_x = grouped["test_x"][0]["parent_step_id"]
+    parent_of_y = grouped["test_y"][0]["parent_step_id"]
+    assert parent_of_x in class_ids
+    assert parent_of_y in class_ids
+    assert parent_of_x != parent_of_y
+
+
+def test_package_step_default_opens_for_init_dirs(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """By default a directory containing ``__init__.py`` gets a step that parents its modules."""
+    pytester.mkpydir("pkg_a")
+    (pytester.path / "pkg_a" / "test_x.py").write_text(
+        dedent(
+            """
+            def test_one():
+                pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=1)
+    recorded = json.loads(steps_file.read_text())
+    grouped = _by_name(recorded)
+    assert "pkg_a" in grouped
+    package_id = grouped["pkg_a"][0]["id"]
+    module_step = grouped["test_x.py"][0]
+    assert module_step["parent_step_id"] == package_id
+
+
+def test_same_named_packages_in_different_dirs_do_not_merge(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """Same-named packages under different directories get separate package steps.
+
+    ``proj_a/`` and ``proj_b/`` are bare directories (no ``__init__.py``), each
+    holding a ``utils`` package. The hierarchy diff is meant to key on
+    ``nodeid`` rather than display name, with the chain walker skipping the
+    bare ``pytest.Dir`` nodes, so the report tree is expected to contain two
+    distinct ``utils`` parent steps rather than one silently merged step.
+    The assertions below check exactly that outcome.
+    """
+    (pytester.path / "proj_a" / "utils").mkdir(parents=True)
+    (pytester.path / "proj_a" / "utils" / "__init__.py").touch()
+    (pytester.path / "proj_a" / "utils" / "test_x.py").write_text(
+        dedent(
+            """
+            def test_one():
+                pass
+            """
+        )
+    )
+    (pytester.path / "proj_b" / "utils").mkdir(parents=True)
+    (pytester.path / "proj_b" / "utils" / "__init__.py").touch()
+    (pytester.path / "proj_b" / "utils" / "test_y.py").write_text(
+        dedent(
+            """
+            def test_two():
+                pass
+            """
+        )
+    )
+    # Use the ``importlib`` import mode: with sys.path-based importing the
+    # two identically named packages would collide.
+    run = pytester.runpytest_subprocess("-v", "--import-mode=importlib")
+    run.assert_outcomes(passed=2)
+    recorded = json.loads(steps_file.read_text())
+    grouped = _by_name(recorded)
+    # One ``utils`` package step per project directory.
+    assert len(grouped["utils"]) == 2
+    package_ids = {step["id"] for step in grouped["utils"]}
+    # The two module steps must point at different ``utils`` steps.
+    parent_of_x = grouped["test_x.py"][0]["parent_step_id"]
+    parent_of_y = grouped["test_y.py"][0]["parent_step_id"]
+    assert parent_of_x in package_ids
+    assert parent_of_y in package_ids
+    assert parent_of_x != parent_of_y
+
+
+def test_sift_package_step_false_skips_package_steps(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """``sift_package_step=false`` records no package step; the module has no parent."""
+    _write_ini(pytester, sift_package_step="false")
+    pytester.mkpydir("pkg_a")
+    (pytester.path / "pkg_a" / "test_x.py").write_text(
+        dedent(
+            """
+            def test_one():
+                pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=1)
+    recorded = json.loads(steps_file.read_text())
+    grouped = _by_name(recorded)
+    assert "pkg_a" not in grouped
+    # The module step is still recorded, now as the top-level frame.
+    assert grouped["test_x.py"][0]["parent_step_id"] is None
+
+
+def test_all_three_flags_false_matches_legacy_behavior(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """Module, class and parametrize nesting all off: only parentless bracketed leaves."""
+    _write_ini(
+        pytester,
+        sift_module_step="false",
+        sift_class_step="false",
+        sift_parametrize_nesting="false",
+    )
+    pytester.makepyfile(
+        test_legacy=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_subprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(json.loads(steps_file.read_text()))
+    # No module, class, or parametrize parent is recorded, only the leaves.
+    assert "test_legacy.py" not in grouped
+    assert "TestFoo" not in grouped
+    assert "test_a" not in grouped
+    assert "test_a[1]" in grouped
+    assert "test_a[2]" in grouped
+    assert grouped["test_a[1]"][0]["parent_step_id"] is None
+    assert grouped["test_a[2]"][0]["parent_step_id"] is None
+
+
+# ---------------------------------------------------------------------------
+# Parametrize nesting
+# ---------------------------------------------------------------------------
+
+
+def test_single_parametrize_clusters_under_originalname(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_rail=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("v", [3.3, 5.0])
+            def test_rail(v):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # Module step + one shared `test_rail` parent + two leaves.
+    assert len(by_name["test_rail.py"]) == 1
+    assert len(by_name["test_rail"]) == 1
+    assert len(by_name["v=3.3"]) == 1
+    assert len(by_name["v=5.0"]) == 1
+    test_rail_id = by_name["test_rail"][0]["id"]
+    assert by_name["v=3.3"][0]["parent_step_id"] == test_rail_id
+    assert by_name["v=5.0"][0]["parent_step_id"] == test_rail_id
+
+
+def test_stacked_parametrize_nests_outer_to_inner(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_iso=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("voltage", ["high", "low"])
+            @pytest.mark.parametrize("component", ["motor", "ducer"])
+            def test_iso(voltage, component):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=4)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # One `test_iso` parent, two `voltage='…'` parents, four `component='…'` leaves.
+    assert len(by_name["test_iso"]) == 1
+    assert len(by_name["voltage='high'"]) == 1
+    assert len(by_name["voltage='low'"]) == 1
+    assert len(by_name["component='motor'"]) == 2  # one per voltage
+    assert len(by_name["component='ducer'"]) == 2
+    test_iso_id = by_name["test_iso"][0]["id"]
+    vh_id = by_name["voltage='high'"][0]["parent_step_id"]
+    vl_id = by_name["voltage='low'"][0]["parent_step_id"]
+    assert vh_id == test_iso_id
+    assert vl_id == test_iso_id
+    # Each component leaf parents to one of the voltage parents.
+    voltage_ids = {
+        by_name["voltage='high'"][0]["id"],
+        by_name["voltage='low'"][0]["id"],
+    }
+    for leaf in by_name["component='motor'"] + by_name["component='ducer'"]:
+        assert leaf["parent_step_id"] in voltage_ids
+
+
+def test_fixture_parametrization_participates(pytester: pytest.Pytester, steps_file: Path) -> None:
+    pytester.makepyfile(
+        test_widget=dedent(
+            """
+            import pytest
+
+            @pytest.fixture(params=["a", "b"])
+            def widget(request):
+                return request.param
+
+            def test_widget(widget):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    assert len(by_name["test_widget"]) == 1
+    parent_id = by_name["test_widget"][0]["id"]
+    assert by_name["widget='a'"][0]["parent_step_id"] == parent_id
+    assert by_name["widget='b'"][0]["parent_step_id"] == parent_id
+
+
+def test_module_boundary_isolates_parametrize_stack(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_a=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("v", [1, 2])
+            def test_one(v):
+                pass
+            """
+        ),
+        test_b=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("w", ["x", "y"])
+            def test_two(w):
+                pass
+            """
+        ),
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=4)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # Each module step contains its own `test_one`/`test_two` parametrize subtree.
+    mod_a = by_name["test_a.py"][0]
+    mod_b = by_name["test_b.py"][0]
+    assert by_name["test_one"][0]["parent_step_id"] == mod_a["id"]
+    assert by_name["test_two"][0]["parent_step_id"] == mod_b["id"]
+
+
+def test_leaf_parent_chain_terminates_at_report(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_chain=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("a", [1])
+            @pytest.mark.parametrize("b", ["x"])
+            def test_chain(a, b):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = json.loads(steps_file.read_text())
+    leaf = next(s for s in steps if s["name"].startswith("b="))
+    chain = _ancestor_names(steps, leaf)
+    # leaf b=… → a=… → test_chain → test_chain.py (module step) → root
+    assert chain == ["b='x'", "a=1", "test_chain", "test_chain.py"]
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_offline.py b/python/lib/sift_client/_tests/pytest_plugin/test_offline.py
new file mode 100644
index 000000000..f0470bad3
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_offline.py
@@ -0,0 +1,135 @@
+"""Tests for ``--sift-offline`` mode.
+
+Offline mode routes every create/update through the JSONL log file without
+contacting Sift. The session-start ping is skipped, the import worker is not
+spawned, and missing ``SIFT_*`` env vars are tolerated (placeholders are
+filled). Combining ``--sift-offline`` with ``--sift-log-file=none`` is rejected
+as a usage error, since the log file is the sole sink in this mode.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestOfflineMode:
+    def test_offline_runs_without_network(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Offline mode constructs the client locally and never pings."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_in_bounds(step):
+                assert step.measure(name="v", value=5.0, bounds={"min": 4.8, "max": 5.2})
+
+            def test_out_of_bounds(step):
+                assert step.measure(name="v", value=10.0, bounds={"max": 5.2}) is False
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-offline")
+        result.assert_outcomes(passed=2)
+
+    def test_log_file_none_incompatible_with_offline(
+        self,
+        pytester: pytest.Pytester,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``--sift-log-file=none`` + ``--sift-offline`` is a usage error."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_should_not_run(): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", "--sift-log-file=none")
+        assert result.ret != 0
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "incompatible with --sift-offline" in combined, combined
+
+    def test_offline_yields_real_fixtures(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Offline mode runs a real ReportContext; entities still report `is_simulated=True` because the log-file path synthesizes responses prior to replay."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            from sift_client.util.test_results import ReportContext
+            from sift_client.util.test_results.context_manager import NewStep
+
+            def test_types(step, report_context):
+                assert isinstance(report_context, ReportContext)
+                assert isinstance(step, NewStep)
+                assert report_context.client._simulate is False
+                # log-file mode synthesizes responses, so entities are flagged simulated.
+                assert report_context.is_simulated is True
+                assert step.current_step.is_simulated is True
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-offline")
+        result.assert_outcomes(passed=1)
+
+    def test_offline_writes_jsonl_to_pinned_log_file(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Offline mode populates the pinned JSONL file with create/update entries."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_one(step):
+                assert step.measure(
+                    name="v", value=5.0, bounds={"min": 4.8, "max": 5.2}
+                ) is True
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        assert log_path.exists(), f"offline mode did not create {log_path}"
+        content = log_path.read_text()
+        assert content.strip(), "log file is empty"
+        # Each non-empty line is ``[Operation:uuid] {json}``. A successful
+        # session produces at least the report create + step create lines.
+        lines = [line for line in content.splitlines() if line.strip()]
+        assert any(line.startswith("[CreateTestReport:") for line in lines), content
+        assert any(line.startswith("[CreateTestStep:") for line in lines), content
+
+    def test_offline_skips_client_has_connection(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+    ) -> None:
+        """Offline mode never resolves ``client_has_connection``.
+
+        Override the fixture to raise on resolution. If the override is
+        invoked, the session aborts. If it isn't, the inner test passes
+        cleanly, which confirms the offline path skipped the ping check.
+        """
+        pytester.makeconftest(
+            """
+            import pytest
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+            @pytest.fixture(scope="session")
+            def client_has_connection():
+                raise AssertionError(
+                    "client_has_connection should not resolve in offline mode"
+                )
+            """
+        )
+        pytester.makepyfile("def test_runs(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline")
+        result.assert_outcomes(passed=1)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_online.py b/python/lib/sift_client/_tests/pytest_plugin/test_online.py
new file mode 100644
index 000000000..876fffb0e
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_online.py
@@ -0,0 +1,133 @@
+"""Tests for online mode (the default).
+
+Online mode requires connectivity to Sift. The plugin pings via
+``client_has_connection`` at session start and aborts with
+``pytest.UsageError`` on failure. Missing ``SIFT_API_KEY`` /
+``SIFT_GRPC_URI`` / ``SIFT_REST_URI`` env vars are reported as a usage error
+so the failure is actionable.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestOnlineMode:
+    def test_ping_failure_aborts(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+    ) -> None:
+        """Online mode with an unreachable ping aborts the session via UsageError."""
+        pytester.makeconftest(
+            """
+            import pytest
+            from unittest.mock import MagicMock
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+            @pytest.fixture(scope="session")
+            def sift_client():
+                client = MagicMock()
+                client.ping.ping.side_effect = ConnectionError("unreachable")
+                return client
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_should_not_run():
+                assert True
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        assert result.ret != 0
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "Sift ping failed" in combined, combined
+
+    def test_missing_env_vars_named_in_error(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """The default ``sift_client`` fixture names missing env vars in its error."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_should_not_run():
+                pass
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        assert result.ret != 0
+        combined = "\n".join(result.outlines + result.errlines)
+        for var in ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI"):
+            assert var in combined, combined
+
+    def test_online_resolves_client_has_connection_once(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+    ) -> None:
+        """Online mode resolves ``client_has_connection`` exactly once at session start.
+
+        Overrides the fixture to bump a counter persisted to a file the outer
+        test reads after the inner session finishes. Outcomes aren't asserted
+        because the real ``ReportContext`` constructed against a ``MagicMock``
+        client crashes downstream when Pydantic sees mock IDs; what we're
+        verifying is the ping path itself, which runs before construction.
+        """
+        counter_file = tmp_path / "ping_calls.txt"
+        pytester.makeconftest(
+            f"""
+            from pathlib import Path
+            from unittest.mock import MagicMock
+
+            import pytest
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+            _COUNTER = Path({str(counter_file)!r})
+
+
+            @pytest.fixture(scope="session")
+            def sift_client():
+                return MagicMock()
+
+
+            @pytest.fixture(scope="session")
+            def client_has_connection():
+                prior = int(_COUNTER.read_text()) if _COUNTER.exists() else 0
+                _COUNTER.write_text(str(prior + 1))
+                return True
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_a(): pass
+
+            @pytest.mark.sift_include
+            def test_b(): pass
+            """
+        )
+        pytester.runpytest_subprocess()
+        assert counter_file.exists(), "client_has_connection was not resolved"
+        assert counter_file.read_text() == "1", (
+            f"expected session-scoped fixture to resolve once, got {counter_file.read_text()}"
+        )
diff --git a/python/lib/sift_client/_tests/util/conftest.py b/python/lib/sift_client/_tests/util/conftest.py
index 45279cca6..9e255da8a 100644
--- a/python/lib/sift_client/_tests/util/conftest.py
+++ b/python/lib/sift_client/_tests/util/conftest.py
@@ -1,14 +1,35 @@
-import pytest
+from pathlib import Path
 
+import pytest
 
-def pytest_addoption(parser: pytest.Parser) -> None:
-    existing_options = [opt.names() for opt in parser._anonymous.options]
-    # Flatten the list of lists into a single list of strings
-    flat_options = [item for sublist in existing_options for item in sublist]
-    if not any("--sift-test-results-log-file" in name for name in flat_options):
-        parser.addoption("--sift-test-results-log-file", action="store_true", default=False)
+_HERE = Path(__file__).parent
 
 
 def pytest_configure(config: pytest.Config) -> None:
     """Configure the pytest configuration to disable the Sift test results log file."""
-    config.option.sift_test_results_log_file = False
+    config.option.sift_log_file = False
+
+
+def pytest_collection_modifyitems(config: pytest.Config, items: "list[pytest.Item]") -> None:
+    """Bulk-apply ``@pytest.mark.sift_include`` to integration tests under util/.
+
+    The project-wide default in ``pyproject.toml`` is
+    ``sift_autouse = false`` so unit tests pay nothing for the globally-loaded Sift plugin.
+    Integration tests in this subtree still need the autouse fixtures, so this
+    hook flips the gate back on for any test already marked
+    ``@pytest.mark.integration``. Unit tests in the same directory (e.g.
+    ``test_cel_utils.py``) are left alone.
+
+    ``pytest_collection_modifyitems`` receives all items in the session (pytest
+    does not auto-scope it to the conftest's directory), so we filter by path
+    explicitly. ``Path.relative_to`` is the 3.8-compatible form of the path
+    containment check (``Path.is_relative_to`` arrived in 3.9).
+    """
+    for item in items:
+        try:
+            item.path.relative_to(_HERE)
+        except ValueError:
+            continue
+        if item.get_closest_marker("integration") is None:
+            continue
+        item.add_marker(pytest.mark.sift_include)
diff --git a/python/lib/sift_client/_tests/util/test_report_context.py b/python/lib/sift_client/_tests/util/test_report_context.py
new file mode 100644
index 000000000..e92e57bb8
--- /dev/null
+++ b/python/lib/sift_client/_tests/util/test_report_context.py
@@ -0,0 +1,99 @@
+"""Tier 1 tests for `ReportContext.__exit__`'s replay-worker handling.
+
+Each test substitutes the `import-test-result-log` argv with a tiny Python
+`-c` invocation that produces a controlled end-state (clean exit / hang /
+non-zero exit), then enters and exits a `ReportContext` against a
+simulate-mode `SiftClient`. This validates that real subprocess outcomes
+route to the right branch of `__exit__` without depending on the real
+replay binary or a Sift backend.
+"""
+
+from __future__ import annotations
+
+import sys
+import warnings
+
+import pytest
+
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client.errors import SiftWarning
+from sift_client.util.test_results import ReportContext
+
+
+def _make_simulate_client() -> SiftClient:
+    """Build a SiftClient flagged for in-process simulation.
+
+    Constructor URLs are placeholders; nothing dials them because every
+    test-results write short-circuits through the simulate path.
+    """
+    client = SiftClient(
+        connection_config=SiftConnectionConfig(
+            api_key="test",
+            grpc_url="test.invalid:0",
+            rest_url="http://test.invalid",
+        )
+    )
+    client._simulate = True
+    return client
+
+
+def _make_context(command: list[str], *, timeout: float = 0.5) -> ReportContext:
+    """Build a ReportContext whose replay subprocess is the provided command.
+
+    ``log_file=True`` triggers the temp-file path so ``_open_import_proc`` fires
+    on ``__enter__``. The substitute argv is swapped in via the public-ish
+    ``_build_replay_command`` hook so the production Popen kwargs stay
+    exercised. ``timeout`` overrides the worker grace window so tests don't
+    wait the full production timeout for the timeout branch to trigger.
+    """
+    rc = ReportContext(_make_simulate_client(), name="test", log_file=True)
+    rc._build_replay_command = lambda: command  # type: ignore[method-assign]
+    rc._import_proc_timeout = timeout
+    return rc
+
+
+def test_worker_clean_exit_is_silent() -> None:
+    """Worker exits with code 0 → __exit__ emits no SiftWarning (case 1)."""
+    rc = _make_context([sys.executable, "-c", "pass"])
+    with warnings.catch_warnings(record=True) as recorded:
+        warnings.simplefilter("always")
+        with rc:
+            pass
+    sift_warnings = [w for w in recorded if issubclass(w.category, SiftWarning)]
+    assert sift_warnings == []
+    assert rc._import_proc is not None
+    assert rc._import_proc.returncode == 0
+
+
+def test_worker_timeout_kills_and_warns() -> None:
+    """Worker still running at session end → kill + SiftWarning, no raise (case 2)."""
+    rc = _make_context([sys.executable, "-c", "import time; time.sleep(30)"], timeout=0.2)
+    with pytest.warns(SiftWarning) as recorded:
+        with rc:
+            pass
+    assert rc._import_proc is not None
+    # `kill()` + `wait()` were called; process is dead.
+    assert rc._import_proc.poll() is not None
+    messages = "\n".join(str(w.message) for w in recorded)
+    assert "did not exit in 0.2s" in messages
+    assert "import-test-result-log" in messages
+
+
+def test_worker_nonzero_exit_warns_stderr_no_raise() -> None:
+    """Worker exits non-zero with stderr → SiftWarning with stderr + replay hint, no raise (case 3)."""
+    rc = _make_context(
+        [
+            sys.executable,
+            "-c",
+            "import sys; sys.stderr.write('rpc deadline exceeded'); sys.exit(2)",
+        ]
+    )
+    with pytest.warns(SiftWarning) as recorded:
+        with rc:
+            pass
+    assert rc._import_proc is not None
+    assert rc._import_proc.returncode == 2
+    messages = "\n".join(str(w.message) for w in recorded)
+    assert "exited with code 2" in messages
+    assert "rpc deadline exceeded" in messages
+    assert "import-test-result-log" in messages
diff --git a/python/lib/sift_client/_tests/util/test_test_results_utils.py b/python/lib/sift_client/_tests/util/test_test_results_utils.py
index 256803769..4fd6ab112 100644
--- a/python/lib/sift_client/_tests/util/test_test_results_utils.py
+++ b/python/lib/sift_client/_tests/util/test_test_results_utils.py
@@ -385,6 +385,46 @@ def test_report_outcome(self, report_context, step):
         if not initial_any_failures:
             report_context.any_failures = False
 
+    def test_measurements_passed_property(self, report_context, step):
+        """``step.measurements_passed`` counts only direct ``measure*`` calls
+        on this step, and stays True when only a substep or ``report_outcome``
+        records a failure.
+        """
+        current_step_path = step.current_step.step_path
+        initial_open_step_result = report_context.open_step_results.get(current_step_path, True)
+        initial_any_failures = report_context.any_failures
+
+        # No measurements yet, vacuously True.
+        assert step.measurements_passed is True
+
+        # In-bounds measurement keeps it True.
+        step.measure(name="ok", value=1.0, bounds={"min": 0.0, "max": 2.0})
+        assert step.measurements_passed is True
+
+        # A failing report_outcome doesn't flip measurements_passed because
+        # it isn't a direct measure() call on this step.
+        step.report_outcome("substep-fail", False, "deliberately failing")
+        assert step.measurements_passed is True
+
+        # Out-of-bounds measurement flips ``measurements_passed`` False.
+        step.measure(name="bad", value=99.0, bounds={"min": 0.0, "max": 2.0})
+        assert step.measurements_passed is False
+
+        # measure_avg / measure_all go through ``measure`` internally, so an out-of-bounds
+        # mean also counts; the flag is already False here, so this checks it stays False.
+        step.measure_avg(
+            name="bad_avg",
+            values=[50.0, 60.0, 70.0],  # mean 60 is well outside [0, 2]
+            bounds={"min": 0.0, "max": 2.0},
+        )
+        assert step.measurements_passed is False
+
+        # Restore state.
+        if initial_open_step_result:
+            report_context.open_step_results[current_step_path] = True
+        if not initial_any_failures:
+            report_context.any_failures = False
+
     def test_bad_assert(self, report_context, step):
         # Capture current state of report context's failures so we can keep things passed at a high level if the test's induced failures happen as expected.
         current_step_path = step.current_step.step_path
diff --git a/python/lib/sift_client/client.py b/python/lib/sift_client/client.py
index 95fd25b71..ff574adba 100644
--- a/python/lib/sift_client/client.py
+++ b/python/lib/sift_client/client.py
@@ -152,6 +152,11 @@ def __init__(
         WithGrpcClient.__init__(self, grpc_client=grpc_client)
         WithRestClient.__init__(self, rest_client=rest_client)
 
+        # When set, test-results writes return synthesized responses without
+        # contacting Sift. Read by `TestResultsAPIAsync._simulate`. Used by the
+        # pytest plugin's ``--sift-disabled`` mode.
+        self._simulate: bool = False
+
         self.ping = PingAPI(self)
         self.assets = AssetsAPI(self)
         self.calculated_channels = CalculatedChannelsAPI(self)
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
new file mode 100644
index 000000000..7c4c1c2f5
--- /dev/null
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -0,0 +1,939 @@
+from __future__ import annotations
+
+import os
+import warnings
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Tuple
+
+import pytest
+
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client.errors import SiftWarning
+from sift_client.sift_types.test_report import TestStatus
+from sift_client.util.test_results import ReportContext
+
+
+class SiftPytestPluginWarning(SiftWarning):
+    """Base warning for issues raised by the Sift pytest plugin."""
+
+
+class SiftPytestStepDrainWarning(SiftPytestPluginWarning):
+    """A step's ``__exit__`` raised while the plugin was draining its stack.
+
+    Surfaced at module-teardown or session-end so the drain can continue and
+    pytest test outcomes stay unaffected; the underlying exception is included
+    in the message for debugging.
+    """
+
+
+class SiftPytestStepDrainError(RuntimeError):
+    """Raised when mid-session drain fails — signals a likely upstream invariant break."""
+
+
+if TYPE_CHECKING:
+    from sift_client.util.test_results.context_manager import NewStep
+
+REPORT_CONTEXT: Any = None
+
+_STASH_MISSING = object()
+
+_PARAMETRIZE_PATH_KEY = pytest.StashKey[Tuple[str, ...]]()
+# Each frame: (path_key, open step). Frames are shared across sibling test items
+# and drained at session end.
+_PARAMETRIZE_STACK: list[tuple[str, Any]] = []
+
+_HIERARCHY_KEY = pytest.StashKey[Tuple[Tuple[str, str, "str | None", bool], ...]]()
+# Outer-to-inner frames for the item's collection-tree ancestors. Each chain
+# entry is ``(identity, name, doc, rendered)``:
+#   - ``identity``: a globally-unique key (``node.nodeid``) used for diff
+#     comparison. Two ancestors at the same depth with the same display name
+#     but reached via different paths (e.g., ``proj_a/utils`` and
+#     ``proj_b/utils`` in a monorepo) get distinct identities, so they never
+#     silently merge in the diff.
+#   - ``name``: the human-readable step name used when ``rendered`` opens the
+#     Sift step.
+#   - ``doc``: docstring used for the step description if rendered.
+#   - ``rendered``: True iff the corresponding ``sift_*_step`` ini flag is on.
+#     Non-rendered frames participate in the diff but do not call
+#     ``rc.new_step(...)`` — they appear with ``ns=None`` in the stack.
+#
+# Stack entries: ``(identity, name, open_step_or_None)``. Frames are shared
+# across sibling test items and drained at session end. Drained AFTER
+# _PARAMETRIZE_STACK since parametrize parents nest inside hierarchy parents.
+_HIERARCHY_STACK: list[tuple[str, str, Any]] = []
+
+
+def _drain_step_stack(stack: list, *, swallow_errors: bool = True) -> None:
+    """Close every open frame on ``stack``, innermost first, leaving it empty.
+
+    Frames whose step is ``None`` are diff-only placeholders and are simply
+    discarded. An ``Exception`` raised by a step's ``__exit__`` never stops
+    the loop, so the remaining frames are still closed; how the failure is
+    reported depends on ``swallow_errors``:
+
+    * ``True`` (default): each failure is emitted as a
+      ``SiftPytestStepDrainWarning`` and nothing is raised.
+    * ``False``: failures are collected and, once the stack is empty, a
+      ``SiftPytestStepDrainError`` is raised that reports the failure count
+      and is chained from the first captured exception.
+    """
+    failures: list[tuple[str, BaseException]] = []
+    while stack:
+        # Frames are ``(name, ns)`` or ``(identity, name, ns)``; indexing
+        # from the end handles both shapes.
+        frame = stack.pop()
+        label, open_step = frame[-2], frame[-1]
+        # A ``None`` step marks a non-rendered diff-only frame: nothing to
+        # close, so it is just dropped.
+        if open_step is not None:
+            try:
+                open_step.__exit__(None, None, None)
+            except Exception as exc:
+                if not swallow_errors:
+                    failures.append((label, exc))
+                    continue
+                warnings.warn(
+                    f"Sift plugin: closing step {label!r} during drain raised "
+                    f"{type(exc).__name__}: {exc}",
+                    SiftPytestStepDrainWarning,
+                    stacklevel=2,
+                )
+    if not failures:
+        return
+    culprit, cause = failures[0]
+    raise SiftPytestStepDrainError(
+        f"Sift plugin: {len(failures)} step(s) raised while draining mid-session; "
+        f"first failure on {culprit!r}: {type(cause).__name__}: {cause}"
+    ) from cause
+
+
+def _drain_parametrize_stack(*, swallow_errors: bool = True) -> None:
+    _drain_step_stack(_PARAMETRIZE_STACK, swallow_errors=swallow_errors)
+
+
+def _drain_hierarchy_stack(*, swallow_errors: bool = True) -> None:
+    _drain_step_stack(_HIERARCHY_STACK, swallow_errors=swallow_errors)
+
+
+def _close_frame(name: str, ns: Any) -> None:
+    """Exit one open step, downgrading any ``Exception`` it raises to a warning.
+
+    ``ns`` is the step to close; ``None`` marks a non-rendered diff-only frame
+    and is ignored. ``name`` only labels the ``SiftPytestStepDrainWarning``
+    emitted when ``__exit__`` fails.
+    """
+    if ns is not None:
+        try:
+            ns.__exit__(None, None, None)
+        except Exception as exc:
+            # Warn rather than raise so the caller can go on closing other frames.
+            warnings.warn(
+                f"Sift plugin: closing step {name!r} raised {type(exc).__name__}: {exc}",
+                SiftPytestStepDrainWarning,
+                stacklevel=2,
+            )
+
+
+def _build_parametrize_path(item: pytest.Item) -> tuple[str, ...]:
+    """Return the step names for a parametrized item, outermost first.
+
+    The first frame is the item's ``originalname`` (or ``name`` if absent),
+    followed by one ``name=value`` frame per ``callspec.params`` entry in
+    reverse order. Items with no callspec or no params yield ``()``.
+    """
+    spec = getattr(item, "callspec", None)
+    if spec is not None and spec.params:
+        base = getattr(item, "originalname", item.name)
+        # Reversed so the TOP ``parametrize`` decorator ends up outermost.
+        axes = [f"{key}={val!r}" for key, val in reversed(spec.params.items())]
+        return (base, *axes)
+    return ()
+
+
+def _build_hierarchy_chain(
+    item: pytest.Item | pytest.Collector,
+    config: pytest.Config,
+) -> tuple[tuple[str, str, str | None, bool], ...]:
+    """Outer-to-inner ``(identity, name, docstring, rendered)`` for collection ancestors.
+
+    Walks from ``item`` up through ``.parent`` and records every
+    ``pytest.Package``, ``pytest.Module``, and ``pytest.Class`` node it meets;
+    any other node type is skipped. All three layers are always recorded,
+    whatever the ini flags say, so they can take part in the stack diff.
+
+    Fields of each entry:
+
+    * ``identity`` — ``node.nodeid``; the diff compares on this rather than
+      on the display name, so same-named ancestors reached via different
+      paths stay distinct.
+    * ``name`` — ``node.name``, the display name for the step.
+    * ``docstring`` — stripped ``node.obj.__doc__``, or ``None`` when it is
+      missing, empty, or raises on access.
+    * ``rendered`` — value of the layer's ini flag (``sift_package_step`` /
+      ``sift_module_step`` / ``sift_class_step``).
+
+    ``node.obj`` may import the underlying object and raise arbitrary
+    exceptions, hence the broad ``except`` around the docstring lookup: a
+    misbehaving collector must not abort collection.
+    """
+    include_package = bool(_option_or_ini(config, _PACKAGE_STEP))
+    include_module = bool(_option_or_ini(config, _MODULE_STEP))
+    include_class = bool(_option_or_ini(config, _CLASS_STEP))
+
+    chain: list[tuple[str, str, str | None, bool]] = []
+    # ``node.parent`` is an internal pytest type; widen to ``Any`` for the walk.
+    node: Any = item
+    while node is not None:
+        # Package is tested before Module: on pytest < 8 ``Package`` subclasses
+        # ``Module``, so the Module branch would otherwise claim packages too.
+        if isinstance(node, pytest.Class):
+            rendered = include_class
+        elif isinstance(node, pytest.Package):
+            rendered = include_package
+        elif isinstance(node, pytest.Module):
+            rendered = include_module
+        else:
+            node = node.parent
+            continue
+        try:
+            doc = (
+                (getattr(node, "obj", None) and getattr(node.obj, "__doc__", None)) or ""
+            ).strip() or None
+        except Exception:
+            doc = None
+        chain.append((node.nodeid, node.name, doc, rendered))
+        node = node.parent
+    return tuple(reversed(chain))
+
+
+@dataclass(frozen=True)
+class _Option:
+    """Declarative description of one plugin setting (ini key, optional CLI flag).
+
+    ``ini_name`` doubles as the CLI ``dest`` so both sources share one config
+    slot; ``cli_flag=None`` means ini-only. ``action`` is forwarded to
+    ``addoption``; ``ini_type`` / ``ini_default`` are forwarded to ``addini``.
+    """
+
+    ini_name: str
+    ini_help: str
+    cli_flag: str | None = None
+    cli_help: str | None = None
+    action: str | None = None
+    ini_type: str | None = None
+    ini_default: Any = None
+
+
+_LOG_FILE = _Option(
+    cli_flag="--sift-log-file",
+    ini_name="sift_log_file",
+    cli_help="Path to write the Sift test result log file. "
+    "Use 'true' (default) to auto-create a temp file, "
+    "False, 'false', or 'none' to disable logging, "
+    "or a file path to write to a specific location.",
+    ini_help="Default value for --sift-log-file. Same values accepted as "
+    "the CLI flag (path, 'true', 'false', 'none').",
+)
+
+_GIT_METADATA = _Option(
+    cli_flag="--no-sift-git-metadata",
+    ini_name="sift_git_metadata",
+    action="store_false",
+    cli_help="Exclude git metadata from the Sift test results. "
+    "Git metadata (repo, branch, commit) is included by default.",
+    ini_help="Include git repo/branch/commit in the report (true/false). "
+    "Defaults to true. The --no-sift-git-metadata CLI flag overrides "
+    "this when passed.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_OFFLINE = _Option(
+    cli_flag="--sift-offline",
+    ini_name="sift_offline",
+    action="store_true",
+    cli_help="Run without contacting Sift. All create/update calls are written "
+    "to a JSONL log file for later replay via `import-test-result-log`. "
+    "No session-start ping is attempted.",
+    ini_help="When true, run in offline mode (same effect as --sift-offline). Defaults to false.",
+    ini_type="bool",
+    ini_default=False,
+)
+
+_DISABLED = _Option(
+    cli_flag="--sift-disabled",
+    ini_name="sift_disabled",
+    action="store_true",
+    cli_help="Disable Sift integration entirely. Nothing contacts the API "
+    "and no log file is written. `step.measure(...)` still returns real "
+    "pass/fail booleans. Returned entities expose `is_simulated == True`. "
+    "Also honored via the `SIFT_DISABLED` env var. Supersedes every other "
+    "flag.",
+    ini_help="When true, run in disabled mode (same effect as --sift-disabled). "
+    "Also honored via the SIFT_DISABLED env var. Supersedes every other "
+    "setting. Defaults to false.",
+    ini_type="bool",
+    ini_default=False,
+)
+
+_GRPC_URI = _Option(
+    ini_name="sift_grpc_uri",
+    ini_help="Sift gRPC endpoint URI. The default `sift_client` fixture "
+    "prefers the SIFT_GRPC_URI environment variable and falls back to "
+    "this ini value.",
+)
+
+_REST_URI = _Option(
+    ini_name="sift_rest_uri",
+    ini_help="Sift REST endpoint URI. The default `sift_client` fixture "
+    "prefers the SIFT_REST_URI environment variable and falls back to "
+    "this ini value.",
+)
+
+_AUTOUSE = _Option(
+    ini_name="sift_autouse",
+    ini_help="Default for the Sift autouse fixtures (report_context, step, "
+    "_hierarchy_parents, _parametrize_parents). When true (default), tests "
+    "are included unless marked with @pytest.mark.sift_exclude. When false, "
+    "tests are skipped unless marked with @pytest.mark.sift_include. "
+    "Bulk-apply markers in a directory's conftest via "
+    "`pytest_collection_modifyitems`.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_PACKAGE_STEP = _Option(
+    ini_name="sift_package_step",
+    ini_help="When true (default), open a parent step for each Python package "
+    "(directory with an ``__init__.py``) in the test path. Set to false to "
+    "flatten package grouping.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_MODULE_STEP = _Option(
+    ini_name="sift_module_step",
+    ini_help="When true (default), open a per-module parent step. Set to false "
+    "to skip module-level grouping in the report tree.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_CLASS_STEP = _Option(
+    ini_name="sift_class_step",
+    ini_help="When true (default), open per-class parent steps (including nested "
+    "classes). Set to false to keep class methods at module level.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_PARAMETRIZE_NESTING = _Option(
+    ini_name="sift_parametrize_nesting",
+    ini_help="When true (default), parametrized tests nest under shared parent "
+    "steps (e.g. test_a -> v=1, v=2). Set to false to keep the flat per-test "
+    "leaf naming (test_a[1], test_a[2]).",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_OPTIONS: tuple[_Option, ...] = (
+    _LOG_FILE,
+    _GIT_METADATA,
+    _OFFLINE,
+    _DISABLED,
+    _GRPC_URI,
+    _REST_URI,
+    _AUTOUSE,
+    _PACKAGE_STEP,
+    _MODULE_STEP,
+    _CLASS_STEP,
+    _PARAMETRIZE_NESTING,
+)
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Declare every entry of ``_OPTIONS`` with pytest.
+
+    Options that define a ``cli_flag`` get a command-line flag in the "sift"
+    group with ``default=None``; every option also gets an ini key named
+    ``ini_name``. ``action`` and ``ini_type`` are forwarded only when set.
+    """
+    sift_group = parser.getgroup("sift", description="Sift test results")
+    for option in _OPTIONS:
+        if option.cli_flag is not None:
+            # ``default=None`` lets readers tell "flag not passed" from a value.
+            flag_kwargs: dict[str, Any] = dict(
+                dest=option.ini_name, default=None, help=option.cli_help
+            )
+            if option.action is not None:
+                flag_kwargs.update(action=option.action)
+            sift_group.addoption(option.cli_flag, **flag_kwargs)
+
+        # The ini key is registered for every option, CLI-backed or not.
+        key_kwargs: dict[str, Any] = dict(help=option.ini_help, default=option.ini_default)
+        if option.ini_type is not None:
+            key_kwargs.update(type=option.ini_type)
+        parser.addini(option.ini_name, **key_kwargs)
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Declare the ``sift_include`` / ``sift_exclude`` markers for ``pytest --markers``."""
+    # Each entry is one "name: description" line; the adjacent string literals
+    # are concatenated by Python into a single value.
+    marker_help = (
+        "sift_include: force the Sift autouse fixtures to activate for this test "
+        "regardless of the `sift_autouse` ini default.",
+        "sift_exclude: force the Sift autouse fixtures to skip this test "
+        "regardless of the `sift_autouse` ini default.",
+    )
+    for line in marker_help:
+        config.addinivalue_line("markers", line)
+
+
+def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
+    """Stash per-item hierarchy/parametrize data, then reorder ``items`` in place.
+
+    The sort key is ``(file path, ancestor identities, parametrize path)``,
+    which keeps items that share a parent (package, module, class, or
+    parametrize axis) adjacent. Otherwise an unrelated item sorting between
+    two siblings would close and re-open the shared parent step, producing
+    duplicate parents in the report tree.
+    """
+
+    def _cluster_key(entry: pytest.Item) -> tuple:
+        # ``stash.get`` with a default: items injected by a later third-party
+        # hook were never stashed and must sort rather than raise KeyError.
+        chain = entry.stash.get(_HIERARCHY_KEY, ())
+        identities = tuple(identity for identity, _, _, _ in chain)
+        return (str(entry.path), identities, entry.stash.get(_PARAMETRIZE_PATH_KEY, ()))
+
+    # Stash first: the sort key reads these values back.
+    for item in items:
+        item.stash[_HIERARCHY_KEY] = _build_hierarchy_chain(item, config)
+        item.stash[_PARAMETRIZE_PATH_KEY] = _build_parametrize_path(item)
+    items.sort(key=_cluster_key)
+
+
+def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
+    """Drain any parent steps still open at session end (innermost first).
+
+    The parametrize stack is drained before the hierarchy stack, and the
+    ``try``/``finally`` ensures the hierarchy drain still runs if the first
+    one raises. This is a final safety net: the same stacks are also drained
+    when ``report_context`` tears down and on mid-session hierarchy changes.
+    """
+    try:
+        _drain_parametrize_stack()
+    finally:
+        _drain_hierarchy_stack()
+
+
+def _is_offline(pytestconfig: pytest.Config | None) -> bool:
+    return bool(_option_or_ini(pytestconfig, _OFFLINE))
+
+
+def _is_disabled(pytestconfig: pytest.Config | None) -> bool:
+    # Disabled when either the flag/ini setting or the SIFT_DISABLED env var says so.
+    env_says_disabled = os.getenv("SIFT_DISABLED", "").lower() in ("1", "true", "yes")
+    return bool(_option_or_ini(pytestconfig, _DISABLED)) or env_says_disabled
+
+
+def _sift_enabled_for(node: pytest.Item | pytest.Collector, default: bool) -> bool:
+    """Decide whether Sift reporting applies to ``node``.
+
+    Precedence: ``sift_exclude`` marker, then ``sift_include``, then ``default``.
+    Markers are looked up with ``get_closest_marker`` on the node.
+    """
+    verdicts = (("sift_exclude", False), ("sift_include", True))
+    for marker_name, verdict in verdicts:
+        if node.get_closest_marker(marker_name):
+            return verdict
+    return default
+
+
+def _option_or_ini(pytestconfig: pytest.Config | None, opt: _Option) -> Any:
+    """Look up ``opt`` with CLI-over-ini precedence; ``None`` when unresolved.
+
+    A non-``None`` CLI value wins (flags register with ``default=None``).
+    Otherwise the ini key is read; ``None`` is returned when there is no
+    config or ``getini`` raises ``KeyError`` / ``ValueError``.
+    """
+    value: Any = None
+    if pytestconfig is not None:
+        value = pytestconfig.getoption(opt.ini_name, default=None)
+        if value is None:
+            try:
+                value = pytestconfig.getini(opt.ini_name)
+            except (KeyError, ValueError):
+                value = None
+    return value
+
+
+def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool | None:
+    """Translate the ``sift_log_file`` setting into a ``log_file`` value.
+
+    Mapping of the raw CLI/ini value (strings compared case-insensitively):
+
+    * Python ``False`` (e.g. set from a conftest), ``"false"``, ``"none"``
+      -> ``None``: logging is off.
+    * unset / empty, ``"true"``, ``"1"`` -> ``True``: use a temp file.
+    * anything else -> ``Path(raw)``: write to that location.
+
+    Raises:
+        pytest.UsageError: logging is disabled while offline mode is on;
+            offline mode needs the log file as its only sink.
+    """
+    raw = _option_or_ini(pytestconfig, _LOG_FILE)
+    off_words = ("false", "none")
+    logging_off = raw is False or (isinstance(raw, str) and raw.lower() in off_words)
+    # Offline mode writes only to the log file, so it cannot be switched off.
+    if logging_off and _is_offline(pytestconfig):
+        raise pytest.UsageError(
+            "--sift-log-file=none is incompatible with --sift-offline; offline "
+            "mode requires a log file. Pin one with --sift-log-file=<path>, or "
+            "drop --sift-log-file=none to use a temp file."
+        )
+    # An explicit Python ``False`` disables logging outright.
+    if raw is False:
+        return None
+    if raw:
+        lowered = str(raw).lower()
+        if lowered in off_words:
+            return None
+        if lowered not in ("true", "1"):
+            return Path(raw)
+    # Unset, empty, or an explicit "true" / "1": default to a temp file.
+    return True
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
+    """Record skipped tests as SKIPPED steps and stash each phase's ``call`` on the item."""
+    hook_result = yield
+    phase_report = hook_result.get_result()
+    # Skipped tests bypass the autouse `step` fixture, so the step is recorded
+    # here instead — but only if a report context has been created.
+    if phase_report.outcome == "skipped" and REPORT_CONTEXT:
+        with REPORT_CONTEXT.new_step(name=item.name) as skipped_step:
+            skipped_step.current_step.update({"status": TestStatus.SKIPPED})
+    setattr(item, f"rep_{phase_report.when}", call)
+
+
+def _report_context_impl(
+    sift_client: SiftClient,
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config | None = None,
+) -> Generator[ReportContext, None, None]:
+    """Yield the session ``ReportContext``; on exit, drain step stacks and reset the global."""
+    global REPORT_CONTEXT
+    args = request.config.invocation_params.args
+    test_path = Path(args[0]) if args else None
+    if test_path is not None and test_path.exists():
+        base_name = test_path.name
+        test_case: Path | str = test_path
+    else:
+        base_name = "pytest " + " ".join(args) if args else "pytest"
+        test_case = base_name
+    # Mode → ReportContext flags:
+    #   online (default): log_file=<temp or user path>, replay_log_file=True
+    #   --sift-offline:   log_file=<temp or user path>, replay_log_file=False
+    #   --sift-disabled:  log_file=False,               replay_log_file=False
+    disabled = sift_client._simulate
+    offline = False if disabled else _is_offline(pytestconfig)
+    log_file: str | Path | bool | None = False if disabled else _resolve_log_file(pytestconfig)
+    git_metadata = _option_or_ini(pytestconfig, _GIT_METADATA)
+    include_git_metadata = True if git_metadata is None else bool(git_metadata)
+    with ReportContext(
+        sift_client,
+        name=f"{base_name} {datetime.now(timezone.utc).isoformat()}",
+        test_case=str(test_case),
+        log_file=log_file,
+        include_git_metadata=include_git_metadata,
+        replay_log_file=not (disabled or offline),
+    ) as context:
+        REPORT_CONTEXT = context
+        try:
+            yield context
+        finally:
+            # Drain INSIDE the ``with`` so parent steps' final updates reach the log
+            # before the import worker drains; else they stay IN_PROGRESS in Sift.
+            try:
+                _drain_parametrize_stack()
+            finally:
+                try:
+                    _drain_hierarchy_stack()
+                finally:
+                    REPORT_CONTEXT = None  # never leave a closed context published
+
+
+_CREDENTIAL_KEYS: tuple[tuple[str, _Option | None], ...] = (
+    ("SIFT_API_KEY", None),  # env-only; never read from ini to keep secrets out of source control.
+    ("SIFT_GRPC_URI", _GRPC_URI),
+    ("SIFT_REST_URI", _REST_URI),
+)
+
+# Placeholder credentials used in --sift-offline mode when env/ini values
+# are missing. Offline mode never makes network calls, so the values are
+# only syntactically required by SiftConnectionConfig.
+_OFFLINE_DEFAULTS = {
+    "SIFT_API_KEY": "offline",
+    "SIFT_GRPC_URI": "offline.invalid:0",
+    "SIFT_REST_URI": "http://offline.invalid",
+}
+
+
+def _build_disabled_client() -> SiftClient:
+    """Return a placeholder ``SiftClient`` flagged ``_simulate=True``.
+
+    Used for ``--sift-disabled`` mode. The connection settings are fixed
+    placeholder values (``disabled`` / ``*.invalid`` hosts), not real
+    credentials.
+    """
+    placeholder_config = SiftConnectionConfig(
+        api_key="disabled",
+        grpc_url="disabled.invalid:0",
+        rest_url="http://disabled.invalid",
+    )
+    simulated = SiftClient(connection_config=placeholder_config)
+    # Marks the client so test-results writes are simulated (see SiftClient).
+    simulated._simulate = True
+    return simulated
+
+
+def _resolve_credential(
+    pytestconfig: pytest.Config | None, env_name: str, opt: _Option | None
+) -> str | None:
+    """Return the non-empty env var ``env_name``, else the ini string for ``opt``, else None."""
+    from_env = os.getenv(env_name)
+    if not from_env and opt is not None and pytestconfig is not None:
+        from_ini = pytestconfig.getini(opt.ini_name)
+        # Only a non-empty string counts as a configured ini value.
+        return from_ini if isinstance(from_ini, str) and from_ini else None
+    # Either the env var is set (return it) or no ini fallback exists.
+    return from_env or None
+
+
+@pytest.fixture(scope="session")
+def sift_client(pytestconfig: pytest.Config) -> SiftClient:
+    """Session-wide ``SiftClient`` built from environment variables and ini keys.
+
+    Resolution per credential: the environment variable wins; the two URIs
+    (``SIFT_GRPC_URI``, ``SIFT_REST_URI``) then fall back to the
+    ``sift_grpc_uri`` / ``sift_rest_uri`` ini keys. ``SIFT_API_KEY`` has no
+    ini fallback so the secret stays out of source control.
+
+    Mode handling:
+
+    * ``--sift-disabled``: resolution is skipped; a placeholder client from
+      ``_build_disabled_client`` is returned.
+    * ``--sift-offline``: resolved values are used where present and
+      anything missing is filled from ``_OFFLINE_DEFAULTS``.
+    * otherwise (online): any missing credential raises
+      ``pytest.UsageError`` listing the missing names.
+
+    Projects needing custom construction (TLS toggles, custom timeouts, etc.)
+    can define their own ``sift_client`` fixture in ``conftest.py``; pytest
+    fixture resolution prefers the local definition.
+    """
+    if _is_disabled(pytestconfig):
+        return _build_disabled_client()
+    resolved: dict[str, str | None] = {}
+    for env_name, ini_opt in _CREDENTIAL_KEYS:
+        resolved[env_name] = _resolve_credential(pytestconfig, env_name, ini_opt)
+    missing = [env_name for env_name in resolved if not resolved[env_name]]
+    if missing and not _is_offline(pytestconfig):
+        raise pytest.UsageError(
+            "Sift credentials missing: "
+            + ", ".join(missing)
+            + ". Set the environment variable(s) — pytest-dotenv loads them "
+            "from a `.env` file automatically — or set the URIs via "
+            "`sift_grpc_uri` / `sift_rest_uri` under `[tool.pytest.ini_options]` "
+            "in pyproject.toml, or override the sift_client fixture in your "
+            "conftest.py, or pass --sift-offline / --sift-disabled to run "
+            "without contacting Sift."
+        )
+    # Only reachable with ``missing`` non-empty in offline mode: use placeholders.
+    resolved.update({env_name: _OFFLINE_DEFAULTS[env_name] for env_name in missing})
+    # Every entry is non-empty at this point, so the ``or ""`` fallbacks are
+    # never taken in practice.
+    config = SiftConnectionConfig(
+        api_key=resolved.get("SIFT_API_KEY") or "",
+        grpc_url=resolved.get("SIFT_GRPC_URI") or "",
+        rest_url=resolved.get("SIFT_REST_URI") or "",
+    )
+    return SiftClient(connection_config=config)
+
+
+@pytest.fixture(scope="session")
+def report_context(
+    request: pytest.FixtureRequest, pytestconfig: pytest.Config
+) -> Generator[ReportContext, None, None]:
+    """Session-scoped Sift ``ReportContext``, created on first use.
+
+    Not autouse: it is built the first time something requests
+    ``report_context`` — in practice the gated ``step``,
+    ``_hierarchy_parents`` and ``_parametrize_parents`` fixtures. When every
+    test in the session is gated off, no ``ReportContext`` is ever created.
+
+    Behavior per mode:
+
+    * ``--sift-disabled``: the context wraps a placeholder client from
+      ``_build_disabled_client`` (``_simulate=True``) instead of the
+      ``sift_client`` fixture; no log file is written or replayed.
+    * ``--sift-offline``: the ``sift_client`` fixture is used but the
+      connectivity check is skipped, and the log file is not replayed at
+      session end.
+    * default (online): the ``client_has_connection`` fixture is resolved
+      first. A ``pytest.UsageError`` from it propagates unchanged; any
+      other exception is converted into a ``pytest.UsageError`` that names
+      the gRPC URI and suggests ``--sift-offline`` / ``--sift-disabled``.
+
+    The log-file destination comes from ``--sift-log-file`` and defaults to
+    a temp file when unset.
+
+    Yields:
+        The open ``ReportContext`` for the session.
+    """
+    disabled = _is_disabled(pytestconfig)
+    # Disabled mode builds its own placeholder client and never resolves the
+    # ``sift_client`` fixture.
+    if disabled:
+        client = _build_disabled_client()
+    else:
+        client = request.getfixturevalue("sift_client")
+    # Only online mode verifies connectivity before the context is created.
+    if not disabled and not _is_offline(pytestconfig):
+        try:
+            request.getfixturevalue("client_has_connection")
+        except pytest.UsageError:
+            raise
+        except Exception as exc:
+            # Best-effort lookup of the gRPC URI for the error message.
+            grpc_client = getattr(client, "grpc_client", None)
+            grpc_url = getattr(getattr(grpc_client, "_config", None), "uri", "<unknown>")
+            raise pytest.UsageError(
+                f"Sift ping failed against {grpc_url}: {exc}. "
+                "Pass --sift-offline to run without contacting Sift, or "
+                "--sift-disabled to skip Sift entirely."
+            ) from exc
+    yield from _report_context_impl(client, request, pytestconfig=pytestconfig)
+
+
+def _step_impl(
+    report_context: ReportContext, request: pytest.FixtureRequest
+) -> Generator[NewStep, None, None]:
+    """Wrap the current test in a Sift step and yield that step.
+
+    The step name is the last frame of the stashed parametrize path when
+    ``sift_parametrize_nesting`` is on and a path exists; otherwise it is the
+    pytest node name (e.g. ``test_a[1]``). After the test body, a failure
+    captured in ``node.rep_call`` is forwarded to the step.
+    """
+    node = request.node
+    # Default to the pytest node name (bracket-mangled for parametrized items).
+    step_name = str(node.name)
+    if _option_or_ini(request.config, _PARAMETRIZE_NESTING):
+        frames = node.stash.get(_PARAMETRIZE_PATH_KEY, ())
+        if frames:
+            step_name = frames[-1]
+    # Docstring lookup is best-effort: ``node.obj`` may be missing (``getattr``
+    # default covers that) or may raise something other than AttributeError.
+    try:
+        description = getattr(getattr(node, "obj", None), "__doc__", None) or None
+    except Exception:
+        description = None
+    with report_context.new_step(
+        name=step_name, description=description, assertion_as_fail_not_error=False
+    ) as active_step:
+        yield active_step
+        # ``rep_call`` is attached by ``pytest_runtest_makereport``.
+        if hasattr(node, "rep_call"):
+            failure = node.rep_call.excinfo
+            if failure:
+                active_step.update_step_from_result(
+                    failure, failure.value, failure.tb
+                )
+
+
+@pytest.fixture(autouse=True)
+def _hierarchy_parents(
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config,
+) -> None:
+    """Open/close hierarchy parent steps (packages, modules, classes) for the current item.
+
+    Same diff-stack pattern as ``_parametrize_parents`` but operates on
+    ``_HIERARCHY_KEY``. The chain is built outer-to-inner from the item's
+    collection-tree ancestors; which node types are included is decided at
+    build time by ``sift_package_step`` / ``sift_module_step`` /
+    ``sift_class_step``. When the chain changes (pop or push), the parametrize
+    stack is drained first since parametrize parents nest INSIDE these.
+
+    Gated off when the item is excluded (avoids eager ``report_context`` setup).
+    """
+    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
+    if not _sift_enabled_for(request.node, default):
+        return None
+    # Fall back to computing the chain on-demand for items that bypassed
+    # ``pytest_collection_modifyitems`` (e.g., dynamically inserted by another
+    # plugin's later hook). Defaulting to ``()`` would incorrectly drain the
+    # entire open hierarchy stack for those items.
+    desired = request.node.stash.get(_HIERARCHY_KEY, _STASH_MISSING)
+    if desired is _STASH_MISSING:
+        desired = _build_hierarchy_chain(request.node, pytestconfig)
+    common = 0
+    # Compare on identity (nodeid) — same-named ancestors at different paths
+    # MUST stay distinct.
+    while (
+        common < len(_HIERARCHY_STACK)
+        and common < len(desired)
+        and _HIERARCHY_STACK[common][0] == desired[common][0]
+    ):
+        common += 1
+    # Any change to the hierarchy chain orphans parametrize parents from the
+    # previous test — drain them before mutating the hierarchy stack so
+    # ReportContext's top-of-stack invariant holds. Strict mode: a per-frame
+    # ``__exit__`` failure here signals a real upstream drift between the
+    # plugin stacks and ReportContext; raise it as a test error instead of a
+    # silenceable warning.
+    if common < len(_HIERARCHY_STACK) or common < len(desired):
+        _drain_parametrize_stack(swallow_errors=False)
+    # Symmetric per-frame guard for the hierarchy pop so one bad ``__exit__``
+    # doesn't leave _HIERARCHY_STACK partially drained for every subsequent test.
+    while len(_HIERARCHY_STACK) > common:
+        _identity, name, ns = _HIERARCHY_STACK.pop()
+        _close_frame(name, ns)
+    if not desired[common:]:
+        return None
+    # Fetch ``report_context`` lazily — but only when there's at least one
+    # rendered frame to push. Pure diff-only frames (e.g. a Package frame when
+    # ``sift_package_step=false``) just update _HIERARCHY_STACK with ns=None.
+    rc = None
+    # Roll back any partial push so a mid-loop exception doesn't leave half
+    # the chain orphaned on the stack. Per-frame guard inside the rollback so
+    # a failing ``__exit__`` doesn't shadow the original exception or leak
+    # the remaining opened frames.
+    opened: list[tuple[str, str, Any]] = []
+    try:
+        for identity, name, doc, rendered in desired[common:]:
+            if rendered:
+                if rc is None:
+                    rc = request.getfixturevalue("report_context")
+                ns = rc.new_step(name=name, description=doc, assertion_as_fail_not_error=False)
+                ns.__enter__()
+                opened.append((identity, name, ns))
+            else:
+                opened.append((identity, name, None))
+    except BaseException:
+        while opened:
+            _identity, name, ns = opened.pop()
+            _close_frame(name, ns)
+        raise
+    _HIERARCHY_STACK.extend(opened)
+    return None
+
+
+@pytest.fixture(autouse=True)
+def _parametrize_parents(
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config,
+    _hierarchy_parents: None,
+) -> None:
+    """Open/close shared parametrize parent steps for the current item.
+
+    Diffs the item's desired parametrize path against the open stack: pops the
+    stale tail, then opens new parents (everything except the innermost frame —
+    the ``step`` fixture creates that as the leaf). Parents persist across
+    sibling items so a tree like ``test_x[a=1]`` / ``test_x[a=2]`` shares one
+    ``test_x`` container.
+
+    Gated off when the current item is excluded so that excluded items don't
+    eagerly request ``report_context`` (which would defeat its lazy creation),
+    or when ``sift_parametrize_nesting=false``. Parents persist until the
+    diff against a subsequent test's chain pops them, or until
+    ``pytest_sessionfinish`` drains anything left at session end.
+    """
+    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
+    if not _sift_enabled_for(request.node, default):
+        return None
+    if not _option_or_ini(pytestconfig, _PARAMETRIZE_NESTING):
+        return None
+    # Fall back to on-demand computation for dynamically-inserted items;
+    # see _hierarchy_parents for the same rationale.
+    desired = request.node.stash.get(_PARAMETRIZE_PATH_KEY, _STASH_MISSING)
+    if desired is _STASH_MISSING:
+        desired = _build_parametrize_path(request.node)
+    parents = desired[:-1]
+    common = 0
+    while (
+        common < len(_PARAMETRIZE_STACK)
+        and common < len(parents)
+        and _PARAMETRIZE_STACK[common][0] == parents[common]
+    ):
+        common += 1
+    # Per-frame guard so one bad ``__exit__`` doesn't leave _PARAMETRIZE_STACK
+    # partially drained for every subsequent test.
+    while len(_PARAMETRIZE_STACK) > common:
+        name, ns = _PARAMETRIZE_STACK.pop()
+        _close_frame(name, ns)
+    if not parents[common:]:
+        return None
+    rc = request.getfixturevalue("report_context")
+    opened: list[tuple[str, Any]] = []
+    try:
+        for display in parents[common:]:
+            ns = rc.new_step(name=display, assertion_as_fail_not_error=False)
+            ns.__enter__()
+            opened.append((display, ns))
+    except BaseException:
+        while opened:
+            name, ns = opened.pop()
+            _close_frame(name, ns)
+        raise
+    _PARAMETRIZE_STACK.extend(opened)
+    return None
+
+
+@pytest.fixture(autouse=True)
+def step(
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config,
+    _parametrize_parents: None,
+) -> Generator[NewStep | None, None, None]:
+    """Create an outer step for the function when the Sift gate is on.
+
+    Resolves the gate via `_sift_enabled_for(request.node, ini_default)`:
+    `sift_exclude` marker forces off, `sift_include` forces on, otherwise the
+    `sift_autouse` ini default applies. When on, requests the
+    session `report_context` lazily — the first gated test in the session
+    triggers its creation, subsequent gated tests reuse it. In
+    ``--sift-disabled`` mode the report context is backed by a
+    ``SiftClient(_simulate=True)`` placeholder, so every write returns a
+    synthesized response without contacting Sift.
+    """
+    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
+    if not _sift_enabled_for(request.node, default):
+        yield None
+        return
+    rc = request.getfixturevalue("report_context")
+    yield from _step_impl(rc, request)
+
+
+@pytest.fixture(scope="session")
+def client_has_connection(pytestconfig: pytest.Config, request: pytest.FixtureRequest) -> bool:
+    """Verify the ``SiftClient`` can reach Sift via ``/ping``.
+
+    Consulted at session start by ``report_context`` in online mode. A failed
+    ping raises through ``report_context`` and aborts the session with
+    ``pytest.UsageError``. Override this fixture in your conftest to use a
+    different reachability signal (e.g. a cached auth token) for environments
+    where pinging is the wrong check. Returns ``False`` in ``--sift-disabled``
+    mode without constructing a client.
+    """
+    if _is_disabled(pytestconfig):
+        return False
+    sift_client = request.getfixturevalue("sift_client")
+    sift_client.ping.ping()
+    return True
diff --git a/python/lib/sift_client/resources/test_results.py b/python/lib/sift_client/resources/test_results.py
index 22e984b5e..9e88b6081 100644
--- a/python/lib/sift_client/resources/test_results.py
+++ b/python/lib/sift_client/resources/test_results.py
@@ -96,6 +96,7 @@ async def create(
         created_report = await self._low_level_client.create_test_report(
             test_report=test_report,
             log_file=log_file,
+            simulate=self.client._simulate,
         )
         return self._finalize(created_report, log_file)
 
@@ -271,7 +272,7 @@ async def update(
         update.resource_id = test_report_id
         existing = test_report if isinstance(test_report, TestReport) else None
         updated_test_report = await self._low_level_client.update_test_report(
-            update, log_file=log_file, existing=existing
+            update, log_file=log_file, existing=existing, simulate=self.client._simulate
         )
         return self._finalize(updated_test_report, log_file)
 
@@ -319,7 +320,7 @@ async def create_step(
         if isinstance(test_step, dict):
             test_step = TestStepCreate.model_validate(test_step)
         test_step_result = await self._low_level_client.create_test_step(
-            test_step, log_file=log_file
+            test_step, log_file=log_file, simulate=self.client._simulate
         )
         return self._finalize(test_step_result, log_file)
 
@@ -450,7 +451,7 @@ async def update_step(
         update.resource_id = test_step_id
         existing = test_step if isinstance(test_step, TestStep) else None
         updated_test_step = await self._low_level_client.update_test_step(
-            update, log_file=log_file, existing=existing
+            update, log_file=log_file, existing=existing, simulate=self.client._simulate
         )
         return self._finalize(updated_test_step, log_file)
 
@@ -484,10 +485,10 @@ async def create_measurement(
         if isinstance(test_measurement, dict):
             test_measurement = TestMeasurementCreate.model_validate(test_measurement)
         test_measurement_result = await self._low_level_client.create_test_measurement(
-            test_measurement, log_file=log_file
+            test_measurement, log_file=log_file, simulate=self.client._simulate
         )
         measurement = self._finalize(test_measurement_result, log_file)
-        if update_step and log_file is None:
+        if update_step and log_file is None and not self.client._simulate:
             step = await self.get_step(test_step=test_measurement_result.test_step_id)
             if step.status == TestStatus.PASSED and not measurement.passed:
                 await self.update_step(test_step=step, update={"status": TestStatus.FAILED})
@@ -508,7 +509,7 @@ async def create_measurements(
             A tuple of (measurements_created_count, measurement_ids).
         """
         return await self._low_level_client.create_test_measurements(
-            test_measurements, log_file=log_file
+            test_measurements, log_file=log_file, simulate=self.client._simulate
         )
 
     async def list_measurements(
@@ -621,10 +622,16 @@ async def update_measurement(
 
         update.resource_id = test_measurement.id_
         updated_test_measurement = await self._low_level_client.update_test_measurement(
-            update, log_file=log_file, existing=test_measurement
+            update, log_file=log_file, existing=test_measurement, simulate=self.client._simulate
         )
         updated_test_measurement = self._finalize(updated_test_measurement, log_file)
-        if update_step and log_file is None and update.passed is not None and not update.passed:
+        if (
+            update_step
+            and log_file is None
+            and not self.client._simulate
+            and update.passed is not None
+            and not update.passed
+        ):
             step = await self.get_step(test_step=updated_test_measurement.test_step_id)
             if step.status == TestStatus.PASSED:
                 await self.update_step(test_step=step, update={"status": TestStatus.FAILED})
diff --git a/python/lib/sift_client/sift_types/_mixins/simulated.py b/python/lib/sift_client/sift_types/_mixins/simulated.py
new file mode 100644
index 000000000..bdc2c572a
--- /dev/null
+++ b/python/lib/sift_client/sift_types/_mixins/simulated.py
@@ -0,0 +1,32 @@
+"""Mixin that exposes ``is_simulated`` on test-results entity types."""
+
+from __future__ import annotations
+
+
+class SimulatedMixin:
+    """Mixin for sift_types whose response can be produced by the simulate path.
+
+    The low-level wrapper stamps ``_simulated=True`` on entities it returns from
+    a simulated branch (see ``TestResultsLowLevelClient._mark_simulated``). This
+    mixin exposes that flag as a read-only ``is_simulated`` property so
+    consumers and tests can detect when an instance was synthesized rather than
+    round-tripped through Sift.
+
+    Inheriting classes are expected to declare a private field
+    ``_simulated: bool = False`` so pydantic tracks the default correctly.
+    """
+
+    _simulated: bool
+
+    @property
+    def is_simulated(self) -> bool:
+        """True when this instance was returned from the simulate path.
+
+        Set by the low-level wrapper when the call short-circuited to a
+        synthesized response (either ``SiftClient._simulate`` mode or per-call
+        ``log_file`` / ``simulate=True``). False for entities returned from a
+        normal online call or constructed manually outside the SDK. Offline
+        mode also reports True since responses are synthesized prior to
+        replay.
+        """
+        return self._simulated
diff --git a/python/lib/sift_client/sift_types/test_report.py b/python/lib/sift_client/sift_types/test_report.py
index ecc24f52f..c4abfc548 100644
--- a/python/lib/sift_client/sift_types/test_report.py
+++ b/python/lib/sift_client/sift_types/test_report.py
@@ -36,6 +36,7 @@
     ModelUpdate,
 )
 from sift_client.sift_types._mixins.file_attachments import FileAttachmentsMixin
+from sift_client.sift_types._mixins.simulated import SimulatedMixin
 from sift_client.sift_types.channel import Channel
 from sift_client.util.metadata import metadata_dict_to_proto, metadata_proto_to_dict
 
@@ -153,7 +154,7 @@ def to_proto(self) -> TestStepProto:
         return proto
 
 
-class TestStep(BaseType[TestStepProto, "TestStep"], FileAttachmentsMixin):
+class TestStep(BaseType[TestStepProto, "TestStep"], FileAttachmentsMixin, SimulatedMixin):
     """TestStep model representing a step in a test."""
 
     test_report_id: str
@@ -169,6 +170,8 @@ class TestStep(BaseType[TestStepProto, "TestStep"], FileAttachmentsMixin):
     metadata: dict[str, str | float | bool] | None = None
     # Set by the resource layer when this instance was produced from a logging-mode call
     _log_file: str | Path | None = None
+    # Set by the low-level wrapper when this instance came from the simulate path
+    _simulated: bool = False
 
     @classmethod
     def _from_proto(cls, proto: TestStepProto, sift_client: SiftClient | None = None) -> TestStep:
@@ -383,7 +386,7 @@ def to_proto(self) -> TestMeasurementProto:
         return proto
 
 
-class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"]):
+class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"], SimulatedMixin):
     """TestMeasurement model representing a measurement in a test."""
 
     measurement_type: TestMeasurementType
@@ -404,6 +407,8 @@ class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"]):
 
     # Set by the resource layer when this instance was produced from a logging-mode call
     _log_file: str | Path | None = None
+    # Set by the low-level wrapper when this instance came from the simulate path
+    _simulated: bool = False
 
     @classmethod
     def _from_proto(
@@ -599,7 +604,7 @@ def _to_proto(self) -> ErrorInfoProto:
         )
 
 
-class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin):
+class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin, SimulatedMixin):
     """TestReport model representing a test report."""
 
     status: TestStatus
@@ -617,6 +622,8 @@ class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin):
     is_archived: bool
     # Set by the resource layer when this instance was produced from a logging-mode call
     _log_file: str | Path | None = None
+    # Set by the low-level wrapper when this instance came from the simulate path
+    _simulated: bool = False
 
     @classmethod
     def _from_proto(
diff --git a/python/lib/sift_client/util/test_results/__init__.py b/python/lib/sift_client/util/test_results/__init__.py
index e7a82866c..a3ac081bc 100644
--- a/python/lib/sift_client/util/test_results/__init__.py
+++ b/python/lib/sift_client/util/test_results/__init__.py
@@ -49,78 +49,105 @@ def main(self):
     cleanup()
 ```
 
-## Pytest Fixtures
+## Pytest Plugin
 
-The report context and steps can also be accessed in pytest by importing the `report_context` and `step` fixtures.
+The pytest plugin lives at `sift_client.pytest_plugin`. Opt in
+from your `conftest.py`:
 
-### How to use:
-- These fixtures are set to autouse and will automatically create a report and steps for each test function.
-  - If you want each module(file) to be marked as a step w/ each test as a substep, import the `module_substep` fixture as well.
-- The `report_context` fixture requires a fixture `sift_client` returning an `SiftClient` instance to be passed in.
-
-Note: FedRAMP users: report_context will log test results to a temp file to avoid API calls during test execution. If this is a shared environment, you can disable logging by passing ``--sift-test-results-log-file=false``.
-
-#### Configuration
-
-Import the `pytest_addoption` function to add configuration options for Test Results to the commandline or add the options to your pyproject.toml file (https://docs.pytest.org/en/stable/reference/customize.html#configuration). If ommitted, will use the default values described below.
-
-- Git metadata: Include git metadata (repo, branch, commit) in the test results. Default is True. You can disable it by passing `--no-sift-test-results-git-metadata`.
-- Log file: Write test results to a file. This happens automatically but you can configure specify a specific log file by passing `--sift-test-results-log-file=<path>` or disable logging by passing `--sift-test-results-log-file=false`.
-- Check connection: Pass `--sift-test-results-check-connection` (off by default) to make the `report_context`, `step`, and `module_substep` fixtures no-op when the Sift client has no connection to the server. Requires a `client_has_connection` fixture to be available.
+```python
+# conftest.py
+pytest_plugins = ["sift_client.pytest_plugin"]
+```
 
-###### Example at top of your test file or in your conftest.py file:
+By default, every test in the session produces a Sift report: one
+`TestReport` per session, one step per test function (`step`), and one
+parent step per Python package (directory with `__init__.py`), test file,
+and test class
+above it. Individual layers can be flattened via the `sift_package_step`,
+`sift_module_step`, `sift_class_step`, and `sift_parametrize_nesting` ini
+flags. The plugin also registers a default `sift_client` fixture that reads
+`SIFT_API_KEY`, `SIFT_GRPC_URI`, and `SIFT_REST_URI` from the environment.
+Override it by defining your own `sift_client` fixture in your conftest.
+
+Note for FedRAMP users: results are buffered to a temp file and uploaded by a
+subprocess at session end (no API calls during the run). Disable the buffer
+entirely with `--sift-log-file=false` for inline uploads.
+
+### Controlling which tests produce reports
+
+The autouse fixtures fire for every test by default. To narrow that:
+
+- Set `sift_autouse = false` in `pyproject.toml` to flip the
+  project default off, then opt tests back in below.
+- `@pytest.mark.sift_include` forces reporting on for a test, class, or
+  module. `@pytest.mark.sift_exclude` forces it off. Closest marker wins.
+  `sift_exclude` beats `sift_include` when both apply.
+- `pytestmark` at the class or module level inherits to every test in scope.
+- For a whole directory, apply the marker in bulk from that directory's
+  `conftest.py`:
 
 ```python
-import pytest
+# tests/integration/conftest.py
+from pathlib import Path
 
-@pytest.fixture(scope="session")
-def sift_client() -> SiftClient:
-    grpc_url = os.getenv("SIFT_GRPC_URI", "localhost:50051")
-    rest_url = os.getenv("SIFT_REST_URI", "localhost:8080")
-    api_key = os.getenv("SIFT_API_KEY", "")
+import pytest
 
-    client = SiftClient(api_key=api_key, grpc_url=grpc_url, rest_url=rest_url)
+_HERE = Path(__file__).parent
 
-    return client
 
-from sift_client.util.test_results import *
+def pytest_collection_modifyitems(config, items):
+    for item in items:
+        try:
+            item.path.relative_to(_HERE)
+        except ValueError:
+            continue
+        item.add_marker(pytest.mark.sift_include)
 ```
 
-###### Then in your test file:
+#### Configuration
 
-```python
-# Because step was already imported and set autouse=True, this test will automatically get a step created for it.
-def test_no_includes():
-    assert condition, "Example failure"
-
-# Passing the fixtures to the test function allows you to take measurements or create substeps.
-def test_example(report_context, step):
-    # This will add a measurement to the current step for this function
-    step.measure(name="Example Measurement", value=test_string_value, bounds="expected_string_value")
-
-    with report_context.new_step(name="Example Step") as substep:
-        example_measurement = tlm.read(channel_name)
-        substep.measure(name="Substep Measurement", value=example_measurement, bounds=(min=74.9, max=75.1))
+CLI options registered by the plugin:
+
+- `--sift-offline`: Run without contacting Sift. All create/update calls are
+  written to the JSONL log file for later replay via `import-test-result-log`.
+  No session-start ping is attempted.
+- `--sift-disabled`: Skip Sift entirely. Nothing contacts the API and no
+  log file is written. `step.measure(...)` still evaluates bounds and
+  returns a real pass/fail boolean. Returned entities expose
+  `is_simulated == True`. The flag is also honored via the
+  `SIFT_DISABLED` env var and supersedes every other flag.
+- `--sift-log-file`: Path to write the JSONL log file. `true`
+  (default) auto-creates a temp file. `false` or `none` disables logging.
+  Any other value is treated as a file path.
+- `--no-sift-git-metadata`: Exclude git metadata (repo, branch,
+  commit) from the test report. Included by default.
+
+Each option has a matching ini key for per-project configuration under
+``[tool.pytest.ini_options]`` in ``pyproject.toml`` (or ``[pytest]`` in
+``pytest.ini``). CLI flags override ini values. The
+``sift_autouse`` ini key (bool, default ``true``) sets the
+project-wide default for the gate described above. The default
+``sift_client`` fixture reads ``sift_grpc_uri`` and ``sift_rest_uri`` as
+fallbacks when the corresponding env vars are unset (env vars win when
+both are set). ``SIFT_API_KEY`` is env-only. Load it from a ``.env`` file
+via the ``pytest-dotenv`` plugin or inject it via your CI secret manager.
+
+```toml
+[tool.pytest.ini_options]
+sift_autouse = false
+sift_offline = true
+sift_git_metadata = false
+sift_grpc_uri = "your-org.sift.example:443"
+sift_rest_uri = "https://your-org.sift.example"
 ```
+
+To disable the plugin for a single run:
+`pytest -p no:sift_client.pytest_plugin`.
 """
 
 from .context_manager import NewStep, ReportContext
-from .pytest_util import (
-    client_has_connection,
-    module_substep,
-    pytest_addoption,
-    pytest_runtest_makereport,
-    report_context,
-    step,
-)
 
 __all__ = [
     "NewStep",
     "ReportContext",
-    "client_has_connection",
-    "module_substep",
-    "pytest_addoption",
-    "pytest_runtest_makereport",
-    "report_context",
-    "step",
 ]
diff --git a/python/lib/sift_client/util/test_results/bounds.py b/python/lib/sift_client/util/test_results/bounds.py
index ef5c67ce5..b734cc126 100644
--- a/python/lib/sift_client/util/test_results/bounds.py
+++ b/python/lib/sift_client/util/test_results/bounds.py
@@ -1,5 +1,10 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
 from sift_client.sift_types.test_report import (
     NumericBounds,
     TestMeasurement,
@@ -8,6 +13,55 @@
     TestMeasurementUpdate,
 )
 
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+
+def to_numpy_array(
+    values: list[float | int] | NDArray[np.float64] | pd.Series,
+) -> NDArray[np.float64]:
+    """Normalize a list / ndarray / pandas Series into a numpy array.
+
+    Shared by ``measure_avg`` and ``measure_all`` on ``NewStep`` so the
+    accepted input types stay in sync across measurement variants.
+    """
+    if isinstance(values, list):
+        return np.array(values)
+    if isinstance(values, np.ndarray):
+        return values
+    if isinstance(values, pd.Series):
+        return values.to_numpy()
+    raise ValueError(f"Invalid value type: {type(values)}")
+
+
+def out_of_bounds_mask(
+    arr: NDArray[np.float64],
+    bounds: dict[str, float] | NumericBounds,
+) -> NDArray[np.bool_]:
+    """Return a boolean mask selecting elements of ``arr`` that violate ``bounds``.
+
+    Raises ``ValueError`` when ``bounds`` has neither ``min`` nor ``max`` set.
+    """
+    if isinstance(bounds, dict):
+        bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))
+    mask: NDArray[np.bool_] | None = None
+    if bounds.min is not None:
+        mask = arr < bounds.min
+    if bounds.max is not None:
+        above = arr > bounds.max
+        mask = mask | above if mask is not None else above
+    if mask is None:
+        raise ValueError("No bounds provided")
+    return mask
+
+
+def all_within_bounds(
+    arr: NDArray[np.float64],
+    bounds: dict[str, float] | NumericBounds,
+) -> bool:
+    """Return True when every element of ``arr`` is within ``bounds``."""
+    return bool(arr[out_of_bounds_mask(arr, bounds)].size == 0)
+
 
 def assign_value_to_measurement(
     measurement: TestMeasurement | TestMeasurementCreate | TestMeasurementUpdate,
@@ -32,6 +86,38 @@ def assign_value_to_measurement(
         raise ValueError(f"Invalid value type: {type(value)}")
 
 
+def value_passes_bounds(
+    value: float | str | bool,
+    bounds: dict[str, float] | NumericBounds | str | bool | None,
+) -> bool:
+    """Return whether ``value`` satisfies ``bounds`` (``None`` passes); records nothing."""
+    if bounds is None:
+        return True
+    if isinstance(bounds, dict):
+        bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))
+    if isinstance(bounds, bool):
+        if isinstance(value, str):
+            return str(value).lower() == str(bounds).lower()
+        return bool(value) == bounds
+    if isinstance(bounds, str):
+        if not (isinstance(value, str) or isinstance(value, bool)):
+            raise ValueError("Value must be a string if bounds provided is a string")
+        if isinstance(value, bool):
+            return str(value).lower() == str(bounds).lower()
+        return value == bounds
+    # NumericBounds: negated inclusive checks so NaN (every comparison False) fails.
+    try:
+        if bounds.min is not None and not (bounds.min <= value):  # type: ignore[operator]
+            return False
+        if bounds.max is not None and not (bounds.max >= value):  # type: ignore[operator]
+            return False
+    except TypeError:
+        raise TypeError(
+            f"Value must be a float or int to evaluate numeric bounds but gave {type(value)}"
+        ) from None
+    return True
+
+
 def evaluate_measurement_bounds(
     measurement: TestMeasurement | TestMeasurementCreate | TestMeasurementUpdate,
     value: float | str | bool,
@@ -53,31 +139,10 @@ def evaluate_measurement_bounds(
 
     if isinstance(bounds, dict):
         bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))
-    if isinstance(bounds, bool):
-        if isinstance(value, str):
-            measurement.passed = str(value).lower() == str(bounds).lower()
-        else:
-            measurement.passed = bool(value) == bounds
-        return bool(measurement.passed)
-    elif isinstance(bounds, str):
-        if not (isinstance(value, str) or isinstance(value, bool)):
-            raise ValueError("Value must be a string if bounds provided is a string")
+    if isinstance(bounds, str) and not isinstance(bounds, bool):
         measurement.string_expected_value = bounds
-        if isinstance(value, bool):
-            measurement.passed = str(value).lower() == str(bounds).lower()
-        else:
-            measurement.passed = value == bounds
     elif isinstance(bounds, NumericBounds):
         measurement.numeric_bounds = bounds
-        measurement.passed = True
-        try:
-            if measurement.numeric_bounds.min is not None:
-                measurement.passed = measurement.passed and measurement.numeric_bounds.min <= value  # type: ignore
-            if measurement.numeric_bounds.max is not None:
-                measurement.passed = measurement.passed and measurement.numeric_bounds.max >= value  # type: ignore
-        except TypeError:
-            raise TypeError(
-                f"Value must be a float or int to evaluate numeric bounds but gave {type(value)}"
-            ) from None
 
+    measurement.passed = value_passes_bounds(value, bounds)
     return bool(measurement.passed)
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 354f8564d..bd2ec917f 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -7,14 +7,15 @@
 import subprocess
 import tempfile
 import traceback
+import warnings
 from contextlib import AbstractContextManager, contextmanager
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 import numpy as np
-import pandas as pd
 
+from sift_client.errors import SiftWarning
 from sift_client.sift_types.test_report import (
     ErrorInfo,
     NumericBounds,
@@ -28,9 +29,12 @@
 )
 from sift_client.util.test_results.bounds import (
     evaluate_measurement_bounds,
+    out_of_bounds_mask,
+    to_numpy_array,
 )
 
 if TYPE_CHECKING:
+    import pandas as pd
     from numpy.typing import NDArray
 
     from sift_client.client import SiftClient
@@ -40,15 +44,19 @@
 
 
 def log_replay_instructions(log_file: str | Path | None) -> None:
-    """Log instructions for manually replaying a test result log file.
+    """Surface replay instructions when an import/replay attempt fails.
 
-    Used when an import/replay attempt fails so the user can retry against the same file.
+    Emitted as a ``SiftWarning`` (not a logger.error) so pytest and other
+    runners surface it in their warning summary; logger.error is suppressed
+    by default in most CLI tools.
     """
     if log_file is None:
         return
-    logger.error(
-        f"Error replaying log file: {log_file}.\n"
-        f"  Can replay with `replay-test-result-log {log_file}`."
+    warnings.warn(
+        f"Sift log file was not fully replayed: {log_file}. "
+        f"Re-run with `import-test-result-log {log_file}` to complete the upload.",
+        SiftWarning,
+        stacklevel=2,
     )
 
 
@@ -108,6 +116,11 @@ class ReportContext(AbstractContextManager):
     open_step_results: dict[str, bool]
     any_failures: bool
     _import_proc: subprocess.Popen | None = None
+    # Seconds to wait for the import worker subprocess to finish uploading
+    # the JSONL backlog at session end before killing it. Tests substitute
+    # a smaller value (via ``_make_context`` patching) so they don't wait
+    # the full window for the timeout branch to trigger.
+    _import_proc_timeout: float = 30.0
 
     def __init__(
         self,
@@ -118,6 +131,7 @@ def __init__(
         test_case: str | None = None,
         log_file: str | Path | bool | None = None,
         include_git_metadata: bool = False,
+        replay_log_file: bool = True,
     ):
         """Initialize a new report context.
 
@@ -128,10 +142,18 @@ def __init__(
             system_operator: The operator of the test system. Will default to the current user if not provided.
             test_case: The name of the test case. Will default to the basename of the file containing the test if not provided.
             log_file: If True, create a temp log file. If a path, use that path.
-                All create/update operations will be logged to this file.
+                If False/None, no log file is written and create/update
+                operations call the API directly.
             include_git_metadata: If True, include git metadata in the report.
+            replay_log_file: When True (the default) and ``log_file`` is set,
+                spawn ``import-test-result-log --incremental`` to push log
+                entries to Sift in the background during the session. When
+                False, the log file is just a record and no worker is spawned.
+                Replay happens later via ``import-test-result-log <path>``.
+                Has no effect when ``log_file`` is None.
         """
         self.client = client
+        self.replay_log_file = replay_log_file
         self.step_is_open = False
         self.step_stack = []
         self.step_number_at_depth = {}
@@ -163,28 +185,41 @@ def __init__(
         )
         self.report = client.test_results.create(create, log_file=self.log_file)
 
+    def _build_replay_command(self) -> list[str]:
+        """Build the argv for the import-test-result-log replay subprocess.
+
+        Factored out for testability — tests substitute commands that exit
+        with controlled returncodes / stderr to exercise the ``__exit__``
+        branches without depending on the real replay binary.
+        """
+        return [
+            "import-test-result-log",
+            "--incremental",
+            str(self.log_file),
+            "--grpc-url",
+            self.client.grpc_client._config.uri,
+            "--rest-url",
+            self.client.rest_client._config.base_url,
+            "--api-key",
+            self.client.grpc_client._config.api_key,
+        ]
+
     def _open_import_proc(self):
-        """Open a subprocess to import the log file."""
+        """Open a subprocess to import the log file.
+
+        ``stderr`` is captured so a worker crash mid-session can surface its
+        error at session end via ``__exit__`` rather than failing silently.
+        """
         with _quiet_fork_stderr():
             self._import_proc = subprocess.Popen(
-                [
-                    "import-test-result-log",
-                    "--incremental",
-                    str(self.log_file),
-                    "--grpc-url",
-                    self.client.grpc_client._config.uri,
-                    "--rest-url",
-                    self.client.rest_client._config.base_url,
-                    "--api-key",
-                    self.client.grpc_client._config.api_key,
-                ],
+                self._build_replay_command(),
                 stdin=subprocess.PIPE,
                 stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
+                stderr=subprocess.PIPE,
             )
 
     def __enter__(self):
-        if self.log_file:
+        if self.log_file and self.replay_log_file:
             self._open_import_proc()
         return self
 
@@ -199,17 +234,58 @@ def __exit__(self, exc_type, exc_value, traceback):
         self.report.update(update)
 
         if self._import_proc is not None:
+            # Three outcomes for the replay worker at session end. None of
+            # them fail the session — tests already ran and their outcome
+            # is independent of delivery. The local log file is the source
+            # of recovery for both failure modes via
+            # `import-test-result-log <path>`:
+            #   1. Exits cleanly (returncode 0). Silent.
+            #   2. Still running after the grace window (TimeoutExpired).
+            #      Healthy worker with a large backlog; kill and surface
+            #      replay instructions. The default 30 s window is enough for
+            #      a normal test suite to drain; pathological backlogs should opt
+            #      into inline mode (`--sift-log-file=false`) instead.
+            #   3. Exited with non-zero. Connection failures and API call
+            #      errors land here — the worker's replay loop has no retry,
+            #      so the first failed RPC crashes the subprocess. Surface
+            #      the captured stderr with replay instructions.
             try:
-                self._import_proc.communicate(timeout=1)
+                _, stderr_bytes = self._import_proc.communicate(timeout=self._import_proc_timeout)
             except subprocess.TimeoutExpired:
-                logger.error("Import process did not exit in 10s, killing it")
                 self._import_proc.kill()
                 self._import_proc.wait()
+                warnings.warn(
+                    f"Sift import worker did not exit in "
+                    f"{self._import_proc_timeout}s; killing it. "
+                    "Local log file is preserved for manual replay.",
+                    SiftWarning,
+                    stacklevel=2,
+                )
+                log_replay_instructions(self.log_file)
+                return True  # Ensures the session is marked as passed in pytest
+            if self._import_proc.returncode != 0:
+                stderr_text = (
+                    stderr_bytes.decode("utf-8", errors="replace").strip() if stderr_bytes else ""
+                )
+                warnings.warn(
+                    f"Sift import worker exited with code "
+                    f"{self._import_proc.returncode}. stderr: {stderr_text or '<empty>'}",
+                    SiftWarning,
+                    stacklevel=2,
+                )
                 log_replay_instructions(self.log_file)
-                raise
 
         return True
 
+    @property
+    def is_simulated(self) -> bool:
+        """True when this context's report came from the simulate path.
+
+        Delegates to ``self.report.is_simulated``; see ``TestReport.is_simulated``
+        for the full semantics.
+        """
+        return self.report.is_simulated
+
     def new_step(
         self,
         name: str,
@@ -353,6 +429,11 @@ def __init__(
         self.client = report_context.client
         self.current_step = self.report_context.create_step(name, description, metadata=metadata)
         self.assertion_as_fail_not_error = assertion_as_fail_not_error
+        # Per-step measurement-failure count for ``measurements_passed``.
+        # Tracks only direct ``measure*`` calls on this NewStep instance;
+        # substep / ``report_outcome`` failures are intentionally not folded
+        # in here (see ``measurements_passed`` vs ``passed``).
+        self._failed_measurement_count = 0
 
     def __enter__(self):
         """Enter the context manager to create a new step.
@@ -361,6 +442,20 @@ def __enter__(self):
         """
         return self
 
+    @property
+    def measurements_passed(self) -> bool:
+        """True if every measurement recorded directly on this step has passed.
+
+        Counts only ``step.measure``, ``step.measure_avg``, and
+        ``step.measure_all`` calls on this ``NewStep`` instance. Useful for
+        the ``assert step.measurements_passed`` pattern at the end of a test
+        when you want to fail pytest on any out-of-bounds measurement
+        without short-circuiting on the first failure (asserting on
+        individual ``measure(...)`` return values skips every measurement
+        after the failing one).
+        """
+        return self._failed_measurement_count == 0
+
     def update_step_from_result(
         self,
         exc: type[Exception] | None,
@@ -473,6 +568,8 @@ def measure(
             create, log_file=self.report_context.log_file
         )
         self.report_context.record_step_outcome(measurement.passed, self.current_step)
+        if not measurement.passed:
+            self._failed_measurement_count += 1
 
         return measurement.passed
 
@@ -505,15 +602,7 @@ def measure_avg(
         returns: The true if the average of the values is within the bounds, false otherwise.
         """
         timestamp = timestamp if timestamp else datetime.now(timezone.utc)
-        np_array = None
-        if isinstance(values, list):
-            np_array = np.array(values)
-        elif isinstance(values, np.ndarray):
-            np_array = values
-        elif isinstance(values, pd.Series):
-            np_array = values.to_numpy()
-        else:
-            raise ValueError(f"Invalid value type: {type(values)}")
+        np_array = to_numpy_array(values)
         avg = float(np.mean(np_array))
         result = self.measure(
             name=name,
@@ -561,31 +650,8 @@ def measure_all(
         returns: The true if all values are within the bounds, false otherwise.
         """
         timestamp = timestamp if timestamp else datetime.now(timezone.utc)
-        np_array = None
-        if isinstance(values, list):
-            np_array = np.array(values)
-        elif isinstance(values, np.ndarray):
-            np_array = values
-        elif isinstance(values, pd.Series):
-            np_array = values.to_numpy()
-        else:
-            raise ValueError(f"Invalid value type: {type(values)}")
-
-        numeric_bounds = bounds
-        if isinstance(numeric_bounds, dict):
-            numeric_bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))  # type: ignore
-
-        # Construct a mask of the values that are outside the bounds.
-        mask = None
-        if numeric_bounds.min is not None:
-            mask = np_array < numeric_bounds.min
-        if numeric_bounds.max is not None:
-            val_above_max = np_array > numeric_bounds.max
-            mask = mask | val_above_max if mask is not None else val_above_max
-        if mask is None:
-            raise ValueError("No bounds provided")
-
-        rows_outside_bounds = np_array[mask]
+        np_array = to_numpy_array(values)
+        rows_outside_bounds = np_array[out_of_bounds_mask(np_array, bounds)]
         for row in rows_outside_bounds:
             self.measure(
                 name=name,
diff --git a/python/lib/sift_client/util/test_results/pytest_util.py b/python/lib/sift_client/util/test_results/pytest_util.py
deleted file mode 100644
index a96a47fb3..000000000
--- a/python/lib/sift_client/util/test_results/pytest_util.py
+++ /dev/null
@@ -1,206 +0,0 @@
-from __future__ import annotations
-
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator
-
-import pytest
-
-from sift_client.sift_types.test_report import TestStatus
-from sift_client.util.test_results import ReportContext
-
-if TYPE_CHECKING:
-    from sift_client.client import SiftClient
-    from sift_client.util.test_results.context_manager import NewStep
-
-REPORT_CONTEXT: ReportContext | None = None
-
-
-def pytest_addoption(parser: pytest.Parser) -> None:
-    """Register Sift-specific command-line options."""
-    parser.addoption(
-        "--sift-test-results-log-file",
-        default=None,
-        help="Path to write the Sift test result log file. "
-        "Use 'true' (default) to auto-create a temp file, "
-        "False, 'false', or 'none' to disable logging, "
-        "or a file path to write to a specific location.",
-    )
-    parser.addoption(
-        "--no-sift-test-results-git-metadata",
-        action="store_false",
-        dest="sift_test_results_git_metadata",
-        default=True,
-        help="Exclude git metadata from the Sift test results. "
-        "Git metadata (repo, branch, commit) is included by default.",
-    )
-    parser.addoption(
-        "--sift-test-results-check-connection",
-        action="store_true",
-        default=False,
-        help="Skip the sift test-result fixtures (report_context, step, module_substep) "
-        "when the Sift client has no connection to the server. Requires a "
-        "`client_has_connection` fixture to be available in the test session.",
-    )
-
-
-def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool | None:
-    """Determine log_file value from --sift-test-results-log-file option."""
-    raw = None
-    if pytestconfig is not None:
-        raw = pytestconfig.getoption("--sift-test-results-log-file", default=None)
-    if raw is None:
-        return True
-    lower = str(raw).lower()
-    if lower in ("true", "1"):
-        return True
-    if lower in ("false", "none"):
-        return None
-    return Path(raw)
-
-
-@pytest.hookimpl(tryfirst=True, hookwrapper=True)
-def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
-    """You should import this hook to capture any AssertionErrors that occur during the test. If not included, any assert failures in a test will not automatically fail the step."""
-    outcome = yield
-    report = outcome.get_result()
-    if report.outcome == "skipped":
-        # Skipped steps won't invoke the method/fixtures at all, so we need to manually record a step.
-        if REPORT_CONTEXT:
-            with REPORT_CONTEXT.new_step(name=item.name) as new_step:
-                new_step.current_step.update({"status": TestStatus.SKIPPED})
-    setattr(item, "rep_" + report.when, call)
-
-
-def _report_context_impl(
-    sift_client: SiftClient,
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config | None = None,
-) -> Generator[ReportContext | None, None, None]:
-    args = request.config.invocation_params.args
-    test_path = Path(args[0]) if args else None
-    if test_path is not None and test_path.exists():
-        base_name = test_path.name
-        test_case: Path | str = test_path
-    else:
-        base_name = "pytest " + " ".join(args) if args else "pytest"
-        test_case = base_name
-    log_file = _resolve_log_file(pytestconfig)
-    include_git_metadata = (
-        bool(pytestconfig.getoption("sift_test_results_git_metadata", default=True))
-        if pytestconfig
-        else True
-    )
-    with ReportContext(
-        sift_client,
-        name=f"{base_name} {datetime.now(timezone.utc).isoformat()}",
-        test_case=str(test_case),
-        log_file=log_file,
-        include_git_metadata=include_git_metadata,
-    ) as context:
-        # Set a global so we can access this in pytest hooks.
-        global REPORT_CONTEXT
-        REPORT_CONTEXT = context
-        yield context
-
-
-def _check_connection_enabled(pytestconfig: pytest.Config | None) -> bool:
-    """Return True when the caller opted into `--sift-test-results-check-connection`."""
-    if pytestconfig is None:
-        return False
-    return bool(pytestconfig.getoption("sift_test_results_check_connection", default=False))
-
-
-def _has_sift_connection(request: pytest.FixtureRequest) -> bool:
-    """Resolve the `client_has_connection` fixture lazily; only called when the check is enabled."""
-    return bool(request.getfixturevalue("client_has_connection"))
-
-
-@pytest.fixture(scope="session", autouse=True)
-def report_context(
-    sift_client: SiftClient, request: pytest.FixtureRequest, pytestconfig: pytest.Config
-) -> Generator[ReportContext | None, None, None]:
-    """Create a report context for the session.
-
-    The log file destination is controlled by ``--sift-test-results-log-file``.
-    Defaults to a temp file when not set.
-
-    When ``--sift-test-results-check-connection`` is passed, this fixture will no-op
-    (yield None) if the Sift client has no connection to the server. That mode
-    requires a ``client_has_connection`` fixture to be available in the session.
-    """
-    if _check_connection_enabled(pytestconfig) and not _has_sift_connection(request):
-        yield None
-        return
-    yield from _report_context_impl(sift_client, request, pytestconfig=pytestconfig)
-
-
-def _step_impl(
-    report_context: ReportContext, request: pytest.FixtureRequest
-) -> Generator[NewStep | None, None, None]:
-    name = str(request.node.name)
-    existing_docstring = request.node.obj.__doc__ or None
-    with report_context.new_step(
-        name=name, description=existing_docstring, assertion_as_fail_not_error=False
-    ) as new_step:
-        yield new_step
-        if hasattr(request.node, "rep_call") and request.node.rep_call.excinfo:
-            new_step.update_step_from_result(
-                request.node.rep_call.excinfo,
-                request.node.rep_call.excinfo.value,
-                request.node.rep_call.excinfo.tb,
-            )
-
-
-@pytest.fixture(autouse=True)
-def step(
-    report_context: ReportContext | None,
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config,
-) -> Generator[NewStep | None, None, None]:
-    """Create an outer step for the function.
-
-    No-ops when ``--sift-test-results-check-connection`` is set and the client
-    has no connection (or when the session-scoped ``report_context`` resolved to None).
-    """
-    if report_context is None or (
-        _check_connection_enabled(pytestconfig) and not _has_sift_connection(request)
-    ):
-        yield None
-        return
-    yield from _step_impl(report_context, request)
-
-
-@pytest.fixture(scope="module", autouse=True)
-def module_substep(
-    report_context: ReportContext | None,
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config,
-) -> Generator[NewStep | None, None, None]:
-    """Create a step per module.
-
-    No-ops when ``--sift-test-results-check-connection`` is set and the client
-    has no connection (or when the session-scoped ``report_context`` resolved to None).
-    """
-    if report_context is None or (
-        _check_connection_enabled(pytestconfig) and not _has_sift_connection(request)
-    ):
-        yield None
-        return
-    yield from _step_impl(report_context, request)
-
-
-@pytest.fixture(scope="session")
-def client_has_connection(sift_client):
-    """Check if the SiftClient has a connection to the Sift server.
-
-    Can be used to skip tests that require a connection to the Sift server, and is
-    consulted by the Sift fixtures when ``--sift-test-results-check-connection`` is set.
-    """
-    has_connection = False
-    try:
-        sift_client.ping.ping()
-        has_connection = True
-    except Exception:
-        has_connection = False
-    return has_connection
diff --git a/python/mkdocs.yml b/python/mkdocs.yml
index 90bfd10ed..5108b7e4a 100644
--- a/python/mkdocs.yml
+++ b/python/mkdocs.yml
@@ -59,7 +59,9 @@ nav:
       - examples/index.md
       - Basic Usage: examples/basic.ipynb
       - Data Ingestion: examples/ingestion.ipynb
+        # Will migrate to Guides in the future
       - Pytest Plugin: examples/pytest_plugin.md
+      - Pytest Plugin Quickstart: examples/pytest_plugin_quickstart.md
 #  - Guides:
 #      - Logging
 #      - Error Handling
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 0fc34e914..def843fcb 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sift_stack_py"
-version = "0.16.0"
+version = "0.17.0.dev0"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -26,7 +26,11 @@ keywords = ["sift", "sift-stack", "siftstack", "sift_py"]
 dependencies = [
     "grpcio~=1.13",
     "PyYAML~=6.0",
-    "rapidyaml~=0.11",
+    # TODO: rapidyaml 0.13.0 ships C++ source that fails to compile against
+    # the GCC version on current GitHub Actions runners (csubstr operator=
+    # and SFINAE errors in the bundled c4core). Cap below 0.13 until either
+    # rapidyaml ships fixed sdists or we move to binary wheels.
+    "rapidyaml>=0.11,<0.13",
     "pandas>=2.0,<3.1",
     "protobuf>=5.0",
     "pydantic~=2.10",
@@ -333,6 +337,7 @@ ignore_errors = true
 
 [tool.setuptools.packages.find]
 where = ["lib"]
+exclude = ["sift_client._tests", "sift_client._tests.*"]
 
 [tool.setuptools.package-data]
 sift_grafana = ["py.typed"]
@@ -388,6 +393,21 @@ select = [
 env_files = [
     ".env"
 ]
+# `pytester` is registered globally because pytest 8+ disallows `pytest_plugins`
+# in non-top-level conftests. Only the plugin test suite uses it; activating it
+# globally is harmless since the fixture is opt-in.
+# The Sift pytest plugin is loaded so the project's own integration tests can
+# use its fixtures. Unit-test runs are flipped to `--sift-disabled` mode by
+# `lib/sift_client/_tests/conftest.py`.
+# `--import-mode=importlib` loads test files by path with unique synthetic
+# module names. The default `prepend` mode would try to import
+# `lib/sift_client/_tests/conftest.py` as `sift_client._tests.conftest`, which
+# fails because `_tests` is excluded from the wheel (see packages.find above).
+addopts = "-p pytester -p sift_client.pytest_plugin --import-mode=importlib"
+# The autouse gate defaults to off so unit tests don't use the Sift
+# fixtures. The integration subtree (lib/sift_client/_tests/util/) opts
+# back in via `pytest.mark.sift_include` applied in its conftest.
+sift_autouse = false
 testpaths = [
     "lib/sift_py",
     "lib/sift_client/_tests",
diff --git a/python/scripts/dev b/python/scripts/dev
index 5a1397803..81d5ec826 100755
--- a/python/scripts/dev
+++ b/python/scripts/dev
@@ -201,4 +201,5 @@ case "$1" in
         ;;
 esac
 
-exit 0
+# Leave the script's exit code as the subcommand's. A trailing `exit 0` here
+# silently masked ruff / mypy / pytest failures from the pre-push hook.