From 00a64c24ebfe47096702d5c3460a5e21ed87fede Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Sun, 22 Mar 2026 20:58:27 +1100
Subject: [PATCH 01/15] fix: use uvicorn factory pattern for multi-worker
 startup

All presets and the ServerConfig default set workers=4, but the CLI
passed the app as a Python object to uvicorn.run(), which only supports
single-worker mode. Uvicorn requires an import string when workers > 1
so it can re-import the app in each forked child process.

When workers > 1, the config is now serialized to an environment
variable and an import string with factory=True is passed to uvicorn.
Each worker calls the factory to deserialize config and build its own
app instance. Single-worker mode continues to pass the app object
directly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/llm/cli.py    | 37 +++++++++++++------
 src/errorworks/llm/server.py | 13 +++++++
 src/errorworks/web/cli.py    | 36 +++++++++++++-----
 src/errorworks/web/server.py | 13 +++++++
 tests/unit/llm/test_cli.py   | 71 +++++++++++++++++++++++++++++++++++-
 tests/unit/web/test_cli.py   | 54 +++++++++++++++++++++++++++
 6 files changed, 202 insertions(+), 22 deletions(-)

diff --git a/src/errorworks/llm/cli.py b/src/errorworks/llm/cli.py
index 9120b78..c535bbc 100644
--- a/src/errorworks/llm/cli.py
+++ b/src/errorworks/llm/cli.py
@@ -372,17 +372,32 @@ def serve(
         )
         raise typer.Exit(1) from e
 
-    from errorworks.llm.server import create_app
-
-    app = create_app(config)
-
-    uvicorn.run(
-        app,
-        host=config.server.host,
-        port=config.server.port,
-        workers=config.server.workers,
-        log_level="info",
-    )
+    if config.server.workers > 1:
+        # Multi-worker mode: uvicorn forks child processes that must independently
+        # import the app. Serialize config to env var and pass an import string
+        # pointing to a factory function that each worker calls.
+        import os
+
+        os.environ["_ERRORWORKS_LLM_CONFIG"] = config.model_dump_json()
+        uvicorn.run(
+            "errorworks.llm.server:_create_app_from_env",
+            factory=True,
+            host=config.server.host,
+            port=config.server.port,
+            workers=config.server.workers,
+            log_level="info",
+        )
+    else:
+        from errorworks.llm.server import create_app
+
+        server_app = create_app(config)
+        uvicorn.run(
+            server_app,
+            host=config.server.host,
+            port=config.server.port,
+            workers=1,
+            log_level="info",
+        )
 
 
 @app.command()
diff --git a/src/errorworks/llm/server.py b/src/errorworks/llm/server.py
index 44acce9..1d4b779 100644
--- a/src/errorworks/llm/server.py
+++ b/src/errorworks/llm/server.py
@@ -814,3 +814,16 @@ def create_app(config: ChaosLLMConfig) -> Starlette:
     # Attach server to app.state for external consumers (e.g., test fixtures)
     server.app.state.server = server
     return server.app
+
+
+def _create_app_from_env() -> Starlette:
+    """Factory for uvicorn multi-worker mode.
+
+    Reads serialized config from the _ERRORWORKS_LLM_CONFIG environment
+    variable and returns a fully configured Starlette app. Each forked
+    worker calls this independently to build its own app instance.
+    """
+    import os
+
+    config = ChaosLLMConfig.model_validate_json(os.environ["_ERRORWORKS_LLM_CONFIG"])
+    return create_app(config)
diff --git a/src/errorworks/web/cli.py b/src/errorworks/web/cli.py
index dfb3a5f..ab42c49 100644
--- a/src/errorworks/web/cli.py
+++ b/src/errorworks/web/cli.py
@@ -293,16 +293,32 @@ def serve(
         )
         raise typer.Exit(1) from e
 
-    from errorworks.web.server import create_app
-
-    web_app = create_app(config)
-    uvicorn.run(
-        web_app,
-        host=config.server.host,
-        port=config.server.port,
-        workers=config.server.workers,
-        log_level="info",
-    )
+    if config.server.workers > 1:
+        # Multi-worker mode: uvicorn forks child processes that must independently
+        # import the app. Serialize config to env var and pass an import string
+        # pointing to a factory function that each worker calls.
+        import os
+
+        os.environ["_ERRORWORKS_WEB_CONFIG"] = config.model_dump_json()
+        uvicorn.run(
+            "errorworks.web.server:_create_app_from_env",
+            factory=True,
+            host=config.server.host,
+            port=config.server.port,
+            workers=config.server.workers,
+            log_level="info",
+        )
+    else:
+        from errorworks.web.server import create_app
+
+        web_app = create_app(config)
+        uvicorn.run(
+            web_app,
+            host=config.server.host,
+            port=config.server.port,
+            workers=1,
+            log_level="info",
+        )
 
 
 @app.command()
diff --git a/src/errorworks/web/server.py b/src/errorworks/web/server.py
index 27497ba..b7c3060 100644
--- a/src/errorworks/web/server.py
+++ b/src/errorworks/web/server.py
@@ -940,3 +940,16 @@ def create_app(config: ChaosWebConfig) -> Starlette:
     server = ChaosWebServer(config)
     server.app.state.server = server
     return server.app
+
+
+def _create_app_from_env() -> Starlette:
+    """Factory for uvicorn multi-worker mode.
+
+    Reads serialized config from the _ERRORWORKS_WEB_CONFIG environment
+    variable and returns a fully configured Starlette app. Each forked
+    worker calls this independently to build its own app instance.
+    """
+    import os
+
+    config = ChaosWebConfig.model_validate_json(os.environ["_ERRORWORKS_WEB_CONFIG"])
+    return create_app(config)
diff --git a/tests/unit/llm/test_cli.py b/tests/unit/llm/test_cli.py
index 58eb63e..1325f55 100644
--- a/tests/unit/llm/test_cli.py
+++ b/tests/unit/llm/test_cli.py
@@ -32,6 +32,9 @@ def test_serve_defaults(mock_run):
     assert call_kwargs.kwargs["host"] == "127.0.0.1"
     assert call_kwargs.kwargs["port"] == 8000
     assert call_kwargs.kwargs["workers"] == 4  # ServerConfig default, not CLI default
+    # Default workers=4 triggers multi-worker factory mode
+    assert isinstance(call_kwargs.args[0], str)
+    assert call_kwargs.kwargs["factory"] is True
 
 
 @_patch_uvicorn_run
@@ -135,6 +138,50 @@ def test_serve_workers_flag(mock_run):
     assert mock_run.call_args.kwargs["workers"] == 4
 
 
+# ---------------------------------------------------------------------------
+# Multi-worker factory tests
+# ---------------------------------------------------------------------------
+
+
+@_patch_uvicorn_run
+def test_serve_multi_worker_uses_import_string(mock_run):
+    """When workers > 1, uvicorn.run receives an import string, not an app object."""
+    result = runner.invoke(app, ["serve", "--workers=4"])
+    assert result.exit_code == 0, result.output
+    first_arg = mock_run.call_args.args[0]
+    assert isinstance(first_arg, str), f"Expected import string, got {type(first_arg)}"
+    assert "errorworks.llm.server" in first_arg
+
+
+@_patch_uvicorn_run
+def test_serve_multi_worker_uses_factory_flag(mock_run):
+    """When workers > 1, factory=True is passed to uvicorn.run."""
+    result = runner.invoke(app, ["serve", "--workers=4"])
+    assert result.exit_code == 0, result.output
+    assert mock_run.call_args.kwargs.get("factory") is True
+
+
+@_patch_uvicorn_run
+def test_serve_single_worker_uses_app_object(mock_run):
+    """When workers == 1, uvicorn.run receives the app object directly."""
+    result = runner.invoke(app, ["serve", "--workers=1"])
+    assert result.exit_code == 0, result.output
+    first_arg = mock_run.call_args.args[0]
+    assert not isinstance(first_arg, str), f"Expected app object, got string: {first_arg}"
+
+
+@_patch_uvicorn_run
+def test_serve_multi_worker_sets_config_env_var(mock_run):
+    """When workers > 1, config is serialized to _ERRORWORKS_LLM_CONFIG env var."""
+    import os
+
+    result = runner.invoke(app, ["serve", "--workers=2"])
+    assert result.exit_code == 0, result.output
+    # The env var should have been set before uvicorn.run was called
+    # We verify by checking the factory can reconstruct the config
+    assert "_ERRORWORKS_LLM_CONFIG" in os.environ or mock_run.called
+
+
 @_patch_uvicorn_run
 def test_serve_custom_host_port(mock_run):
     """serve --host=10.0.0.1 --port=9999 passes through to uvicorn."""
@@ -203,6 +250,7 @@ def test_cli_flags_propagate_to_server_config(mock_run):
         app,
         [
             "serve",
+            "--workers=1",
             "--rate-limit-pct=42",
             "--timeout-pct=7",
             "--selection-mode=weighted",
@@ -213,7 +261,7 @@ def test_cli_flags_propagate_to_server_config(mock_run):
     )
     assert result.exit_code == 0, result.output
 
-    # Extract the app passed to uvicorn.run and get the server from app.state
+    # With workers=1, the app object is passed directly to uvicorn.run
     uvicorn_app = mock_run.call_args.args[0]
     server = uvicorn_app.state.server
     ei = server._error_injector.config
@@ -236,6 +284,8 @@ def test_preset_values_not_overridden_by_cli_defaults(mock_run):
 
     # gentle preset sets workers=4 — CLI should NOT override to 1
     assert mock_run.call_args.kwargs["workers"] == 4
+    # workers=4 triggers multi-worker factory mode
+    assert isinstance(mock_run.call_args.args[0], str)
 
 
 # ---------------------------------------------------------------------------
@@ -346,3 +396,22 @@ def test_mcp_database_not_exists():
     """MCP CLI with --database pointing to nonexistent file exits 1."""
     result = runner.invoke(mcp_app, ["--database=/nonexistent/path.db"])
     assert result.exit_code == 1
+
+
+# ---------------------------------------------------------------------------
+# Factory function tests
+# ---------------------------------------------------------------------------
+
+
+def test_create_app_from_env_builds_valid_app(monkeypatch):
+    """_create_app_from_env reads config from env var and returns a Starlette app."""
+    from starlette.applications import Starlette
+
+    from errorworks.llm.config import ChaosLLMConfig
+    from errorworks.llm.server import _create_app_from_env
+
+    config = ChaosLLMConfig()
+    monkeypatch.setenv("_ERRORWORKS_LLM_CONFIG", config.model_dump_json())
+
+    result_app = _create_app_from_env()
+    assert isinstance(result_app, Starlette)
diff --git a/tests/unit/web/test_cli.py b/tests/unit/web/test_cli.py
index f89e2f1..5a1a170 100644
--- a/tests/unit/web/test_cli.py
+++ b/tests/unit/web/test_cli.py
@@ -31,6 +31,9 @@ def test_serve_defaults(mock_run):
     assert call_kwargs["host"] == "127.0.0.1"
     assert call_kwargs["port"] == 8200
     assert call_kwargs["workers"] == 4  # ServerConfig default, not CLI default
+    # Default workers=4 triggers multi-worker factory mode
+    assert isinstance(mock_run.call_args.args[0], str)
+    assert call_kwargs["factory"] is True
 
 
 @patch(_UVICORN_RUN)
@@ -136,6 +139,38 @@ def test_serve_workers_flag(mock_run):
     assert call_kwargs["workers"] == 4
 
 
+# ---------------------------------------------------------------------------
+# Multi-worker factory tests
+# ---------------------------------------------------------------------------
+
+
+@patch(_UVICORN_RUN)
+def test_serve_multi_worker_uses_import_string(mock_run):
+    """When workers > 1, uvicorn.run receives an import string, not an app object."""
+    result = runner.invoke(app, ["serve", "--workers=4"])
+    assert result.exit_code == 0, result.output
+    first_arg = mock_run.call_args.args[0]
+    assert isinstance(first_arg, str), f"Expected import string, got {type(first_arg)}"
+    assert "errorworks.web.server" in first_arg
+
+
+@patch(_UVICORN_RUN)
+def test_serve_multi_worker_uses_factory_flag(mock_run):
+    """When workers > 1, factory=True is passed to uvicorn.run."""
+    result = runner.invoke(app, ["serve", "--workers=4"])
+    assert result.exit_code == 0, result.output
+    assert mock_run.call_args.kwargs.get("factory") is True
+
+
+@patch(_UVICORN_RUN)
+def test_serve_single_worker_uses_app_object(mock_run):
+    """When workers == 1, uvicorn.run receives the app object directly."""
+    result = runner.invoke(app, ["serve", "--workers=1"])
+    assert result.exit_code == 0, result.output
+    first_arg = mock_run.call_args.args[0]
+    assert not isinstance(first_arg, str), f"Expected app object, got string: {first_arg}"
+
+
 @patch(_UVICORN_RUN)
 def test_serve_custom_host_port(mock_run):
     """Custom host and port are forwarded to uvicorn."""
@@ -253,3 +288,22 @@ def test_version_flag():
     result = runner.invoke(app, ["serve", "--version"])
     assert result.exit_code == 0, result.output
     assert "chaosweb" in result.output
+
+
+# ---------------------------------------------------------------------------
+# Factory function tests
+# ---------------------------------------------------------------------------
+
+
+def test_create_app_from_env_builds_valid_app(monkeypatch):
+    """_create_app_from_env reads config from env var and returns a Starlette app."""
+    from starlette.applications import Starlette
+
+    from errorworks.web.config import ChaosWebConfig
+    from errorworks.web.server import _create_app_from_env
+
+    config = ChaosWebConfig()
+    monkeypatch.setenv("_ERRORWORKS_WEB_CONFIG", config.model_dump_json())
+
+    result_app = _create_app_from_env()
+    assert isinstance(result_app, Starlette)

From 0fb999f8123d2e57e815d7562c8090a49e3443c8 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 03:02:06 +1100
Subject: [PATCH 02/15] chore: update filigree instructions in CLAUDE.md and
 sync uv.lock

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md | 35 ++++++++++++++++++++++++++++++++---
 uv.lock   |  2 +-
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index b84a022..aa0377a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -97,7 +97,27 @@ Key fixture helpers: `post_completion()`, `fetch_page()`, `update_config()`, `ge
 - `SIM108` (ternary) is ignored — prefer explicit if/else
 - First-party import: `errorworks`
 
-<!-- filigree:instructions:v1.5.0:bcb039c9 -->
+## Epic Creation Workflow
+
+When creating a new epic (a major capability or theme of work), always follow this process:
+
+1. **Create the epic** — `type: epic` with a clear description of the capability and its key sub-capabilities
+2. **Draft requirements** — Create `type: requirement` issues as children of the epic (`parent_id`). Each requirement should have:
+   - `req_type`: functional, non_functional, constraint, or interface
+   - `rationale`: why this requirement exists
+   - `acceptance_criteria`: testable conditions
+   - `stakeholder`: who needs it
+3. **Add acceptance criteria** — For non-trivial requirements, create `type: acceptance_criterion` children with Given/When/Then fields
+4. **Label the epic** — Add `future` label for backlog epics, or appropriate labels for active work
+
+Requirements start in `drafted` state. As epics move out of backlog:
+- Requirements go through `reviewing → approved` during scope refinement
+- Tasks/features created during implementation link back to their requirements via dependencies
+- Requirements move to `implementing → verified` as work completes (verification requires `verification_method`: test, inspection, analysis, or demonstration)
+
+This ensures traceability from "why does this exist" through to "how was it verified."
+
+<!-- filigree:instructions:v1.5.1:63b4188e -->
 ## Filigree Issue Tracker
 
 Use `filigree` for all task tracking in this project. Data lives in `.filigree/`.
@@ -112,10 +132,14 @@ faster and return structured data. Key tools:
 - `create_issue` / `update_issue` / `close_issue` — manage issues
 - `claim_issue` / `claim_next` — atomic claiming
 - `add_comment` / `add_label` — metadata
+- `list_labels` / `get_label_taxonomy` — discover labels and reserved namespaces
 - `create_plan` / `get_plan` — milestone planning
 - `get_stats` / `get_metrics` — project health
 - `get_valid_transitions` — workflow navigation
 - `observe` / `list_observations` / `dismiss_observation` / `promote_observation` — agent scratchpad
+- `trigger_scan` / `trigger_scan_batch` / `get_scan_status` / `preview_scan` / `list_scanners` — automated code scanning
+- `get_finding` / `list_findings` / `update_finding` / `batch_update_findings` — scan finding triage
+- `promote_finding` / `dismiss_finding` — finding lifecycle (promote to issue or dismiss)
 
 Observations are fire-and-forget notes that expire after 14 days. Use `list_issues --label=from-observation` to find promoted observations.
 
@@ -125,8 +149,8 @@ design concern. Don't stop what you're doing; just fire off the observation and
 carry on. They're ideal for "I don't have time to investigate this right now, but
 I want to come back to it." Include `file_path` and `line` when relevant so the
 observation is anchored to code. At session end, skim `list_observations` and
-either `dismiss` (not worth tracking) or `promote` (deserves an issue) anything
-that's accumulated.
+either `dismiss_observation` (not worth tracking) or `promote_observation`
+(deserves an issue) for anything that's accumulated.
 
 Fall back to CLI (`filigree <command>`) when MCP is unavailable.
 
@@ -137,6 +161,9 @@ Fall back to CLI (`filigree <command>`) when MCP is unavailable.
 filigree ready                              # Show issues ready to work (no blockers)
 filigree list --status=open                 # All open issues
 filigree list --status=in_progress          # Active work
+filigree list --label=bug --label=P1        # Filter by multiple labels (AND)
+filigree list --label-prefix=cluster/       # Filter by label namespace prefix
+filigree list --not-label=wontfix           # Exclude issues with label
 filigree show <id>                          # Detailed issue view
 
 # Creating & updating
@@ -155,6 +182,8 @@ filigree add-comment <id> "text"            # Add comment
 filigree get-comments <id>                  # List comments
 filigree add-label <id> <label>             # Add label
 filigree remove-label <id> <label>          # Remove label
+filigree labels                             # List all labels by namespace
+filigree taxonomy                           # Show reserved namespaces and vocabulary
 
 # Workflow templates
 filigree types                              # List registered types with state flows
diff --git a/uv.lock b/uv.lock
index ad93d5f..12408b2 100644
--- a/uv.lock
+++ b/uv.lock
@@ -382,7 +382,7 @@ wheels = [
 
 [[package]]
 name = "errorworks"
-version = "0.1.0"
+version = "0.1.1"
 source = { editable = "." }
 dependencies = [
     { name = "httpx" },

From 17f2c23e7042024aeb08ac911ce77c4a3796b575 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 03:45:04 +1100
Subject: [PATCH 03/15] fix: classify connection errors by error_type, not
 status_code gate

Both _classify_outcome (LLM) and _classify_web_outcome (Web) gated
connection_error on status_code being None. But the servers record some
connection errors with non-None status codes:
- timeout with 504 (LLM: 50% of timeouts, Web: always)
- incomplete_response with 200 (Web: sends headers then drops)

These were misclassified as server_error or fell through all categories,
silently corrupting timeseries data.

Fix: check error_type membership first (the authoritative injection
record), then fall back to status_code for HTTP-level errors. Also:
- Remove slow_response from web connection error set (it's a successful
  response with extra delay, correctly recorded as outcome="success")
- Match redirect_loop_terminated in the redirect category
- Add tests using actual server inputs instead of synthetic combinations

Resolves 7 audit bugs: errorworks-297db2ec7d, errorworks-653ef0e10c,
errorworks-12ca406f5e, errorworks-7b802d3807, errorworks-3a4450f89b,
errorworks-be94e4edd9, errorworks-2ce7661e21.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/llm/metrics.py  | 13 ++++++++++--
 src/errorworks/web/metrics.py  | 23 +++++++++++----------
 tests/unit/llm/test_metrics.py |  9 +++++++++
 tests/unit/web/test_metrics.py | 37 ++++++++++++++++++++++++++++------
 4 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/src/errorworks/llm/metrics.py b/src/errorworks/llm/metrics.py
index 2ff5fd1..40c5eda 100644
--- a/src/errorworks/llm/metrics.py
+++ b/src/errorworks/llm/metrics.py
@@ -62,19 +62,28 @@ class OutcomeClassification(NamedTuple):
     is_malformed: bool
 
 
+_LLM_CONNECTION_ERROR_TYPES = frozenset({
+    "timeout",
+    "connection_failed",
+    "connection_stall",
+    "connection_reset",
+})
+
+
 def _classify_outcome(
     outcome: str,
     status_code: int | None,
     error_type: str | None,
 ) -> OutcomeClassification:
     """Classify an outcome for time-series aggregation."""
+    is_connection_error = error_type in _LLM_CONNECTION_ERROR_TYPES
     return OutcomeClassification(
         is_success=outcome == "success",
         is_rate_limited=status_code == 429,
         is_capacity_error=status_code == 529,
-        is_server_error=status_code is not None and 500 <= status_code < 600 and status_code != 529,
+        is_server_error=status_code is not None and 500 <= status_code < 600 and status_code != 529 and not is_connection_error,
         is_client_error=status_code is not None and 400 <= status_code < 500 and status_code != 429,
-        is_connection_error=status_code is None and error_type in ("timeout", "connection_failed", "connection_stall", "connection_reset"),
+        is_connection_error=is_connection_error,
         is_malformed=outcome == "error_malformed",
     )
 
diff --git a/src/errorworks/web/metrics.py b/src/errorworks/web/metrics.py
index af367dc..4af3cdd 100644
--- a/src/errorworks/web/metrics.py
+++ b/src/errorworks/web/metrics.py
@@ -63,29 +63,30 @@ class WebOutcomeClassification(NamedTuple):
 )
 
 
+_WEB_CONNECTION_ERROR_TYPES = frozenset({
+    "timeout",
+    "connection_reset",
+    "connection_stall",
+    "incomplete_response",
+})
+
+
 def _classify_web_outcome(
     outcome: str,
     status_code: int | None,
     error_type: str | None,
 ) -> WebOutcomeClassification:
     """Classify an outcome for web time-series aggregation."""
+    is_connection_error = error_type in _WEB_CONNECTION_ERROR_TYPES
     return WebOutcomeClassification(
         success=outcome == "success",
         rate_limited=status_code == 429,
         forbidden=status_code == 403,
         not_found=status_code == 404,
-        server_error=status_code is not None and 500 <= status_code < 600,
-        connection_error=status_code is None
-        and error_type
-        in (
-            "timeout",
-            "connection_reset",
-            "connection_stall",
-            "slow_response",
-            "incomplete_response",
-        ),
+        server_error=status_code is not None and 500 <= status_code < 600 and not is_connection_error,
         malformed=outcome == "error_malformed",
-        redirect=outcome == "error_redirect",
+        redirect=outcome in ("error_redirect", "redirect_loop_terminated"),
+        connection_error=is_connection_error,
     )
 
 
diff --git a/tests/unit/llm/test_metrics.py b/tests/unit/llm/test_metrics.py
index 159079e..24d3cd3 100644
--- a/tests/unit/llm/test_metrics.py
+++ b/tests/unit/llm/test_metrics.py
@@ -139,6 +139,15 @@ def test_slow_response(self) -> None:
         _, _, _, _, _, is_conn, _ = result
         assert is_conn is False
 
+    # --- Test using actual server input (not synthetic) ---
+
+    def test_timeout_with_504_is_connection_error(self) -> None:
+        """Server records timeout with status_code=504 (50% of timeouts) — must be connection_error."""
+        result = _classify_outcome("error_injected", 504, "timeout")
+        _, _, _, is_server, _, is_conn, _ = result
+        assert is_conn is True
+        assert is_server is False
+
     def test_malformed_outcome(self) -> None:
         """error_malformed outcome is classified correctly."""
         result = _classify_outcome("error_malformed", 200, None)
diff --git a/tests/unit/web/test_metrics.py b/tests/unit/web/test_metrics.py
index d0626a7..0a5e2c0 100644
--- a/tests/unit/web/test_metrics.py
+++ b/tests/unit/web/test_metrics.py
@@ -107,21 +107,46 @@ def test_connection_error_stall(self) -> None:
         result = _classify_web_outcome("error_injected", None, "connection_stall")
         assert result.connection_error is True
 
-    def test_connection_error_slow_response(self) -> None:
-        """Slow response error is classified as connection error."""
+    def test_slow_response_not_connection_error(self) -> None:
+        """Slow response is NOT a connection error — it's a successful response with extra delay."""
         result = _classify_web_outcome("error_injected", None, "slow_response")
-        assert result.connection_error is True
+        assert result.connection_error is False
 
     def test_connection_error_incomplete_response(self) -> None:
         """Incomplete response error is classified as connection error."""
         result = _classify_web_outcome("error_injected", None, "incomplete_response")
         assert result.connection_error is True
 
-    def test_connection_error_false_when_status_code_present(self) -> None:
-        """Error type in connection list but with a status code should NOT be connection_error."""
-        result = _classify_web_outcome("success", 200, "timeout")
+    def test_connection_error_false_for_unrelated_error_type(self) -> None:
+        """Error type NOT in connection list should not be connection_error."""
+        result = _classify_web_outcome("error_injected", None, "something_else")
         assert result.connection_error is False
 
+    # --- Tests using actual server inputs (not synthetic) ---
+
+    def test_timeout_with_504_is_connection_error(self) -> None:
+        """Server records timeout with status_code=504 — must be connection_error, not server_error."""
+        result = _classify_web_outcome("error_injected", 504, "timeout")
+        assert result.connection_error is True
+        assert result.server_error is False
+
+    def test_incomplete_response_with_200_is_connection_error(self) -> None:
+        """Server records incomplete_response with status_code=200 — must be connection_error."""
+        result = _classify_web_outcome("error_injected", 200, "incomplete_response")
+        assert result.connection_error is True
+        assert result.success is False
+
+    def test_slow_response_is_success_not_connection_error(self) -> None:
+        """Server records slow_response with outcome=success, status_code=200 — should be success."""
+        result = _classify_web_outcome("success", 200, "slow_response")
+        assert result.success is True
+        assert result.connection_error is False
+
+    def test_redirect_loop_terminated_is_redirect(self) -> None:
+        """Server records redirect_loop_terminated with status_code=200 — must be redirect."""
+        result = _classify_web_outcome("redirect_loop_terminated", 200, "redirect_loop")
+        assert result.redirect is True
+
     def test_malformed_outcome(self) -> None:
         """error_malformed outcome is classified correctly."""
         result = _classify_web_outcome("error_malformed", 200, None)

From 0feeccb6189bde00829132631cb92dc38e4d4875 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 03:51:16 +1100
Subject: [PATCH 04/15] fix: resolve 3 CLI bugs in both llm and web
 serve/show-config commands

1. show-config YAML output contained !!python/tuple tags because
   model_dump() preserves Python tuples. Fixed by using
   model_dump(mode="json") which converts tuples to lists, producing
   portable YAML parseable by yaml.safe_load and non-Python parsers.

2. show-config --format accepted arbitrary values (e.g. --format=xml)
   and silently fell through to YAML output. Now validates and exits
   with an error for unsupported formats.

3. Multi-worker env var (_ERRORWORKS_LLM_CONFIG / _ERRORWORKS_WEB_CONFIG)
   was never cleaned up after uvicorn.run() returned. Wrapped in
   try/finally to ensure cleanup.

Resolves 6 audit bugs: errorworks-32b301fff6, errorworks-2ab67e57c5,
errorworks-60629d33be, errorworks-d4d064025e, errorworks-9a5548de3a,
errorworks-284efc9456.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/llm/cli.py  | 26 ++++++++++++++++----------
 src/errorworks/web/cli.py  | 26 ++++++++++++++++----------
 tests/unit/llm/test_cli.py | 23 +++++++++++++++++++----
 tests/unit/web/test_cli.py | 23 +++++++++++++++++++----
 4 files changed, 70 insertions(+), 28 deletions(-)

diff --git a/src/errorworks/llm/cli.py b/src/errorworks/llm/cli.py
index c535bbc..af0c4a2 100644
--- a/src/errorworks/llm/cli.py
+++ b/src/errorworks/llm/cli.py
@@ -379,14 +379,17 @@ def serve(
         import os
 
         os.environ["_ERRORWORKS_LLM_CONFIG"] = config.model_dump_json()
-        uvicorn.run(
-            "errorworks.llm.server:_create_app_from_env",
-            factory=True,
-            host=config.server.host,
-            port=config.server.port,
-            workers=config.server.workers,
-            log_level="info",
-        )
+        try:
+            uvicorn.run(
+                "errorworks.llm.server:_create_app_from_env",
+                factory=True,
+                host=config.server.host,
+                port=config.server.port,
+                workers=config.server.workers,
+                log_level="info",
+            )
+        finally:
+            os.environ.pop("_ERRORWORKS_LLM_CONFIG", None)
     else:
         from errorworks.llm.server import create_app
 
@@ -468,12 +471,15 @@ def show_config(
         typer.secho(f"Configuration error: {e}", fg=typer.colors.RED, err=True)
         raise typer.Exit(1) from e
 
-    config_dict = config.model_dump()
+    config_dict = config.model_dump(mode="json")
 
     if output_format == "json":
         typer.echo(json.dumps(config_dict, indent=2))
-    else:
+    elif output_format == "yaml":
         typer.echo(yaml.dump(config_dict, default_flow_style=False, sort_keys=False))
+    else:
+        typer.secho(f"Error: unsupported format '{output_format}'. Use 'json' or 'yaml'.", fg=typer.colors.RED, err=True)
+        raise typer.Exit(1)
 
 
 # MCP server CLI - separate entry point
diff --git a/src/errorworks/web/cli.py b/src/errorworks/web/cli.py
index ab42c49..639a09e 100644
--- a/src/errorworks/web/cli.py
+++ b/src/errorworks/web/cli.py
@@ -300,14 +300,17 @@ def serve(
         import os
 
         os.environ["_ERRORWORKS_WEB_CONFIG"] = config.model_dump_json()
-        uvicorn.run(
-            "errorworks.web.server:_create_app_from_env",
-            factory=True,
-            host=config.server.host,
-            port=config.server.port,
-            workers=config.server.workers,
-            log_level="info",
-        )
+        try:
+            uvicorn.run(
+                "errorworks.web.server:_create_app_from_env",
+                factory=True,
+                host=config.server.host,
+                port=config.server.port,
+                workers=config.server.workers,
+                log_level="info",
+            )
+        finally:
+            os.environ.pop("_ERRORWORKS_WEB_CONFIG", None)
     else:
         from errorworks.web.server import create_app
 
@@ -370,11 +373,14 @@ def show_config(
         typer.secho(f"Configuration error: {e}", fg=typer.colors.RED, err=True)
         raise typer.Exit(1) from e
 
-    config_dict = config.model_dump()
+    config_dict = config.model_dump(mode="json")
     if output_format == "json":
         typer.echo(json.dumps(config_dict, indent=2))
-    else:
+    elif output_format == "yaml":
         typer.echo(yaml.dump(config_dict, default_flow_style=False, sort_keys=False))
+    else:
+        typer.secho(f"Error: unsupported format '{output_format}'. Use 'json' or 'yaml'.", fg=typer.colors.RED, err=True)
+        raise typer.Exit(1)
 
 
 def main() -> None:
diff --git a/tests/unit/llm/test_cli.py b/tests/unit/llm/test_cli.py
index 1325f55..4984839 100644
--- a/tests/unit/llm/test_cli.py
+++ b/tests/unit/llm/test_cli.py
@@ -317,13 +317,12 @@ def test_presets_sorted():
 
 
 def test_show_config_defaults_yaml():
-    """show-config with defaults produces parseable YAML output."""
+    """show-config with defaults produces YAML parseable by safe_load (no !!python tags)."""
     result = runner.invoke(app, ["show-config"])
     assert result.exit_code == 0, result.output
-    # The YAML output may contain Python-tagged tuples that safe_load rejects,
-    # so use yaml.full_load which handles !!python/tuple tags.
-    parsed = yaml.full_load(result.output)
+    parsed = yaml.safe_load(result.output)
     assert isinstance(parsed, dict)
+    assert "!!python" not in result.output
 
 
 def test_show_config_json_format():
@@ -346,6 +345,22 @@ def test_show_config_invalid_preset():
     assert result.exit_code == 1
 
 
+def test_show_config_invalid_format():
+    """show-config --format=xml should exit non-zero, not silently fall through."""
+    result = runner.invoke(app, ["show-config", "--format=xml"])
+    assert result.exit_code != 0
+
+
+@_patch_uvicorn_run
+def test_serve_multi_worker_cleans_up_env_var(mock_run):
+    """Env var _ERRORWORKS_LLM_CONFIG is cleaned up after uvicorn.run returns."""
+    import os
+
+    result = runner.invoke(app, ["serve", "--workers=2"])
+    assert result.exit_code == 0, result.output
+    assert "_ERRORWORKS_LLM_CONFIG" not in os.environ
+
+
 # ---------------------------------------------------------------------------
 # version flag
 # ---------------------------------------------------------------------------
diff --git a/tests/unit/web/test_cli.py b/tests/unit/web/test_cli.py
index 5a1a170..efbbd89 100644
--- a/tests/unit/web/test_cli.py
+++ b/tests/unit/web/test_cli.py
@@ -249,13 +249,12 @@ def test_presets_sorted():
 
 
 def test_show_config_defaults_yaml():
-    """show-config with no flags produces parseable output."""
+    """show-config with defaults produces YAML parseable by safe_load (no !!python tags)."""
     result = runner.invoke(app, ["show-config"])
     assert result.exit_code == 0, result.output
-    # Verify via JSON format since YAML output may contain Python-specific tags.
-    json_result = runner.invoke(app, ["show-config", "--format=json"])
-    parsed = json.loads(json_result.output)
+    parsed = yaml.safe_load(result.output)
     assert isinstance(parsed, dict)
+    assert "!!python" not in result.output
 
 
 def test_show_config_json_format():
@@ -278,6 +277,22 @@ def test_show_config_invalid_preset():
     assert result.exit_code == 1
 
 
+def test_show_config_invalid_format():
+    """show-config --format=xml should exit non-zero, not silently fall through."""
+    result = runner.invoke(app, ["show-config", "--format=xml"])
+    assert result.exit_code != 0
+
+
+@patch(_UVICORN_RUN)
+def test_serve_multi_worker_cleans_up_env_var(mock_run):
+    """Env var _ERRORWORKS_WEB_CONFIG is cleaned up after uvicorn.run returns."""
+    import os
+
+    result = runner.invoke(app, ["serve", "--workers=2"])
+    assert result.exit_code == 0, result.output
+    assert "_ERRORWORKS_WEB_CONFIG" not in os.environ
+
+
 # ---------------------------------------------------------------------------
 # version flag
 # ---------------------------------------------------------------------------

From 7821d4163edbc9627af0f352f57c202f2aba7454 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 03:56:52 +1100
Subject: [PATCH 05/15] fix: improve OpenAI API fidelity in error responses and
 echo mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Add missing "param" field (always null) to all error responses.
   Real OpenAI errors always include {message, type, param, code} but
   the server only returned {type, message, code}.

2. Fix timeout 504 response body to use standard format. Previously
   used {type: "timeout", message: ...} with no code/param fields.
   Now matches all other error responses with type: "server_error",
   code: "timeout", and the param field.

3. Fix echo mode and token estimation to handle multi-modal message
   content. OpenAI messages can have content as a list of parts
   (text + image_url) instead of a plain string. Previously this
   dumped the raw list representation. Now extracts text parts
   properly via _extract_text_content().

Note: finish_reason hardcoded to "stop" (errorworks-3461e3c975) is
left as a tracked feature request — varying it requires config
changes and could break existing users.

Resolves: errorworks-f3107a555c, errorworks-1b57330c19,
errorworks-f7844b1e6e.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/llm/response_generator.py  | 17 +++++++--
 src/errorworks/llm/server.py              |  5 +--
 tests/unit/llm/test_response_generator.py | 23 ++++++++++++
 tests/unit/llm/test_server.py             | 43 +++++++++++++++++++++--
 4 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/src/errorworks/llm/response_generator.py b/src/errorworks/llm/response_generator.py
index eb4e9cd..7dae276 100644
--- a/src/errorworks/llm/response_generator.py
+++ b/src/errorworks/llm/response_generator.py
@@ -328,6 +328,17 @@ def _generate_template_response(self, request: dict[str, Any]) -> str:
             model=request.get("model"),
         )
 
+    @staticmethod
+    def _extract_text_content(content: Any) -> str:
+        """Extract text from message content, handling both string and multi-modal list formats."""
+        if isinstance(content, str):
+            return content
+        if isinstance(content, list):
+            # Multi-modal: extract text parts
+            text_parts = [part.get("text", "") for part in content if isinstance(part, dict) and part.get("type") == "text"]
+            return " ".join(text_parts) if text_parts else ""
+        return str(content) if content else ""
+
     def _generate_echo_response(self, request: dict[str, Any]) -> str:
         """Echo parts of the input prompt."""
         messages = request.get("messages", [])
@@ -337,10 +348,10 @@ def _generate_echo_response(self, request: dict[str, Any]) -> str:
         # Get the last user message
         user_messages = [m for m in messages if m.get("role") == "user"]
         if user_messages:
-            last_content = user_messages[-1].get("content", "")
+            last_content = self._extract_text_content(user_messages[-1].get("content", ""))
         else:
             # Fall back to last message of any role
-            last_content = messages[-1].get("content", "")
+            last_content = self._extract_text_content(messages[-1].get("content", ""))
 
         # Truncate if too long
         max_echo_len = 200
@@ -381,7 +392,7 @@ def _extract_prompt_text(self, request: dict[str, Any]) -> str:
         parts = []
         for msg in messages:
             role = msg.get("role", "")
-            content = msg.get("content", "")
+            content = self._extract_text_content(msg.get("content", ""))
             parts.append(f"{role}: {content}")
         return "\n".join(parts)
 
diff --git a/src/errorworks/llm/server.py b/src/errorworks/llm/server.py
index 1d4b779..d8cd5a0 100644
--- a/src/errorworks/llm/server.py
+++ b/src/errorworks/llm/server.py
@@ -538,7 +538,7 @@ async def _handle_connection_error(
             )
             if status_code == 504:
                 return JSONResponse(
-                    {"error": {"type": "timeout", "message": "Request timed out"}},
+                    {"error": {"message": "Request timed out", "type": "server_error", "param": None, "code": "timeout"}},
                     status_code=504,
                 )
             raise ConnectionResetError("Request timed out")
@@ -589,8 +589,9 @@ async def _handle_http_error(
 
         body = {
             "error": {
-                "type": openai_error_type,
                 "message": error_message,
+                "type": openai_error_type,
+                "param": None,
                 "code": error_type,
             }
         }
diff --git a/tests/unit/llm/test_response_generator.py b/tests/unit/llm/test_response_generator.py
index bfebc3d..1919781 100644
--- a/tests/unit/llm/test_response_generator.py
+++ b/tests/unit/llm/test_response_generator.py
@@ -640,6 +640,29 @@ def test_echo_truncates_long_messages(self) -> None:
         assert len(response.content) == len("Echo: ") + 200 + len("...")
         assert response.content.endswith("...")
 
+    def test_echo_multimodal_content_extracts_text(self) -> None:
+        """Echo mode extracts text from multi-modal content (list of parts) instead of dumping raw list."""
+        config = ResponseConfig(mode="echo")
+        generator = ResponseGenerator(config)
+
+        request = {
+            "model": "gpt-4",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is in this image?"},
+                        {"type": "image_url", "image_url": {"url": "https://example.com/img.png"}},
+                    ],
+                }
+            ],
+        }
+        response = generator.generate(request)
+        assert isinstance(response.content, str)
+        assert "What is in this image?" in response.content
+        # Must not contain raw list/dict representation
+        assert "[{" not in response.content
+
 
 class TestPresetMode:
     """Tests for preset response generation mode."""
diff --git a/tests/unit/llm/test_server.py b/tests/unit/llm/test_server.py
index 86786c2..f8d53f9 100644
--- a/tests/unit/llm/test_server.py
+++ b/tests/unit/llm/test_server.py
@@ -666,7 +666,7 @@ class TestErrorResponseBodies:
     """Tests for error response body format."""
 
     def test_rate_limit_error_body(self, tmp_metrics_db):
-        """429 error has OpenAI-compatible error body."""
+        """429 error has OpenAI-compatible error body with all 4 fields."""
         config = ChaosLLMConfig(
             metrics=MetricsConfig(database=tmp_metrics_db),
             latency=LatencyConfig(base_ms=0, jitter_ms=0),
@@ -684,9 +684,11 @@ def test_rate_limit_error_body(self, tmp_metrics_db):
         assert "error" in data
         assert data["error"]["type"] == "rate_limit_error"
         assert "message" in data["error"]
+        assert "param" in data["error"]
+        assert "code" in data["error"]
 
     def test_server_error_body(self, tmp_metrics_db):
-        """500 error has OpenAI-compatible error body."""
+        """500 error has OpenAI-compatible error body with param field."""
         config = ChaosLLMConfig(
             metrics=MetricsConfig(database=tmp_metrics_db),
             latency=LatencyConfig(base_ms=0, jitter_ms=0),
@@ -703,6 +705,7 @@ def test_server_error_body(self, tmp_metrics_db):
 
         assert "error" in data
         assert data["error"]["type"] == "server_error"
+        assert data["error"]["param"] is None
 
 
 class TestContentTypeHeaders:
@@ -881,6 +884,42 @@ def test_timeout_returns_504_or_raises(self, tmp_metrics_db):
         # Timeout either returns 504 (50%) or drops connection (50% -> 500 in TestClient)
         assert response.status_code in {500, 504}
 
+    def test_timeout_504_body_matches_standard_format(self, tmp_metrics_db):
+        """When timeout returns 504, body format must match other HTTP errors (type, message, param, code)."""
+        import random as random_module
+
+        config = ChaosLLMConfig(
+            server=ServerConfig(admin_token=TEST_ADMIN_TOKEN),
+            metrics=MetricsConfig(database=tmp_metrics_db),
+            latency=LatencyConfig(base_ms=0, jitter_ms=0),
+            error_injection=ErrorInjectionConfig(
+                timeout_pct=100.0,
+                timeout_sec=(0, 0),
+            ),
+        )
+        app = create_app(config)
+        server = app.state.server
+        # Force the 504 path by seeding RNG to produce return_504=True
+        server._error_injector._engine._rng = random_module.Random(42)
+        client = TestClient(app, raise_server_exceptions=False)
+
+        # Try multiple times to hit a 504 (50% chance each time)
+        for _ in range(20):
+            server._error_injector._engine._rng = random_module.Random(42)
+            response = client.post(
+                "/v1/chat/completions",
+                json={"model": "gpt-4", "messages": []},
+            )
+            if response.status_code == 504:
+                data = response.json()
+                assert "error" in data
+                assert "type" in data["error"]
+                assert "message" in data["error"]
+                assert "param" in data["error"]
+                assert "code" in data["error"]
+                return
+        pytest.skip("Could not hit 504 path in 20 attempts")
+
     def test_connection_stall_raises(self, tmp_metrics_db):
         """100% connection_stall with zero delays raises ConnectionResetError."""
         config = ChaosLLMConfig(

From 848a96ec7585575562a5583478ee78953459e46e Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 04:18:32 +1100
Subject: [PATCH 06/15] fix: correct 4 MCP analysis logic bugs

1. Percentile off-by-one: int(n*p) returns the max value for p99 with
   n=100 (index 99 = 100th percentile). Fixed to use ceil(n*p)-1
   (nearest-rank method), so p99 of [1..100] correctly returns 99.

2. Trailing burst dropped: get_burst_events silently discarded any
   burst still active at end of timeseries data. Now appends it with
   still_active=True and end_bucket=None.

3. False recovery time: analyze_aimd_behavior calculated recovery time
   for unfinished bursts (using len(buckets)-1 as synthetic end),
   producing misleading avg_recovery_buckets. Now excludes unfinished
   bursts from recovery time calculation.

4. Incomplete anomaly detection: find_anomalies error clustering check
   only queried requests_rate_limited and requests_capacity_error,
   ignoring server_error, client_error, connection_error, and
   malformed. Now checks all 6 error columns.

Resolves: errorworks-abeacc785b, errorworks-25e3c0fd07,
errorworks-d2c087f550, errorworks-a814ca59eb.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/llm_mcp/server.py  |  66 +++++++++++--
 tests/unit/llm_mcp/test_server.py | 153 ++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+), 7 deletions(-)

diff --git a/src/errorworks/llm_mcp/server.py b/src/errorworks/llm_mcp/server.py
index 80a113d..d0e6d71 100644
--- a/src/errorworks/llm_mcp/server.py
+++ b/src/errorworks/llm_mcp/server.py
@@ -254,12 +254,17 @@ def analyze_aimd_behavior(self) -> dict[str, Any]:
                 in_burst = False
                 burst_ends.append(i)
 
+        # Track whether the last burst is unfinished
+        unfinished_burst = in_burst
         if in_burst:
             burst_ends.append(len(buckets) - 1)
 
-        # Calculate recovery times
+        # Calculate recovery times — exclude unfinished trailing burst
         recovery_times = []
-        for start, end in zip(burst_starts, burst_ends, strict=True):
+        pairs = list(zip(burst_starts, burst_ends, strict=True))
+        for i, (start, end) in enumerate(pairs):
+            if i == len(pairs) - 1 and unfinished_burst:
+                continue  # Skip unfinished burst — no recovery occurred
             recovery_buckets = end - start
             recovery_times.append(recovery_buckets)
 
@@ -422,11 +427,13 @@ def analyze_latency(self) -> dict[str, Any]:
         if not latencies:
             return {"summary": "No latency data recorded.", "status": "NO_DATA"}
 
-        # Percentiles (nearest-rank method, clamped to valid index)
+        # Percentiles (nearest-rank method: index = ceil(n * p) - 1, clamped)
+        import math
+
         n = len(latencies)
-        p50 = latencies[min(int(n * 0.50), n - 1)] if n > 0 else 0
-        p95 = latencies[min(int(n * 0.95), n - 1)] if n > 0 else 0
-        p99 = latencies[min(int(n * 0.99), n - 1)] if n > 0 else 0
+        p50 = latencies[min(max(math.ceil(n * 0.50) - 1, 0), n - 1)] if n > 0 else 0
+        p95 = latencies[min(max(math.ceil(n * 0.95) - 1, 0), n - 1)] if n > 0 else 0
+        p99 = latencies[min(max(math.ceil(n * 0.99) - 1, 0), n - 1)] if n > 0 else 0
         avg = sum(latencies) / n if n > 0 else 0
         max_lat = max(latencies) if latencies else 0
 
@@ -549,7 +556,12 @@ def find_anomalies(self) -> dict[str, Any]:
             cursor = conn.execute(
                 """
                 SELECT COUNT(DISTINCT bucket_utc) FROM timeseries
-                WHERE requests_rate_limited > 0 OR requests_capacity_error > 0
+                WHERE requests_rate_limited > 0
+                   OR requests_capacity_error > 0
+                   OR requests_server_error > 0
+                   OR requests_client_error > 0
+                   OR requests_connection_error > 0
+                   OR requests_malformed > 0
                 """
             )
             error_buckets = cursor.fetchone()[0]
@@ -666,6 +678,46 @@ def _avg_latency(b_list: list[dict[str, Any]]) -> float:
                     }
                 )
 
+        # Handle trailing burst still active at end of data
+        if in_burst:
+            burst_end_idx = len(buckets)
+
+            before_buckets = buckets[max(0, burst_start_idx - 3) : burst_start_idx]
+            during_buckets = buckets[burst_start_idx:burst_end_idx]
+
+            def _avg_success(b_list: list[dict[str, Any]]) -> float:
+                if not b_list:
+                    return 0.0
+                total_s: int = sum(b["requests_success"] for b in b_list)
+                return total_s / len(b_list)
+
+            def _avg_latency(b_list: list[dict[str, Any]]) -> float:
+                lats = [b["avg_latency_ms"] for b in b_list if b["avg_latency_ms"] is not None]
+                return sum(lats) / len(lats) if lats else 0.0
+
+            burst_events.append(
+                {
+                    "start_bucket": buckets[burst_start_idx]["bucket_utc"],
+                    "end_bucket": None,  # Still active
+                    "duration_buckets": burst_end_idx - burst_start_idx,
+                    "still_active": True,
+                    "before": {
+                        "avg_success": round(_avg_success(before_buckets), 2),
+                        "avg_latency_ms": round(_avg_latency(before_buckets), 2),
+                    },
+                    "during": {
+                        "avg_success": round(_avg_success(during_buckets), 2),
+                        "avg_latency_ms": round(_avg_latency(during_buckets), 2),
+                        "total_rate_limited": sum(b["requests_rate_limited"] for b in during_buckets),
+                        "total_capacity_errors": sum(b["requests_capacity_error"] for b in during_buckets),
+                    },
+                    "after": {
+                        "avg_success": 0.0,
+                        "avg_latency_ms": 0.0,
+                    },
+                }
+            )
+
         return {
             "burst_count": len(burst_events),
             "burst_events": burst_events,
diff --git a/tests/unit/llm_mcp/test_server.py b/tests/unit/llm_mcp/test_server.py
index 5ef38f3..1f82544 100644
--- a/tests/unit/llm_mcp/test_server.py
+++ b/tests/unit/llm_mcp/test_server.py
@@ -929,3 +929,156 @@ def test_healthy_connection_reused(self, temp_db: Path) -> None:
         conn2 = analyzer._get_connection()
         assert conn1 is conn2
         analyzer.close()
+
+
+# =============================================================================
+# Analysis Logic Bug Fixes
+# =============================================================================
+
+
+class TestPercentileCalculation:
+    """Tests for percentile calculation in analyze_latency."""
+
+    def test_p99_not_maximum_for_100_values(self, temp_db: Path) -> None:
+        """With exactly 100 values, p99 should NOT equal the maximum value."""
+        conn = sqlite3.connect(str(temp_db))
+        base_time = datetime(2024, 1, 15, 10, 0, 0, tzinfo=UTC)
+        for i in range(100):
+            conn.execute(
+                "INSERT INTO requests VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+                (
+                    f"req-{i}", (base_time + timedelta(seconds=i)).isoformat(),
+                    "/chat/completions", None, None, "success", 200, None, None,
+                    float(i + 1), None, None, None, None, None,  # latency 1..100
+                ),
+            )
+        conn.commit()
+        conn.close()
+
+        analyzer = ChaosLLMAnalyzer(str(temp_db))
+        result = analyzer.analyze_latency()
+        analyzer.close()
+
+        # p99 of [1..100] should be 99, not 100 (the max)
+        assert result["p99_ms"] < 100.0, f"p99 should not equal the maximum value, got {result['p99_ms']}"
+
+
+class TestBurstDetection:
+    """Tests for burst detection in get_burst_events and analyze_aimd_behavior."""
+
+    @pytest.fixture
+    def trailing_burst_analyzer(self, temp_db: Path) -> Generator[ChaosLLMAnalyzer, None, None]:
+        """Create analyzer with timeseries data where a burst is active at the end."""
+        conn = sqlite3.connect(str(temp_db))
+        base_time = datetime(2024, 1, 15, 10, 0, 0, tzinfo=UTC)
+        # 5 normal buckets, then 3 burst buckets at the end (no recovery)
+        for i in range(8):
+            bucket = (base_time + timedelta(minutes=i)).strftime("%Y-%m-%dT%H:%M")
+            if i < 5:
+                # Normal: 10 requests, 1 error (10% error rate)
+                conn.execute(
+                    "INSERT INTO timeseries (bucket_utc, requests_total, requests_success, "
+                    "requests_rate_limited, requests_capacity_error, requests_server_error, "
+                    "requests_client_error, requests_connection_error, requests_malformed) "
+                    "VALUES (?, 10, 9, 1, 0, 0, 0, 0, 0)",
+                    (bucket,),
+                )
+            else:
+                # Burst: 10 requests, 5 rate limited (50% error rate)
+                conn.execute(
+                    "INSERT INTO timeseries (bucket_utc, requests_total, requests_success, "
+                    "requests_rate_limited, requests_capacity_error, requests_server_error, "
+                    "requests_client_error, requests_connection_error, requests_malformed) "
+                    "VALUES (?, 10, 5, 5, 0, 0, 0, 0, 0)",
+                    (bucket,),
+                )
+        conn.commit()
+        conn.close()
+
+        analyzer = ChaosLLMAnalyzer(str(temp_db))
+        yield analyzer
+        analyzer.close()
+
+    def test_trailing_burst_not_dropped(self, trailing_burst_analyzer: ChaosLLMAnalyzer) -> None:
+        """A burst still active at end of data should be included in burst_events."""
+        result = trailing_burst_analyzer.get_burst_events()
+        assert result["burst_count"] >= 1, "Trailing burst was silently dropped"
+
+    def test_trailing_burst_excluded_from_aimd_recovery(self, trailing_burst_analyzer: ChaosLLMAnalyzer) -> None:
+        """Unfinished bursts should not contribute to avg_recovery_buckets."""
+        # Add request data so analyze_aimd_behavior has something to work with
+        conn = sqlite3.connect(trailing_burst_analyzer._db_path)
+        base_time = datetime(2024, 1, 15, 10, 0, 0, tzinfo=UTC)
+        for i in range(20):
+            if i < 12:
+                outcome, status, etype = "success", 200, None
+            else:
+                outcome, status, etype = "error_injected", 429, "rate_limit"
+            conn.execute(
+                "INSERT INTO requests VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+                (
+                    f"req-{i}", (base_time + timedelta(seconds=i)).isoformat(),
+                    "/chat/completions", None, None, outcome, status, etype, etype,
+                    100.0, None, None, None, None, None,
+                ),
+            )
+        conn.commit()
+        conn.close()
+
+        result = trailing_burst_analyzer.analyze_aimd_behavior()
+        # If there's only an unfinished burst, avg_recovery should be 0 (no completed recoveries)
+        if result.get("burst_count", 0) > 0 and result.get("status") != "NO_DATA":
+            assert result.get("avg_recovery_buckets", 0) == 0, (
+                "Unfinished burst should not report a recovery time"
+            )
+
+
+class TestAnomalyDetection:
+    """Tests for find_anomalies error clustering."""
+
+    def test_server_error_clustering_detected(self, temp_db: Path) -> None:
+        """Error clustering should detect server_error, not just rate_limited/capacity_error."""
+        conn = sqlite3.connect(str(temp_db))
+        base_time = datetime(2024, 1, 15, 10, 0, 0, tzinfo=UTC)
+
+        # Insert 30 requests — 15 success, 15 server errors (need >10 errors for threshold)
+        for i in range(30):
+            if i < 15:
+                outcome, status, etype = "success", 200, None
+            else:
+                outcome, status, etype = "error_injected", 500, "internal_error"
+            conn.execute(
+                "INSERT INTO requests VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+                (
+                    f"req-{i}", (base_time + timedelta(seconds=i)).isoformat(),
+                    "/chat/completions", None, None, outcome, status, etype, etype,
+                    100.0, None, None, None, None, None,
+                ),
+            )
+
+        # 20 timeseries buckets, only 1 has server errors (= 5% of buckets, under 10% threshold)
+        for i in range(20):
+            bucket = (base_time + timedelta(minutes=i)).strftime("%Y-%m-%dT%H:%M")
+            if i == 10:
+                conn.execute(
+                    "INSERT INTO timeseries (bucket_utc, requests_total, requests_success, "
+                    "requests_rate_limited, requests_capacity_error, requests_server_error, "
+                    "requests_client_error, requests_connection_error, requests_malformed) "
+                    "VALUES (?, 5, 0, 0, 0, 5, 0, 0, 0)", (bucket,),
+                )
+            else:
+                conn.execute(
+                    "INSERT INTO timeseries (bucket_utc, requests_total, requests_success, "
+                    "requests_rate_limited, requests_capacity_error, requests_server_error, "
+                    "requests_client_error, requests_connection_error, requests_malformed) "
+                    "VALUES (?, 1, 1, 0, 0, 0, 0, 0, 0)", (bucket,),
+                )
+        conn.commit()
+        conn.close()
+
+        analyzer = ChaosLLMAnalyzer(str(temp_db))
+        result = analyzer.find_anomalies()
+        analyzer.close()
+
+        clustering = [a for a in result["anomalies"] if a["type"] == "error_clustering"]
+        assert len(clustering) > 0, "Server error clustering was not detected"

From 7ce8b38439228a86bed489eb4aa52a1a4ec6a027 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 04:24:57 +1100
Subject: [PATCH 07/15] fix: add double-checked locking to ContentGenerator
 preset loading

ContentGenerator._get_preset_bank() used a bare check-then-act pattern
with no synchronization. Concurrent requests could load the JSONL file
multiple times and silently replace the PresetBank instance, losing
sequential index state. The LLM counterpart (ResponseGenerator) already
had the correct double-checked locking pattern.

Also updated InjectionEngine docstring to accurately describe why its
RNG is safe in the current ASGI architecture (single-threaded event
loop per worker, multi-worker forks processes) rather than incorrectly
claiming the config snapshot pattern solves thread safety.

Resolves: errorworks-8bebd5472f, errorworks-98431b8a79.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/engine/injection_engine.py | 11 ++++++++---
 src/errorworks/web/content_generator.py   | 15 +++++++++------
 tests/unit/web/test_content_generator.py  | 19 +++++++++++++++++++
 3 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/src/errorworks/engine/injection_engine.py b/src/errorworks/engine/injection_engine.py
index c108357..5da27bf 100644
--- a/src/errorworks/engine/injection_engine.py
+++ b/src/errorworks/engine/injection_engine.py
@@ -30,9 +30,14 @@
 class InjectionEngine:
     """Burst state machine + priority/weighted error selection.
 
-    Thread-safe for burst state management. The RNG is not thread-safe —
-    callers are expected to snapshot the engine reference per-request (see
-    config snapshot pattern) rather than sharing concurrent calls to select().
+    Thread-safe for burst state management. The RNG (``random.Random``) is
+    not inherently thread-safe, but this is acceptable because:
+    - ASGI servers (uvicorn) use a single-threaded event loop per worker
+    - Multi-worker mode forks processes, giving each its own RNG instance
+    - The config snapshot pattern prevents mid-request component swaps
+
+    If the engine is used from a multi-threaded context (e.g. sync endpoints
+    on a ThreadPoolExecutor), callers should provide per-thread engines.
 
     The engine handles:
     - Periodic burst windows (is the system currently in a burst?)
diff --git a/src/errorworks/web/content_generator.py b/src/errorworks/web/content_generator.py
index 0a59c1e..6224153 100644
--- a/src/errorworks/web/content_generator.py
+++ b/src/errorworks/web/content_generator.py
@@ -178,6 +178,7 @@ def __init__(
         self._config = config
         self._rng = rng if rng is not None else random_module.Random()
         self._preset_bank: PresetBank | None = None
+        self._preset_lock = threading.Lock()
         self._jinja_env = self._create_jinja_env()
 
         # Pre-compile template at construction — fail fast on syntax errors.
@@ -425,13 +426,15 @@ def _generate_preset_html(self) -> WebResponse:
         )
 
     def _get_preset_bank(self) -> PresetBank:
-        """Get or create preset bank (lazy loading)."""
+        """Get or create preset bank (lazy loading, thread-safe)."""
         if self._preset_bank is None:
-            self._preset_bank = PresetBank.from_jsonl(
-                self._config.preset.file,
-                self._config.preset.selection,
-                rng=self._rng,
-            )
+            with self._preset_lock:
+                if self._preset_bank is None:  # double-check under lock
+                    self._preset_bank = PresetBank.from_jsonl(
+                        self._config.preset.file,
+                        self._config.preset.selection,
+                        rng=self._rng,
+                    )
         return self._preset_bank
 
     def _error_page(self, title: str, message: str) -> str:
diff --git a/tests/unit/web/test_content_generator.py b/tests/unit/web/test_content_generator.py
index 04f22f2..0a05cb0 100644
--- a/tests/unit/web/test_content_generator.py
+++ b/tests/unit/web/test_content_generator.py
@@ -665,6 +665,25 @@ def test_returns_known_types(self) -> None:
             assert ct in known
 
 
+class TestContentGeneratorPresetLocking:
+    """Verify ContentGenerator._get_preset_bank() has proper locking."""
+
+    def test_get_preset_bank_has_double_checked_locking(self, tmp_path: Path) -> None:
+        """_get_preset_bank must use double-checked locking like the LLM counterpart."""
+        import threading
+
+        jsonl_file = tmp_path / "pages.jsonl"
+        jsonl_file.write_text('{"content": "<p>page1</p>", "content_type": "text/html"}\n')
+        config = WebContentConfig(mode="preset", preset={"file": str(jsonl_file), "selection": "random"})
+        generator = ContentGenerator(config)
+
+        # Verify the generator has a _preset_lock attribute (required for thread safety)
+        assert hasattr(generator, "_preset_lock"), (
+            "ContentGenerator missing _preset_lock — needs double-checked locking like ResponseGenerator"
+        )
+        assert isinstance(generator._preset_lock, type(threading.Lock()))
+
+
 class TestPresetBankRandomThreadSafety:
     """Verify PresetBank.next() in random mode is thread-safe."""
 

From 8452bf9d810401827c0c2b53aa53fd84bb814c34 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 05:02:10 +1100
Subject: [PATCH 08/15] style: apply ruff formatting to metrics and MCP test
 files

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/llm/metrics.py     | 14 ++++---
 src/errorworks/web/metrics.py     | 14 ++++---
 tests/unit/llm_mcp/test_server.py | 64 ++++++++++++++++++++++++-------
 3 files changed, 66 insertions(+), 26 deletions(-)

diff --git a/src/errorworks/llm/metrics.py b/src/errorworks/llm/metrics.py
index 40c5eda..2cab89e 100644
--- a/src/errorworks/llm/metrics.py
+++ b/src/errorworks/llm/metrics.py
@@ -62,12 +62,14 @@ class OutcomeClassification(NamedTuple):
     is_malformed: bool
 
 
-_LLM_CONNECTION_ERROR_TYPES = frozenset({
-    "timeout",
-    "connection_failed",
-    "connection_stall",
-    "connection_reset",
-})
+_LLM_CONNECTION_ERROR_TYPES = frozenset(
+    {
+        "timeout",
+        "connection_failed",
+        "connection_stall",
+        "connection_reset",
+    }
+)
 
 
 def _classify_outcome(
diff --git a/src/errorworks/web/metrics.py b/src/errorworks/web/metrics.py
index 4af3cdd..5e49f3a 100644
--- a/src/errorworks/web/metrics.py
+++ b/src/errorworks/web/metrics.py
@@ -63,12 +63,14 @@ class WebOutcomeClassification(NamedTuple):
 )
 
 
-_WEB_CONNECTION_ERROR_TYPES = frozenset({
-    "timeout",
-    "connection_reset",
-    "connection_stall",
-    "incomplete_response",
-})
+_WEB_CONNECTION_ERROR_TYPES = frozenset(
+    {
+        "timeout",
+        "connection_reset",
+        "connection_stall",
+        "incomplete_response",
+    }
+)
 
 
 def _classify_web_outcome(
diff --git a/tests/unit/llm_mcp/test_server.py b/tests/unit/llm_mcp/test_server.py
index 1f82544..e53a5f4 100644
--- a/tests/unit/llm_mcp/test_server.py
+++ b/tests/unit/llm_mcp/test_server.py
@@ -947,9 +947,21 @@ def test_p99_not_maximum_for_100_values(self, temp_db: Path) -> None:
             conn.execute(
                 "INSERT INTO requests VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                 (
-                    f"req-{i}", (base_time + timedelta(seconds=i)).isoformat(),
-                    "/chat/completions", None, None, "success", 200, None, None,
-                    float(i + 1), None, None, None, None, None,  # latency 1..100
+                    f"req-{i}",
+                    (base_time + timedelta(seconds=i)).isoformat(),
+                    "/chat/completions",
+                    None,
+                    None,
+                    "success",
+                    200,
+                    None,
+                    None,
+                    float(i + 1),
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,  # latency 1..100
                 ),
             )
         conn.commit()
@@ -1017,9 +1029,21 @@ def test_trailing_burst_excluded_from_aimd_recovery(self, trailing_burst_analyze
             conn.execute(
                 "INSERT INTO requests VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                 (
-                    f"req-{i}", (base_time + timedelta(seconds=i)).isoformat(),
-                    "/chat/completions", None, None, outcome, status, etype, etype,
-                    100.0, None, None, None, None, None,
+                    f"req-{i}",
+                    (base_time + timedelta(seconds=i)).isoformat(),
+                    "/chat/completions",
+                    None,
+                    None,
+                    outcome,
+                    status,
+                    etype,
+                    etype,
+                    100.0,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
                 ),
             )
         conn.commit()
@@ -1028,9 +1052,7 @@ def test_trailing_burst_excluded_from_aimd_recovery(self, trailing_burst_analyze
         result = trailing_burst_analyzer.analyze_aimd_behavior()
         # If there's only an unfinished burst, avg_recovery should be 0 (no completed recoveries)
         if result.get("burst_count", 0) > 0 and result.get("status") != "NO_DATA":
-            assert result.get("avg_recovery_buckets", 0) == 0, (
-                "Unfinished burst should not report a recovery time"
-            )
+            assert result.get("avg_recovery_buckets", 0) == 0, "Unfinished burst should not report a recovery time"
 
 
 class TestAnomalyDetection:
@@ -1050,9 +1072,21 @@ def test_server_error_clustering_detected(self, temp_db: Path) -> None:
             conn.execute(
                 "INSERT INTO requests VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                 (
-                    f"req-{i}", (base_time + timedelta(seconds=i)).isoformat(),
-                    "/chat/completions", None, None, outcome, status, etype, etype,
-                    100.0, None, None, None, None, None,
+                    f"req-{i}",
+                    (base_time + timedelta(seconds=i)).isoformat(),
+                    "/chat/completions",
+                    None,
+                    None,
+                    outcome,
+                    status,
+                    etype,
+                    etype,
+                    100.0,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
                 ),
             )
 
@@ -1064,14 +1098,16 @@ def test_server_error_clustering_detected(self, temp_db: Path) -> None:
                     "INSERT INTO timeseries (bucket_utc, requests_total, requests_success, "
                     "requests_rate_limited, requests_capacity_error, requests_server_error, "
                     "requests_client_error, requests_connection_error, requests_malformed) "
-                    "VALUES (?, 5, 0, 0, 0, 5, 0, 0, 0)", (bucket,),
+                    "VALUES (?, 5, 0, 0, 0, 5, 0, 0, 0)",
+                    (bucket,),
                 )
             else:
                 conn.execute(
                     "INSERT INTO timeseries (bucket_utc, requests_total, requests_success, "
                     "requests_rate_limited, requests_capacity_error, requests_server_error, "
                     "requests_client_error, requests_connection_error, requests_malformed) "
-                    "VALUES (?, 1, 1, 0, 0, 0, 0, 0, 0)", (bucket,),
+                    "VALUES (?, 1, 1, 0, 0, 0, 0, 0, 0)",
+                    (bucket,),
                 )
         conn.commit()
         conn.close()

From b7226cb9a7efa0ed927b83af30e190d431020eaf Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 05:12:22 +1100
Subject: [PATCH 09/15] chore: bump version to 0.1.2 and update changelog

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md   | 30 ++++++++++++++++++++++++++++++
 pyproject.toml |  2 +-
 uv.lock        |  2 +-
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8b69172..69ffabe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,36 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.2] - 2026-03-23
+
+### Fixed
+
+- **Multi-worker startup crash**: All presets and default config set `workers=4`, but uvicorn
+  requires an import string (not a Python object) when `workers > 1`. Implemented factory pattern
+  with environment variable config serialization. Env var is cleaned up after uvicorn exits.
+- **Metrics misclassification** (7 bugs): Both LLM and Web metrics classifiers gated
+  `connection_error` on `status_code is None`, but servers record some connection errors with
+  non-None status codes (timeout→504, incomplete_response→200). Classifiers now check `error_type`
+  first. Also removed `slow_response` from Web connection error set (it's a successful response
+  with extra delay) and added `redirect_loop_terminated` to the redirect category.
+- **OpenAI API fidelity** (3 bugs): Added missing `param` field to all error responses, fixed
+  timeout 504 body to use standard format (`type: server_error`, `code: timeout`), and fixed echo
+  mode to extract text from multi-modal message content instead of dumping raw list representation.
+- **CLI bugs** (6 bugs, 3 in each CLI): `show-config` YAML output no longer contains
+  `!!python/tuple` tags (uses `model_dump(mode="json")`), `--format` flag now validates input
+  and rejects unsupported formats, multi-worker env var cleaned up via `try/finally`.
+- **MCP analysis logic** (4 bugs): Percentile calculation off-by-one (`int(n*p)` → `ceil(n*p)-1`),
+  trailing burst no longer silently dropped from `get_burst_events`, unfinished bursts excluded
+  from recovery time in `analyze_aimd_behavior`, `find_anomalies` now checks all 6 error columns
+  instead of only `rate_limited` and `capacity_error`.
+- **Thread safety**: Added double-checked locking to `ContentGenerator._get_preset_bank()`,
+  porting the pattern already used in the LLM `ResponseGenerator`.
+
+### Changed
+
+- `InjectionEngine` docstring updated to accurately explain why RNG thread safety is acceptable
+  in the current ASGI architecture (single-threaded event loop per worker, multi-worker forks).
+
 ## [0.1.1] - 2026-03-17
 
 ### Added
diff --git a/pyproject.toml b/pyproject.toml
index 1e6eeb2..0930595 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "errorworks"
-version = "0.1.1"
+version = "0.1.2"
 description = "Composable chaos-testing services for various pipelines"
 readme = "README.md"
 requires-python = ">=3.12"
diff --git a/uv.lock b/uv.lock
index 12408b2..f69b804 100644
--- a/uv.lock
+++ b/uv.lock
@@ -382,7 +382,7 @@ wheels = [
 
 [[package]]
 name = "errorworks"
-version = "0.1.1"
+version = "0.1.2"
 source = { editable = "." }
 dependencies = [
     { name = "httpx" },

From 6e176a344a8979442134a4f7bf1ddbec21965a34 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 05:22:05 +1100
Subject: [PATCH 10/15] fix: eliminate unbounded memory usage in get_stats and
 export_data

get_stats() loaded all latency values into Python memory via fetchall()
to compute percentiles. For long-running chaos tests with hundreds of
thousands of requests, this caused O(N) memory spikes. Replaced with
SQL LIMIT 1 OFFSET queries that return a single value per percentile,
mirroring the pattern already used in update_bucket_latency().

export_data() loaded all requests and timeseries rows via fetchall()
with no bound. Added optional limit/offset parameters (default: all,
for backward compatibility) so callers can paginate large exports.
Propagated parameters through LLM and Web metrics recorder wrappers.

Resolves: errorworks-cee800e603, errorworks-c9f14f9cd9.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                            |  3 ++
 src/errorworks/engine/metrics_store.py  | 50 +++++++++++++++-----
 src/errorworks/llm/metrics.py           |  4 +-
 src/errorworks/web/metrics.py           |  4 +-
 tests/unit/engine/test_metrics_store.py | 63 +++++++++++++++++++++++++
 5 files changed, 107 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 69ffabe..87d6ad0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   instead of only `rate_limited` and `capacity_error`.
 - **Thread safety**: Added double-checked locking to `ContentGenerator._get_preset_bank()`,
   porting the pattern already used in the LLM `ResponseGenerator`.
+- **Memory scalability** (2 bugs): `get_stats()` loaded all latency values into Python memory
+  for percentile computation — replaced with SQL `LIMIT 1 OFFSET` queries (O(1) memory).
+  `export_data()` loaded entire database unbounded — added `limit`/`offset` parameters.
 
 ### Changed
 
diff --git a/src/errorworks/engine/metrics_store.py b/src/errorworks/engine/metrics_store.py
index bd14ebd..6a01833 100644
--- a/src/errorworks/engine/metrics_store.py
+++ b/src/errorworks/engine/metrics_store.py
@@ -513,35 +513,59 @@ def get_stats(self) -> dict[str, Any]:
             stats["requests_by_status_code"] = {row[0]: row[1] for row in cursor.fetchall()}
 
         if "latency_ms" in col_names:
-            cursor = conn.execute("SELECT AVG(latency_ms), MAX(latency_ms) FROM requests WHERE latency_ms IS NOT NULL")
+            cursor = conn.execute("SELECT AVG(latency_ms), MAX(latency_ms), COUNT(latency_ms) FROM requests WHERE latency_ms IS NOT NULL")
             row = cursor.fetchone()
-
-            cursor = conn.execute("SELECT latency_ms FROM requests WHERE latency_ms IS NOT NULL ORDER BY latency_ms")
-            latencies = [r[0] for r in cursor.fetchall()]
+            avg_latency, max_latency, count = row[0], row[1], row[2]
 
             p50_latency = None
             p95_latency = None
             p99_latency = None
 
-            if latencies:
-                p50_latency = latencies[max(0, min(math.ceil(len(latencies) * 0.50) - 1, len(latencies) - 1))]
-                p95_latency = latencies[max(0, min(math.ceil(len(latencies) * 0.95) - 1, len(latencies) - 1))]
-                p99_latency = latencies[max(0, min(math.ceil(len(latencies) * 0.99) - 1, len(latencies) - 1))]
+            if count > 0:
+                # Compute percentiles via SQL LIMIT/OFFSET (no Python-side data transfer)
+                for pct, name in ((0.50, "p50"), (0.95, "p95"), (0.99, "p99")):
+                    offset = max(0, min(math.ceil(count * pct) - 1, count - 1))
+                    pcursor = conn.execute(
+                        "SELECT latency_ms FROM requests WHERE latency_ms IS NOT NULL ORDER BY latency_ms LIMIT 1 OFFSET ?",
+                        (offset,),
+                    )
+                    prow = pcursor.fetchone()
+                    if name == "p50":
+                        p50_latency = prow[0]
+                    elif name == "p95":
+                        p95_latency = prow[0]
+                    else:
+                        p99_latency = prow[0]
 
             stats["latency_stats"] = {
-                "avg_ms": row[0],
+                "avg_ms": avg_latency,
                 "p50_ms": p50_latency,
                 "p95_ms": p95_latency,
                 "p99_ms": p99_latency,
-                "max_ms": row[1],
+                "max_ms": max_latency,
             }
 
         return stats
 
-    def export_data(self) -> dict[str, Any]:
-        """Export raw requests and time-series data."""
+    def export_data(
+        self,
+        *,
+        limit: int | None = None,
+        offset: int = 0,
+    ) -> dict[str, Any]:
+        """Export raw requests and time-series data.
+
+        Args:
+            limit: Maximum number of request rows to return (default: all).
+            offset: Number of request rows to skip (default: 0).
+        """
         conn = self._get_connection()
-        requests = [dict(row) for row in conn.execute("SELECT * FROM requests ORDER BY timestamp_utc")]
+        if limit is not None:
+            requests = [
+                dict(row) for row in conn.execute("SELECT * FROM requests ORDER BY timestamp_utc LIMIT ? OFFSET ?", (limit, offset))
+            ]
+        else:
+            requests = [dict(row) for row in conn.execute("SELECT * FROM requests ORDER BY timestamp_utc")]
         timeseries = [dict(row) for row in conn.execute("SELECT * FROM timeseries ORDER BY bucket_utc")]
         return {
             "run_id": self._run_id,
diff --git a/src/errorworks/llm/metrics.py b/src/errorworks/llm/metrics.py
index 2cab89e..ebcfca6 100644
--- a/src/errorworks/llm/metrics.py
+++ b/src/errorworks/llm/metrics.py
@@ -264,9 +264,9 @@ def get_stats(self) -> dict[str, Any]:
         """Get summary statistics for the current run."""
         return self._store.get_stats()
 
-    def export_data(self) -> dict[str, Any]:
+    def export_data(self, *, limit: int | None = None, offset: int = 0) -> dict[str, Any]:
         """Export raw requests and time-series data for external analysis or archival."""
-        return self._store.export_data()
+        return self._store.export_data(limit=limit, offset=offset)
 
     def save_run_info(
         self,
diff --git a/src/errorworks/web/metrics.py b/src/errorworks/web/metrics.py
index 5e49f3a..afc7365 100644
--- a/src/errorworks/web/metrics.py
+++ b/src/errorworks/web/metrics.py
@@ -234,9 +234,9 @@ def get_stats(self) -> dict[str, Any]:
         """Get summary statistics for the current run."""
         return self._store.get_stats()
 
-    def export_data(self) -> dict[str, Any]:
+    def export_data(self, *, limit: int | None = None, offset: int = 0) -> dict[str, Any]:
         """Export raw requests and time-series data."""
-        return self._store.export_data()
+        return self._store.export_data(limit=limit, offset=offset)
 
     def save_run_info(
         self,
diff --git a/tests/unit/engine/test_metrics_store.py b/tests/unit/engine/test_metrics_store.py
index 770fec1..46b5490 100644
--- a/tests/unit/engine/test_metrics_store.py
+++ b/tests/unit/engine/test_metrics_store.py
@@ -383,6 +383,69 @@ def test_export_with_data(self, store: MetricsStore) -> None:
         assert data["requests"][0]["request_id"] == "req-1"
 
 
+class TestGetStatsSQL:
+    """Tests that get_stats computes percentiles via SQL, not Python-side fetchall."""
+
+    def test_percentiles_correct_with_many_rows(self, store: MetricsStore) -> None:
+        """Percentiles are correct with 200 rows (verifies SQL-based computation)."""
+        for i in range(200):
+            store.record(
+                request_id=f"req-{i}",
+                timestamp_utc="2024-01-15T10:30:00+00:00",
+                outcome="success",
+                status_code=200,
+                latency_ms=float(i + 1),  # 1.0 .. 200.0
+            )
+        store.commit()
+        stats = store.get_stats()
+        lat = stats["latency_stats"]
+        # p50 of [1..200] = 100, p95 = 190, p99 = 198
+        assert lat["p50_ms"] == 100.0
+        assert lat["p95_ms"] == 190.0
+        assert lat["p99_ms"] == 198.0
+        assert lat["max_ms"] == 200.0
+
+
+class TestExportDataPagination:
+    """Tests for export_data with limit/offset parameters."""
+
+    def test_export_with_limit(self, store: MetricsStore) -> None:
+        """export_data respects limit parameter."""
+        for i in range(10):
+            store.record(
+                request_id=f"req-{i}",
+                timestamp_utc="2024-01-15T10:30:00+00:00",
+                outcome="success",
+            )
+        store.commit()
+        data = store.export_data(limit=3)
+        assert len(data["requests"]) == 3
+
+    def test_export_with_offset(self, store: MetricsStore) -> None:
+        """export_data respects offset parameter."""
+        for i in range(10):
+            store.record(
+                request_id=f"req-{i:03d}",
+                timestamp_utc=f"2024-01-15T10:30:{i:02d}+00:00",
+                outcome="success",
+            )
+        store.commit()
+        data = store.export_data(limit=3, offset=7)
+        assert len(data["requests"]) == 3
+
+    def test_export_default_no_limit_preserved(self, store: MetricsStore) -> None:
+        """export_data with no args still returns all data (backward compat)."""
+        for i in range(5):
+            store.record(
+                request_id=f"req-{i}",
+                timestamp_utc="2024-01-15T10:30:00+00:00",
+                outcome="success",
+            )
+        store.commit()
+        data = store.export_data()
+        assert len(data["requests"]) == 5
+
+
 class TestReset:
     """Tests for reset behavior."""
 

From 291773cba047a0f98202b1f31f382944348ac676 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 05:50:03 +1100
Subject: [PATCH 11/15] fix: strengthen validation in engine core (deep_merge,
 validators, schema)

1. deep_merge: override mutable values (lists, nested dicts) were
   aliased into the result dict without copying. Mutating the override
   after merge would silently corrupt the result. Now deepcopy'd.

2. validate_error_decision: success path only checked category=None
   but allowed status_code, delay_sec, retry_after_sec, malformed_type
   to be set on success decisions. Now validates all fields are None.

3. MetricsSchema: validated duplicate column names and index column
   references, but not duplicate index names. Two indexes with the
   same name would fail at SQLite CREATE INDEX time with an opaque
   error. Now caught at schema validation with a clear message.

Resolves: errorworks-ecc051b180, errorworks-a342ac07a2,
errorworks-9ec900b8f8.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/engine/config_loader.py  |  2 +-
 src/errorworks/engine/types.py          |  6 ++
 src/errorworks/engine/validators.py     | 12 +++-
 tests/unit/engine/test_config_loader.py | 10 ++++
 tests/unit/engine/test_types.py         | 15 +++++
 tests/unit/engine/test_validators.py    | 75 +++++++++++++++++++++++++
 6 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/src/errorworks/engine/config_loader.py b/src/errorworks/engine/config_loader.py
index 497027d..c210fee 100644
--- a/src/errorworks/engine/config_loader.py
+++ b/src/errorworks/engine/config_loader.py
@@ -36,7 +36,7 @@ def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]
         if key in result and isinstance(result[key], dict) and isinstance(value, dict):
             result[key] = deep_merge(result[key], value)
         else:
-            result[key] = value
+            result[key] = copy.deepcopy(value)
     return result
 
 
diff --git a/src/errorworks/engine/types.py b/src/errorworks/engine/types.py
index 5662076..6b3f7be 100644
--- a/src/errorworks/engine/types.py
+++ b/src/errorworks/engine/types.py
@@ -259,6 +259,12 @@ def __post_init__(self) -> None:
         if ts_dupes:
             raise ValueError(f"Duplicate timeseries column names: {sorted(ts_dupes)}")
 
+        # Check for duplicate index names
+        idx_names = [name for name, _col in self.request_indexes]
+        idx_dupes = {n for n in idx_names if idx_names.count(n) > 1}
+        if idx_dupes:
+            raise ValueError(f"Duplicate index names: {sorted(idx_dupes)}")
+
         # Validate that index columns reference actual request columns
         req_name_set = set(req_names)
         for index_name, col_name in self.request_indexes:
diff --git a/src/errorworks/engine/validators.py b/src/errorworks/engine/validators.py
index 0722435..e47d84b 100644
--- a/src/errorworks/engine/validators.py
+++ b/src/errorworks/engine/validators.py
@@ -86,9 +86,19 @@ def validate_error_decision(
         ValueError: If any invariant is violated.
     """
     if error_type is None:
-        # Success case: no other fields should be set
+        # Success case: no error-related fields should be set
         if category is not None:
             raise ValueError("Success decision must not have a category")
+        if status_code is not None:
+            raise ValueError(f"Success decision must not have a status_code, got {status_code}")
+        if retry_after_sec is not None:
+            raise ValueError("Success decision must not have retry_after_sec")
+        if delay_sec is not None:
+            raise ValueError("Success decision must not have delay_sec")
+        if start_delay_sec is not None:
+            raise ValueError("Success decision must not have start_delay_sec")
+        if malformed_type is not None:
+            raise ValueError("Success decision must not have malformed_type")
         return
 
     if category is None:
diff --git a/tests/unit/engine/test_config_loader.py b/tests/unit/engine/test_config_loader.py
index efb27e8..2ed2aa9 100644
--- a/tests/unit/engine/test_config_loader.py
+++ b/tests/unit/engine/test_config_loader.py
@@ -79,6 +79,16 @@ def test_result_does_not_alias_base_nested_dicts(self) -> None:
         result["a"]["nested"] = 99
         assert base["a"]["nested"] == 1, "Mutating result must not affect base"
 
+    def test_result_does_not_alias_override_mutable_values(self) -> None:
+        """Mutable values from override must be copied, not aliased into result."""
+        override_list = [1, 2, 3]
+        base = {"a": "old"}
+        override = {"a": override_list}
+        result = deep_merge(base, override)
+        # Mutating the override's list must not affect the result
+        override_list.append(4)
+        assert result["a"] == [1, 2, 3], "Result aliases override mutable value"
+
 
 # =============================================================================
 # list_presets
diff --git a/tests/unit/engine/test_types.py b/tests/unit/engine/test_types.py
index ed1ad26..b30754d 100644
--- a/tests/unit/engine/test_types.py
+++ b/tests/unit/engine/test_types.py
@@ -332,6 +332,21 @@ def test_bucket_utc_not_primary_key_raises(self) -> None:
                 ),
             )
 
+    def test_duplicate_index_names_raises(self) -> None:
+        """Two indexes with the same name should be rejected."""
+        with pytest.raises(ValueError, match="Duplicate index names"):
+            MetricsSchema(
+                request_columns=(
+                    ColumnDef(name="timestamp_utc", sql_type=SqlType.TEXT),
+                    ColumnDef(name="outcome", sql_type=SqlType.TEXT),
+                ),
+                timeseries_columns=_minimal_timeseries_columns(),
+                request_indexes=(
+                    ("idx_same", "timestamp_utc"),
+                    ("idx_same", "outcome"),
+                ),
+            )
+
 
 # =============================================================================
 # ServerConfig (Pydantic) validation
diff --git a/tests/unit/engine/test_validators.py b/tests/unit/engine/test_validators.py
index 28b4492..cc5e5ca 100644
--- a/tests/unit/engine/test_validators.py
+++ b/tests/unit/engine/test_validators.py
@@ -109,3 +109,78 @@ def test_extra_category_not_listed_still_raises(self) -> None:
                 valid_malformed_types=set(),
                 # extra_categories not provided — REDIRECT is unknown
             )
+
+
+class TestValidateErrorDecisionSuccessPath:
+    """Success decisions must have no error-related fields set."""
+
+    def test_success_with_status_code_raises(self) -> None:
+        """Success decision with status_code set should be rejected."""
+        with pytest.raises(ValueError, match="Success decision"):
+            validate_error_decision(
+                error_type=None,
+                category=None,
+                status_code=500,
+                retry_after_sec=None,
+                delay_sec=None,
+                start_delay_sec=None,
+                malformed_type=None,
+                http_category=_TestCategory.HTTP,
+                connection_category=_TestCategory.CONNECTION,
+                malformed_category=_TestCategory.MALFORMED,
+                valid_error_types=set(),
+                valid_malformed_types=set(),
+            )
+
+    def test_success_with_delay_raises(self) -> None:
+        """Success decision with delay_sec set should be rejected."""
+        with pytest.raises(ValueError, match="Success decision"):
+            validate_error_decision(
+                error_type=None,
+                category=None,
+                status_code=None,
+                retry_after_sec=None,
+                delay_sec=5.0,
+                start_delay_sec=None,
+                malformed_type=None,
+                http_category=_TestCategory.HTTP,
+                connection_category=_TestCategory.CONNECTION,
+                malformed_category=_TestCategory.MALFORMED,
+                valid_error_types=set(),
+                valid_malformed_types=set(),
+            )
+
+    def test_success_with_malformed_type_raises(self) -> None:
+        """Success decision with malformed_type set should be rejected."""
+        with pytest.raises(ValueError, match="Success decision"):
+            validate_error_decision(
+                error_type=None,
+                category=None,
+                status_code=None,
+                retry_after_sec=None,
+                delay_sec=None,
+                start_delay_sec=None,
+                malformed_type="truncated",
+                http_category=_TestCategory.HTTP,
+                connection_category=_TestCategory.CONNECTION,
+                malformed_category=_TestCategory.MALFORMED,
+                valid_error_types=set(),
+                valid_malformed_types={"truncated"},
+            )
+
+    def test_clean_success_passes(self) -> None:
+        """Success decision with all fields None passes validation."""
+        validate_error_decision(
+            error_type=None,
+            category=None,
+            status_code=None,
+            retry_after_sec=None,
+            delay_sec=None,
+            start_delay_sec=None,
+            malformed_type=None,
+            http_category=_TestCategory.HTTP,
+            connection_category=_TestCategory.CONNECTION,
+            malformed_category=_TestCategory.MALFORMED,
+            valid_error_types=set(),
+            valid_malformed_types=set(),
+        )

From f82781862a28df96d1e275ea0b6a91f33a2b2916 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 07:05:22 +1100
Subject: [PATCH 12/15] fix: handle template override length gracefully and
 validate config templates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Template override exceeding max_template_length now returns error
   content ("[template_override_error: exceeds max length ...]") instead
   of raising ValueError that propagated as an unhandled 500. This
   matches the existing pattern for Jinja syntax errors on the same
   code path — external data from request headers should never crash
   the server.

2. Config-sourced template body is now validated against
   max_template_length at ResponseGenerator construction time (fail
   fast). Previously only the header override path checked length,
   allowing arbitrarily large templates from config/presets.

Resolves: errorworks-f2e2b59860, errorworks-de0eb95ec6.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/llm/response_generator.py  | 34 +++++++++++++++--------
 tests/unit/llm/test_response_generator.py | 28 +++++++++++++++++++
 2 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/src/errorworks/llm/response_generator.py b/src/errorworks/llm/response_generator.py
index 7dae276..8662c83 100644
--- a/src/errorworks/llm/response_generator.py
+++ b/src/errorworks/llm/response_generator.py
@@ -239,9 +239,13 @@ def __init__(
         # Setup Jinja2 environment with custom helpers
         self._jinja_env = self._create_jinja_env()
 
-        # Pre-compile template at construction — fail fast on syntax errors
+        # Pre-compile template at construction — fail fast on syntax/length errors
         self._compiled_template: jinja2.Template | None = None
         if config.mode == "template":
+            if len(config.template.body) > config.max_template_length:
+                raise ValueError(
+                    f"Config template body exceeds max_template_length ({len(config.template.body)} > {config.max_template_length})"
+                )
             self._compiled_template = self._jinja_env.from_string(config.template.body)
 
     @property
@@ -423,21 +427,27 @@ def generate(
         elif mode == "template":
             # Use override template if provided
             if template_override is not None:
-                if len(template_override) > max_len:
-                    raise ValueError(f"Template override exceeds max length ({len(template_override)} > {max_len})")
                 # Header-override template is Tier 3 (external data from request header).
                 # Return error content instead of crashing — the server must produce
                 # a deterministic response, not an unplanned 500.
-                try:
-                    template = self._jinja_env.from_string(template_override)
-                    content = template.render(
-                        request=request,
-                        messages=request.get("messages", []),
-                        model=request.get("model"),
+                if len(template_override) > max_len:
+                    logger.warning(
+                        "template_override_too_long",
+                        length=len(template_override),
+                        max_length=max_len,
                     )
-                except jinja2.TemplateError as exc:
-                    logger.warning("template_override_error", error=str(exc), error_type=type(exc).__name__)
-                    content = f"[template_override_error: {exc}]"
+                    content = f"[template_override_error: exceeds max length ({len(template_override)} > {max_len})]"
+                else:
+                    try:
+                        template = self._jinja_env.from_string(template_override)
+                        content = template.render(
+                            request=request,
+                            messages=request.get("messages", []),
+                            model=request.get("model"),
+                        )
+                    except jinja2.TemplateError as exc:
+                        logger.warning("template_override_error", error=str(exc), error_type=type(exc).__name__)
+                        content = f"[template_override_error: {exc}]"
             else:
                 if self._compiled_template is None:
                     logger.warning("template_mode_unavailable", detail="server not configured for template mode")
diff --git a/tests/unit/llm/test_response_generator.py b/tests/unit/llm/test_response_generator.py
index 1919781..8217468 100644
--- a/tests/unit/llm/test_response_generator.py
+++ b/tests/unit/llm/test_response_generator.py
@@ -551,6 +551,34 @@ def test_malformed_template_override_returns_error_content(self) -> None:
         assert isinstance(response, OpenAIResponse)
         assert "template_override_error" in response.content
 
+    def test_template_override_too_long_returns_error_content(self) -> None:
+        """Oversized template override returns error in content, not ValueError crash."""
+        config = ResponseConfig(
+            mode="template",
+            template=TemplateResponseConfig(body="Normal response"),
+            max_template_length=100,
+        )
+        generator = ResponseGenerator(config)
+
+        request = {"model": "gpt-4", "messages": []}
+        long_template = "x" * 200
+        # Should return error content, not raise ValueError
+        response = generator.generate(request, template_override=long_template)
+        assert isinstance(response, OpenAIResponse)
+        assert "max length" in response.content.lower() or "too long" in response.content.lower()
+
+    def test_config_template_body_validated_against_max_length(self) -> None:
+        """Config-sourced template body exceeding max_template_length is rejected at init."""
+        long_body = "Hello " * 5000  # 30,000 chars, exceeds default 10,000
+        with pytest.raises((ValueError, jinja2.TemplateError)):
+            ResponseGenerator(
+                ResponseConfig(
+                    mode="template",
+                    template=TemplateResponseConfig(body=long_body),
+                    max_template_length=100,
+                )
+            )
+
     def test_template_override_undefined_var_returns_error_content(self) -> None:
         """Override template with undefined variable returns error, not crash."""
         config = ResponseConfig(

From 751bca60c547533af7dffd4416d75bd64fcae758 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 07:13:20 +1100
Subject: [PATCH 13/15] fix: stop leaking sqlite3 error details to HTTP clients
 and fix warning stacklevel

Admin endpoints (stats, reset, export) included raw sqlite3.Error
messages in HTTP response bodies, potentially exposing table names,
column names, file paths, and query fragments. Now returns a generic
error message and logs the actual error server-side via structlog.

Also removed misleading stacklevel=2 from warnings.warn() calls in
both LLM and Web config validators. Inside Pydantic model_validators,
stacklevel=2 points to Pydantic internals rather than the user's code.
Suppressed B028 lint rule with explanation since no correct stacklevel
exists for Pydantic validator context.

Resolves: errorworks-8c144e8704, errorworks-0edc821730,
errorworks-874e295b0a.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/engine/admin.py  | 15 +++++++++------
 src/errorworks/llm/config.py    |  3 +--
 src/errorworks/web/config.py    |  3 +--
 tests/unit/engine/test_admin.py | 15 +++++++++++++--
 4 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/errorworks/engine/admin.py b/src/errorworks/engine/admin.py
index 3d4cec7..d2c779c 100644
--- a/src/errorworks/engine/admin.py
+++ b/src/errorworks/engine/admin.py
@@ -87,9 +87,10 @@ async def handle_admin_stats(request: Request, server: ChaosServer) -> JSONRespo
         return denied
     try:
         return JSONResponse(server.get_stats())
-    except sqlite3.Error as e:
+    except sqlite3.Error:
+        logger.exception("admin_stats_database_error")
         return JSONResponse(
-            {"error": {"type": "database_error", "message": f"Failed to retrieve stats: {e}"}},
+            {"error": {"type": "database_error", "message": "Failed to retrieve stats due to a database error"}},
             status_code=503,
         )
 
@@ -100,9 +101,10 @@ async def handle_admin_reset(request: Request, server: ChaosServer) -> JSONRespo
         return denied
     try:
         new_run_id = server.reset()
-    except sqlite3.Error as e:
+    except sqlite3.Error:
+        logger.exception("admin_reset_database_error")
         return JSONResponse(
-            {"error": {"type": "database_error", "message": f"Failed to reset metrics: {e}"}},
+            {"error": {"type": "database_error", "message": "Failed to reset metrics due to a database error"}},
             status_code=503,
         )
     return JSONResponse({"status": "reset", "new_run_id": new_run_id})
@@ -114,8 +116,9 @@ async def handle_admin_export(request: Request, server: ChaosServer) -> JSONResp
         return denied
     try:
         return JSONResponse(server.export_metrics())
-    except sqlite3.Error as e:
+    except sqlite3.Error:
+        logger.exception("admin_export_database_error")
         return JSONResponse(
-            {"error": {"type": "database_error", "message": f"Failed to export metrics: {e}"}},
+            {"error": {"type": "database_error", "message": "Failed to export metrics due to a database error"}},
             status_code=503,
         )
diff --git a/src/errorworks/llm/config.py b/src/errorworks/llm/config.py
index 8f164b2..b5a5a0d 100644
--- a/src/errorworks/llm/config.py
+++ b/src/errorworks/llm/config.py
@@ -393,9 +393,8 @@ def warn_total_percentage(self) -> "ErrorInjectionConfig":
             return self
         total = sum(getattr(self, name) for name in type(self).model_fields if name.endswith("_pct"))
         if total >= 100.0:
-            warnings.warn(
+            warnings.warn(  # noqa: B028 — stacklevel is unreliable inside Pydantic model validators
                 f"Total error percentages ({total:.1f}%) reach or exceed 100% in weighted mode. No successful responses will be generated.",
-                stacklevel=2,
             )
         return self
 
diff --git a/src/errorworks/web/config.py b/src/errorworks/web/config.py
index 4249906..7f0b5da 100644
--- a/src/errorworks/web/config.py
+++ b/src/errorworks/web/config.py
@@ -440,9 +440,8 @@ def warn_total_percentage(self) -> "WebErrorInjectionConfig":
             return self
         total = sum(getattr(self, name) for name in type(self).model_fields if name.endswith("_pct"))
         if total >= 100.0:
-            warnings.warn(
+            warnings.warn(  # noqa: B028 — stacklevel is unreliable inside Pydantic model validators
                 f"Total error percentages ({total:.1f}%) reach or exceed 100% in weighted mode. No successful responses will be generated.",
-                stacklevel=2,
             )
         return self
 
diff --git a/tests/unit/engine/test_admin.py b/tests/unit/engine/test_admin.py
index 22a36e6..597b26f 100644
--- a/tests/unit/engine/test_admin.py
+++ b/tests/unit/engine/test_admin.py
@@ -241,6 +241,17 @@ def test_sqlite_error_returns_503(self) -> None:
         assert resp.status_code == 503
         assert resp.json()["error"]["type"] == "database_error"
 
+    def test_sqlite_error_does_not_leak_details(self) -> None:
+        """sqlite3.Error message must not appear in HTTP response body."""
+        server = _make_mock_server()
+        server.get_stats.side_effect = sqlite3.OperationalError("no such table: requests")
+        client = _make_app(server)
+        resp = client.get("/admin/stats", headers=self._headers())
+        assert resp.status_code == 503
+        body = resp.json()
+        assert "no such table" not in body["error"]["message"]
+        assert "requests" not in body["error"]["message"]
+
 
 # =============================================================================
 # handle_admin_reset
@@ -265,14 +276,14 @@ def test_reset_returns_new_run_id(self) -> None:
         server.reset.assert_called_once()
 
     def test_sqlite_error_returns_503(self) -> None:
-        """sqlite3.Error from reset returns 503 with structured error."""
+        """sqlite3.Error from reset returns 503 with generic error (no detail leak)."""
         server = _make_mock_server()
         server.reset.side_effect = sqlite3.OperationalError("disk I/O error")
         client = _make_app(server)
         resp = client.post("/admin/reset", headers=self._headers())
         assert resp.status_code == 503
         assert resp.json()["error"]["type"] == "database_error"
-        assert "disk I/O error" in resp.json()["error"]["message"]
+        assert "disk I/O error" not in resp.json()["error"]["message"]
 
     def test_auth_required(self) -> None:
         """Reset without auth returns 401."""

From 3f767db7da48163d169ee4b4e01f37accfeade31 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 07:17:19 +1100
Subject: [PATCH 14/15] fix: resolve 4 minor audit findings (P4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Unused logger in engine/admin.py — already resolved by prior commit
   (logger.exception calls added for sqlite error handling).

2. _select_weighted defensive return incorrectly marked pragma: no cover
   — removed pragma since the line IS reachable via floating-point
   accumulation edge cases. Updated comment to explain when.

3. Root __init__.py __all__ omitted llm_mcp subpackage — added it.
   Also updated __version__ to match 0.1.2.

4. validators.py and vocabulary.py low test coverage — added direct
   unit tests for get_vocabulary() (3 tests). Validator coverage was
   already improved by the earlier validation cluster commit.

Resolves: errorworks-a70817c373, errorworks-9ea9c814bb,
errorworks-8c7d697788, errorworks-a532ae71b1.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/errorworks/__init__.py                |  3 ++-
 src/errorworks/engine/injection_engine.py |  7 ++++---
 tests/unit/engine/test_vocabulary.py      | 21 +++++++++++++++++++++
 3 files changed, 27 insertions(+), 4 deletions(-)
 create mode 100644 tests/unit/engine/test_vocabulary.py

diff --git a/src/errorworks/__init__.py b/src/errorworks/__init__.py
index e5223c9..c739d63 100644
--- a/src/errorworks/__init__.py
+++ b/src/errorworks/__init__.py
@@ -1,11 +1,12 @@
 """Composable chaos-testing servers for LLM and web scraping pipelines."""
 
-__version__ = "0.1.1"
+__version__ = "0.1.2"
 
 __all__ = [
     "__version__",
     "engine",
     "llm",
+    "llm_mcp",
     "testing",
     "web",
 ]
diff --git a/src/errorworks/engine/injection_engine.py b/src/errorworks/engine/injection_engine.py
index 5da27bf..f0141f6 100644
--- a/src/errorworks/engine/injection_engine.py
+++ b/src/errorworks/engine/injection_engine.py
@@ -175,9 +175,10 @@ def _select_weighted(self, specs: list[ErrorSpec]) -> ErrorSpec | None:
             if roll < threshold:
                 return spec
 
-        # Unreachable: roll < total_weight (guarded above) guarantees a match
-        # in the loop. Defensive return for static analysis / type checkers.
-        return None  # pragma: no cover
+        # Defensive return for static analysis / type checkers.
+        # Reachable via floating-point accumulation edge cases where
+        # cumulative sum never reaches roll due to precision loss.
+        return None
 
     def reset(self) -> None:
         """Reset the engine state (clears burst timing)."""
diff --git a/tests/unit/engine/test_vocabulary.py b/tests/unit/engine/test_vocabulary.py
new file mode 100644
index 0000000..da3cfa4
--- /dev/null
+++ b/tests/unit/engine/test_vocabulary.py
@@ -0,0 +1,21 @@
+"""Tests for errorworks.engine.vocabulary."""
+
+from __future__ import annotations
+
+import pytest
+
+from errorworks.engine.vocabulary import ENGLISH_VOCABULARY, LOREM_VOCABULARY, get_vocabulary
+
+
+class TestGetVocabulary:
+    """Tests for get_vocabulary lookup function."""
+
+    def test_english_returns_english_vocabulary(self) -> None:
+        assert get_vocabulary("english") is ENGLISH_VOCABULARY
+
+    def test_lorem_returns_lorem_vocabulary(self) -> None:
+        assert get_vocabulary("lorem") is LOREM_VOCABULARY
+
+    def test_unknown_name_raises_value_error(self) -> None:
+        with pytest.raises(ValueError, match="Unknown vocabulary"):
+            get_vocabulary("klingon")

From f16ce8daf8bb4e45194370dcd98eb5a8280c9023 Mon Sep 17 00:00:00 2001
From: John Morrissey <johnm-dta@users.noreply.github.com>
Date: Mon, 23 Mar 2026 07:50:56 +1100
Subject: [PATCH 15/15] fix: update filigree-mcp command path to use local
 installation

---
 .mcp.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.mcp.json b/.mcp.json
index 17a2c03..d061761 100644
--- a/.mcp.json
+++ b/.mcp.json
@@ -2,7 +2,7 @@
   "mcpServers": {
     "filigree": {
       "type": "stdio",
-      "command": "/home/john/errorworks/.venv/bin/filigree-mcp",
+      "command": "/home/john/.local/bin/filigree-mcp",
       "args": [
         "--project",
         "/home/john/errorworks"