From 8df287b4605c4f5a9a7df92a3eb5ff5d575134a7 Mon Sep 17 00:00:00 2001
From: Ghost Scripter <ghostscripter@zerolend.xyz>
Date: Thu, 28 May 2026 03:24:46 +0530
Subject: [PATCH] fix(observability): demote reliable_chat aggregate to
 user-config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`reliable::format_failure_aggregate` (no-configured-fallbacks branch)
wraps every exhausted `reliable_chat_with_system` turn with:

  "The model `<name>` may not be available on your provider.
   Configure a fallback chain via `reliability.model_fallbacks` in
   your OpenHuman config, or change your default model in Settings
   → AI.\n\nAll providers/models failed. Attempts:\n…"

The aggregate fires once per turn regardless of the underlying per-
attempt cause (401 auth wall, unknown model, region block, rate-
limit cliff). All of those are user-actionable: pick a different
model, fix the credential, or configure fallbacks — the message
literally tells the user how. Reporting the aggregate to Sentry adds
no remediation path beyond what the per-attempt body classifiers
already cover at the lower layer (`SessionExpired`,
`BudgetExhausted`, config_rejection siblings, etc.).

Adds `"reliability.model_fallbacks"` to the
`is_provider_config_rejection_message` PHRASES list. The string is
OpenHuman-specific: that config path is rendered into an error
message only at `reliable.rs:332-334`, verified via grep across
`src/`. A stray "may not be available" log line elsewhere will not
collide. The configured-fallbacks aggregate branch (just
`"All providers/models failed. Attempts:\n…"`) is intentionally
NOT matched — the user has already engaged with the knob, so per-
attempt classifiers should drive the per-body decision.

Targets Sentry OPENHUMAN-TAURI-4JS (issue 5215): 25 events on
v0.56.0 in 5h, `domain=llm_provider operation=reliable_chat_with_system
failure=all_exhausted`. The current 25-event sample carries an
"Invalid token" 401 underlying cause (the same body the already-open
PR #2786 targets, which would also demote this aggregate via its
body substring match). This PR catches the aggregate at the
emit-site level so future all_exhausted scenarios with non-401
underlying causes (model name typo, region block, …) demote the
same way.

Tests pin the verbatim 4JS payload + three further variants
(unknown-model upstream cause, region-block cause, bare aggregate
with an empty attempts dump) + a negative guard confirming the
configured-fallbacks branch does NOT classify on the aggregate
phrase alone.
---
 .../inference/provider/config_rejection.rs    | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
diff --git a/src/openhuman/inference/provider/config_rejection.rs b/src/openhuman/inference/provider/config_rejection.rs
index df69b159f2..561bf584b5 100644
--- a/src/openhuman/inference/provider/config_rejection.rs
+++ b/src/openhuman/inference/provider/config_rejection.rs
@@ -172,6 +172,29 @@ pub fn is_provider_config_rejection_message(body: &str) -> bool {
         // cover. The UI surfaces an actionable upgrade link in the
         // remediation message itself.
         "requires a subscription, upgrade for access",
+        // OPENHUMAN-TAURI-4JS — `reliable.rs::format_failure_aggregate`
+        // (no-configured-fallbacks branch) wraps every exhausted
+        // `reliable_chat_with_system` turn with:
+        //
+        //   "The model `<name>` may not be available on your provider.
+        //    Configure a fallback chain via `reliability.model_fallbacks`
+        //    in your OpenHuman config, or change your default model in
+        //    Settings → AI.\n\nAll providers/models failed. Attempts:\n…"
+        //
+        // The aggregate fires once per turn regardless of the underlying
+        // per-attempt cause (auth wall, unknown model, region block,
+        // rate-limit cliff). All of those are user-actionable: pick a
+        // different model, fix the credential, or configure fallbacks —
+        // the message body literally tells the user how. Sentry has no
+        // remediation path the per-attempt classifiers haven't already
+        // covered at the lower layer (provider/ops.rs:486 publishes
+        // SessionExpired, billing_error covers credit walls, etc.).
+        //
+        // Anchored on `reliability.model_fallbacks` because that
+        // OpenHuman-specific config path appears only in this aggregate
+        // emit site (verified via grep across `src/`). A stray "may not
+        // be available" log line elsewhere will not collide.
+        "reliability.model_fallbacks",
     ];
 
     let lower = body.to_ascii_lowercase();
@@ -294,6 +317,78 @@ mod tests {
         }
     }
 
+    #[test]
+    fn detects_reliable_aggregate_no_fallbacks_envelope() {
+        // OPENHUMAN-TAURI-4JS — every payload below embeds the
+        // `reliability.model_fallbacks` remediation hint that
+        // `reliable::format_failure_aggregate` emits on its
+        // no-configured-fallbacks branch, so each one must classify:
+        //
+        //   1. Verbatim Sentry 4JS payload (401 auth wall as the per-attempt cause).
+        //   2. Same envelope, unknown-model (404) upstream body — shows the
+        //      match is per-emit-site, not per-underlying-cause.
+        //   3. Same envelope, region-block (403, R1-sibling) per-attempt body.
+        //   4. Bare envelope: only the remediation prefix plus an empty
+        //      attempts dump.
+        //
+        // The only anchor shared by all four is the
+        // `reliability.model_fallbacks` config path the message tells
+        // the user to set.
+        for raw in [
+            // 1) Verbatim 4JS payload.
+            "The model `reasoning-quick-v1` may not be available on your provider. \
+             Configure a fallback chain via `reliability.model_fallbacks` in your \
+             OpenHuman config, or change your default model in Settings → AI.\n\n\
+             All providers/models failed. Attempts:\n\
+             provider=openhuman model=reasoning-quick-v1 attempt 1/3: non_retryable; \
+             error=OpenHuman API error (401 Unauthorized): {\"success\":false,\"error\":\"Invalid token\"}",
+            // 2) Unknown-model upstream cause.
+            "The model `gpt-5.5` may not be available on your provider. \
+             Configure a fallback chain via `reliability.model_fallbacks` in your \
+             OpenHuman config, or change your default model in Settings → AI.\n\n\
+             All providers/models failed. Attempts:\n\
+             provider=custom_openai model=gpt-5.5 attempt 1/3: non_retryable; \
+             error=custom_openai API error (404 Not Found): {\"error\":\"model not found\"}",
+            // 3) Region-block (R1-sibling) per-attempt cause.
+            "The model `gpt-4o` may not be available on your provider. \
+             Configure a fallback chain via `reliability.model_fallbacks` in your \
+             OpenHuman config, or change your default model in Settings → AI.\n\n\
+             All providers/models failed. Attempts:\n\
+             provider=custom_openai model=gpt-4o attempt 1/3: non_retryable; \
+             error=custom_openai API error (403 Forbidden): {\"error\":{\"message\":\"This model is not available in your region.\"}}",
+            // 4) Bare aggregate — minimal anchor surface.
+            "The model `x` may not be available on your provider. \
+             Configure a fallback chain via `reliability.model_fallbacks` in your \
+             OpenHuman config, or change your default model in Settings → AI.\n\n\
+             All providers/models failed. Attempts:\n",
+        ] {
+            assert!(
+                is_provider_config_rejection_message(raw),
+                "OPENHUMAN-TAURI-4JS aggregate must classify as provider config-rejection: {raw:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn does_not_classify_reliable_aggregate_with_configured_fallbacks() {
+        // Negative guard: the configured-fallbacks branch of
+        // `format_failure_aggregate` emits ONLY the attempts dump
+        // (`"All providers/models failed. Attempts:\n…"`) with no
+        // `reliability.model_fallbacks` remediation hint — the user has
+        // already engaged with the knob, so the aggregate is closer to a
+        // real diagnostic surface than a user-config nudge. Without the
+        // anchor phrase the matcher must NOT fire; only the per-attempt
+        // body classifiers (#2786 SessionExpired, config_rejection
+        // siblings, …) may demote it on a per-shape basis.
+        let aggregate_with_fallbacks = "All providers/models failed. Attempts:\n\
+             provider=openhuman model=gpt-5.5 attempt 1/3: non_retryable; \
+             error=OpenHuman API error (404 Not Found): {\"error\":\"unknown model\"}";
+        assert!(
+            !is_provider_config_rejection_message(aggregate_with_fallbacks),
+            "configured-fallbacks aggregate (no `reliability.model_fallbacks` anchor) \
+             must NOT classify on the aggregate phrase alone"
+        );
+    }
+
     #[test]
     fn detection_is_case_insensitive() {
         assert!(is_provider_config_rejection_message(