From d7bd55384176c10bd553a7ec7b7c35bd41874cd3 Mon Sep 17 00:00:00 2001
From: liwenkai <2020583117@qq.com>
Date: Tue, 30 Jun 2026 20:55:47 +0800
Subject: [PATCH 1/2] =?UTF-8?q?refactor(auto):=20Flash-first=20=E7=BA=A7?=
 =?UTF-8?q?=E8=81=94=E8=B7=AF=E7=94=B1,=E7=A7=BB=E9=99=A4=E9=95=BF?=
 =?UTF-8?q?=E5=BA=A6=E7=8C=9C=E6=B5=8B=E4=B8=8E=20Flash=20=E8=B7=AF?=
 =?UTF-8?q?=E7=94=B1=E5=99=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

auto 模式不再靠提示长度或 Flash 分类器预判难度。除上下文压力、
debug/重构/架构/安全等明显难关键词直接走 Pro 外,其余一律 Flash-first;
当一轮内工具调用执行失败累计 ≥2 次(Flash 明显搞不定)时,本会话
粘性升级到 Pro(RouteSource::Cascade),经 telemetry 的 route_source /
route_reason 透出以便观测。

- 删除 runtime/routing.rs 整个 Flash LLM 路由器子系统及其测试
- 删除 auto_mode 的 ModelClass/Ambiguous 灰区与 classify_by_length 长度启发式
- classify_model 改为直接返回 (model, reason, source)
- run_loop 同步直调 resolve_turn_route(不再 select! 等网络路由)
---
 crates/deep-code-agent/src/auto_mode.rs       | 169 ++++-----
 crates/deep-code-agent/src/runtime.rs         |   6 +-
 .../src/runtime/integration_tests.rs          | 142 -------
 .../src/runtime/persistence.rs                |   2 +
 crates/deep-code-agent/src/runtime/routing.rs | 353 ------------------
 crates/deep-code-agent/src/runtime/state.rs   |   7 +
 .../src/runtime/tool_result.rs                |  17 +
 .../deep-code-agent/src/runtime/turn_loop.rs  |  26 +-
 8 files changed, 125 insertions(+), 597 deletions(-)
 delete mode 100644 crates/deep-code-agent/src/runtime/routing.rs

diff --git a/crates/deep-code-agent/src/auto_mode.rs b/crates/deep-code-agent/src/auto_mode.rs
index 6397ea0..f858aa6 100644
--- a/crates/deep-code-agent/src/auto_mode.rs
+++ b/crates/deep-code-agent/src/auto_mode.rs
@@ -8,18 +8,17 @@ use crate::task_class::{TaskWeight, classify_keyword};
 /// Force the strong model once the session fills this fraction of the context
 /// window — long contexts need Pro regardless of how the prompt reads.
 const CONTEXT_PRESSURE_PERCENT: u64 = 70;
-/// Prompts shorter than this (and free of difficulty keywords) default to Flash.
-const SHORT_PROMPT_CHARS: usize = 100;
 
 /// What decided a turn's route, for explainable telemetry.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum RouteSource {
     /// A non-negotiable rule (sub-agent, fixed model, context pressure).
     HardRule,
-    /// The keyword/length heuristic.
+    /// The keyword heuristic (difficulty keyword → Pro), else Flash-first.
     Heuristic,
-    /// The Flash classifier resolved an otherwise-ambiguous turn.
-    FlashRouter,
+    /// Cascade escalation: Flash visibly struggled earlier this session, so
+    /// later turns run on Pro until the session ends.
+    Cascade,
 }
 
 impl RouteSource {
@@ -28,7 +27,7 @@ impl RouteSource {
         match self {
             Self::HardRule => "hard-rule",
             Self::Heuristic => "heuristic",
-            Self::FlashRouter => "flash-router",
+            Self::Cascade => "cascade",
         }
     }
 }
@@ -40,6 +39,9 @@ pub struct RouteContext {
     pub context_tokens: u32,
     /// Context window of the model family (0 disables the pressure rule).
     pub context_window: u32,
+    /// Cascade escalation latch: Flash already struggled (repeated tool-call
+    /// failures) earlier this session, so force Pro for the rest of it.
+    pub escalated: bool,
 }
 
 impl RouteContext {
@@ -120,21 +122,6 @@ pub fn api_fallback_model(route: &TurnRoute) -> Option<&'static str> {
     }
 }
 
-/// A model-selection outcome from the deterministic heuristic.
-///
-/// `Ambiguous` is the gray zone the Phase-2 Flash router resolves; callers
-/// without a router fall back to Flash.
-pub(crate) enum ModelClass {
-    Decisive {
-        model: String,
-        reason: String,
-        source: RouteSource,
-    },
-    Ambiguous {
-        reason: String,
-    },
-}
-
 /// Resolve the concrete model + reasoning effort for one user turn.
 #[must_use]
 pub fn resolve_turn_route(
@@ -168,21 +155,7 @@ pub fn resolve_turn_route(
     }
 
     let (effective_model, route_reason, source) =
-        match classify_model(user_prompt, &ctx, config.auto_cost_saving) {
-            ModelClass::Decisive {
-                model,
-                reason,
-                source,
-            } => (model, reason, source),
-            ModelClass::Ambiguous { reason } => {
-                // No router yet: the gray zone defaults to Flash.
-                (
-                    DEEPSEEK_V4_FLASH.to_string(),
-                    reason,
-                    RouteSource::Heuristic,
-                )
-            }
-        };
+        classify_model(user_prompt, &ctx, config.auto_cost_saving);
 
     // Effort and model both derive from `task_class`, so they stay coherent.
     let effort = config.reasoning_effort.resolve(is_subagent, user_prompt);
@@ -211,68 +184,64 @@ pub fn select_auto_model(input: &str, cost_saving: bool) -> String {
 /// Model + human-readable reason for status surfaces (no session context).
 #[must_use]
 pub fn select_auto_model_with_reason(input: &str, cost_saving: bool) -> (String, String) {
-    match classify_model(input, &RouteContext::default(), cost_saving) {
-        ModelClass::Decisive { model, reason, .. } => (model, reason),
-        ModelClass::Ambiguous { reason } => (DEEPSEEK_V4_FLASH.to_string(), reason),
-    }
+    let (model, reason, _) = classify_model(input, &RouteContext::default(), cost_saving);
+    (model, reason)
 }
 
-/// Deterministic model selection over the shared [`crate::task_class`] table.
-/// Priority: context pressure → difficulty keyword → length, with the
-/// 100‑to‑threshold gray zone left `Ambiguous` for the Flash router.
-pub(crate) fn classify_model(input: &str, ctx: &RouteContext, cost_saving: bool) -> ModelClass {
+/// Flash-first model selection over the shared [`crate::task_class`] table.
+///
+/// Returns `(model, human-readable reason, source)`. Pro is forced only by hard
+/// facts (cascade escalation, context pressure) or an explicit difficulty
+/// keyword. Everything else starts on Flash — cascade escalation (driven by
+/// observed tool-call failures) upgrades later turns when Flash actually
+/// struggles, so we no longer guess difficulty from prompt length.
+pub(crate) fn classify_model(
+    input: &str,
+    ctx: &RouteContext,
+    cost_saving: bool,
+) -> (String, String, RouteSource) {
+    if ctx.escalated {
+        return (
+            DEEPSEEK_V4_PRO.to_string(),
+            "级联升级：本会话内 Flash 工具调用反复失败，改用 Pro 接管".to_string(),
+            RouteSource::Cascade,
+        );
+    }
+
     if ctx.under_pressure() {
-        return ModelClass::Decisive {
-            model: DEEPSEEK_V4_PRO.to_string(),
-            reason: format!(
+        return (
+            DEEPSEEK_V4_PRO.to_string(),
+            format!(
                 "上下文占用约 {}%（≥{CONTEXT_PRESSURE_PERCENT}% 阈值），使用 Pro 处理长上下文",
                 ctx.usage_percent()
             ),
-            source: RouteSource::HardRule,
-        };
+            RouteSource::HardRule,
+        );
     }
 
     match classify_keyword(input) {
-        Some((TaskWeight::Deep, keyword)) => ModelClass::Decisive {
-            model: DEEPSEEK_V4_PRO.to_string(),
-            reason: format!("命中调试/报错类关键词“{keyword}”，使用 Pro 配深推理"),
-            source: RouteSource::Heuristic,
-        },
-        Some((TaskWeight::Heavy, keyword)) => ModelClass::Decisive {
-            model: DEEPSEEK_V4_PRO.to_string(),
-            reason: format!("命中复杂任务关键词“{keyword}”，使用 Pro 以获得更强推理和工具规划能力"),
-            source: RouteSource::Heuristic,
-        },
-        Some((TaskWeight::Borderline, keyword)) if !cost_saving => ModelClass::Decisive {
-            model: DEEPSEEK_V4_PRO.to_string(),
-            reason: format!("任务包含“{keyword}”，且未开启成本优先，使用 Pro"),
-            source: RouteSource::Heuristic,
-        },
-        // Borderline under cost-saving and Light keywords fall through to the
-        // length check below (Light shouldn't force Flash on a long prompt).
-        _ => classify_by_length(input, cost_saving),
-    }
-}
-
-fn classify_by_length(input: &str, cost_saving: bool) -> ModelClass {
-    let len = input.chars().count();
-    if len < SHORT_PROMPT_CHARS {
-        return ModelClass::Decisive {
-            model: DEEPSEEK_V4_FLASH.to_string(),
-            reason: "短提示优先使用 Flash，降低延迟和成本".to_string(),
-            source: RouteSource::Heuristic,
-        };
-    }
-    let long_threshold = if cost_saving { 1_000 } else { 500 };
-    if len > long_threshold {
-        return ModelClass::Decisive {
-            model: DEEPSEEK_V4_PRO.to_string(),
-            reason: format!("输入长度 {len} 超过阈值 {long_threshold}，使用 Pro 处理长上下文"),
-            source: RouteSource::Heuristic,
-        };
-    }
-    ModelClass::Ambiguous {
-        reason: format!("中等长度（{len} 字）且无明确难度信号，待进一步判定"),
+        Some((TaskWeight::Deep, keyword)) => (
+            DEEPSEEK_V4_PRO.to_string(),
+            format!("命中调试/报错类关键词“{keyword}”，使用 Pro 配深推理"),
+            RouteSource::Heuristic,
+        ),
+        Some((TaskWeight::Heavy, keyword)) => (
+            DEEPSEEK_V4_PRO.to_string(),
+            format!("命中复杂任务关键词“{keyword}”，使用 Pro 以获得更强推理和工具规划能力"),
+            RouteSource::Heuristic,
+        ),
+        Some((TaskWeight::Borderline, keyword)) if !cost_saving => (
+            DEEPSEEK_V4_PRO.to_string(),
+            format!("任务包含“{keyword}”，且未开启成本优先，使用 Pro"),
+            RouteSource::Heuristic,
+        ),
+        // Everything else (Light keywords, Borderline under cost-saving, no
+        // keyword) starts on Flash; cascade upgrades it if Flash struggles.
+        _ => (
+            DEEPSEEK_V4_FLASH.to_string(),
+            "默认先用 Flash（更快更省）；若工具调用反复失败，级联会升级到 Pro".to_string(),
+            RouteSource::Heuristic,
+        ),
     }
 }
 
@@ -409,12 +378,32 @@ mod tests {
         let ctx = RouteContext {
             context_tokens: 800_000,
             context_window: 1_000_000,
+            escalated: false,
         };
         let route = resolve_turn_route(&config, &ModelRegistry::default(), "hi", false, ctx);
         assert_eq!(route.effective_model, DEEPSEEK_V4_PRO);
         assert_eq!(route.source, RouteSource::HardRule);
     }
 
+    #[test]
+    fn cascade_escalation_forces_pro_on_trivial_prompt() {
+        // Once Flash has struggled this session, even a short trivial prompt
+        // that would normally be Flash routes to Pro, tagged as Cascade.
+        let config = AgentConfig {
+            model: AUTO_MODEL.to_string(),
+            reasoning_effort: ReasoningEffortSetting::Auto,
+            ..AgentConfig::default()
+        };
+        let ctx = RouteContext {
+            escalated: true,
+            ..RouteContext::default()
+        };
+        let route = resolve_turn_route(&config, &ModelRegistry::default(), "hi", false, ctx);
+        assert_eq!(route.effective_model, DEEPSEEK_V4_PRO);
+        assert_eq!(route.source, RouteSource::Cascade);
+        assert!(route.route_reason.contains("级联升级"));
+    }
+
     #[test]
     fn api_fallback_only_for_auto_pro() {
         let route = TurnRoute {
@@ -440,7 +429,7 @@ mod tests {
 
         let (model, reason) = select_auto_model_with_reason("hi", false);
         assert_eq!(model, DEEPSEEK_V4_FLASH);
-        assert!(reason.contains("短提示"));
+        assert!(reason.contains("Flash"));
     }
 
     #[test]
diff --git a/crates/deep-code-agent/src/runtime.rs b/crates/deep-code-agent/src/runtime.rs
index 9500ccd..700b33b 100644
--- a/crates/deep-code-agent/src/runtime.rs
+++ b/crates/deep-code-agent/src/runtime.rs
@@ -19,7 +19,6 @@ mod event;
 mod handle;
 mod persistence;
 mod persistence_actor;
-mod routing;
 mod state;
 mod streaming;
 mod telemetry;
@@ -145,6 +144,8 @@ impl<C: LlmClient + 'static> AgentRuntime<C> {
                 cancel: CancellationToken::new(),
                 session_approved: Default::default(),
                 session_trusted_shell_prefixes: Default::default(),
+                cascade_escalated: false,
+                turn_tool_errors: 0,
             })),
             checkpoints: None,
             workspace: None,
@@ -189,6 +190,9 @@ impl<C: LlmClient + 'static> AgentRuntime<C> {
             state.current_prompt = Some(prompt);
             state.current_turn_id = Some(turn_id);
             state.cancel = CancellationToken::new();
+            // Per-turn struggle counter resets; the `cascade_escalated` latch
+            // intentionally persists for the rest of the session.
+            state.turn_tool_errors = 0;
         }
         self.persist().await;
     }
diff --git a/crates/deep-code-agent/src/runtime/integration_tests.rs b/crates/deep-code-agent/src/runtime/integration_tests.rs
index e9b58aa..4ffdd39 100644
--- a/crates/deep-code-agent/src/runtime/integration_tests.rs
+++ b/crates/deep-code-agent/src/runtime/integration_tests.rs
@@ -1937,145 +1937,3 @@ async fn cancel_turn_when_idle_is_silent_noop() {
         Some(RuntimeEvent::TurnFinished { .. })
     ));
 }
-
-const AMBIGUOUS_PROMPT: &str = "Walk me through the overall layout of this project and what the different \
-parts are generally for, so I can get my bearings before we begin making changes together.";
-
-fn auto_runtime(client: ScriptedClient) -> AgentRuntime<ScriptedClient> {
-    let config = AgentConfig {
-        model: crate::model_registry::AUTO_MODEL.to_string(),
-        reasoning_effort: crate::reasoning::ReasoningEffortSetting::Auto,
-        ..AgentConfig::builtin()
-    };
-    AgentRuntime::with_config(client, ToolRegistry::default(), config)
-}
-
-#[tokio::test]
-async fn flash_router_resolves_ambiguous_turn() {
-    // The ambiguous gray zone escalates to the Flash classifier, which here
-    // returns Pro/max; the route is sourced from the router.
-    let client = ScriptedClient::new(vec![vec![
-        AgentEvent::TextDelta {
-            text: r#"{"model":"pro","thinking":"max"}"#.to_string(),
-        },
-        AgentEvent::Done { usage: None },
-    ]]);
-    let runtime = auto_runtime(client);
-
-    let route = runtime
-        .route_turn(AMBIGUOUS_PROMPT, crate::auto_mode::RouteContext::default())
-        .await;
-
-    assert_eq!(route.source, crate::auto_mode::RouteSource::FlashRouter);
-    assert_eq!(
-        route.effective_model,
-        crate::model_registry::DEEPSEEK_V4_PRO
-    );
-    assert_eq!(
-        route.effective_effort,
-        crate::reasoning::ReasoningEffort::Max
-    );
-}
-
-#[tokio::test]
-async fn flash_router_assembles_prefix_completion() {
-    // /beta prefix completion: the model returns only the JSON continuation
-    // (the assistant reply was seeded with `{"model":"`).
-    let client = ScriptedClient::new(vec![vec![
-        AgentEvent::TextDelta {
-            text: "pro\",\"thinking\":\"high\"".to_string(),
-        },
-        AgentEvent::Done { usage: None },
-    ]]);
-    let runtime = auto_runtime(client);
-
-    let route = runtime
-        .route_turn(AMBIGUOUS_PROMPT, crate::auto_mode::RouteContext::default())
-        .await;
-
-    assert_eq!(route.source, crate::auto_mode::RouteSource::FlashRouter);
-    assert_eq!(
-        route.effective_model,
-        crate::model_registry::DEEPSEEK_V4_PRO
-    );
-    assert_eq!(
-        route.effective_effort,
-        crate::reasoning::ReasoningEffort::High
-    );
-}
-
-#[tokio::test]
-async fn decisive_turn_skips_flash_router() {
-    // A keyword-decisive prompt must not consult the router at all.
-    let client = ScriptedClient::new(vec![vec![
-        AgentEvent::TextDelta {
-            text: r#"{"model":"flash","thinking":"off"}"#.to_string(),
-        },
-        AgentEvent::Done { usage: None },
-    ]]);
-    let runtime = auto_runtime(client);
-
-    let route = runtime
-        .route_turn(
-            "refactor the authentication module",
-            crate::auto_mode::RouteContext::default(),
-        )
-        .await;
-
-    assert_eq!(route.source, crate::auto_mode::RouteSource::Heuristic);
-    assert_eq!(
-        route.effective_model,
-        crate::model_registry::DEEPSEEK_V4_PRO
-    );
-}
-
-#[tokio::test]
-async fn router_disabled_uses_heuristic_only() {
-    // With the router off, the ambiguous gray zone stays on the heuristic
-    // (Flash fallback) and never consults the classifier.
-    let client = ScriptedClient::new(vec![vec![
-        AgentEvent::TextDelta {
-            text: r#"{"model":"pro","thinking":"max"}"#.to_string(),
-        },
-        AgentEvent::Done { usage: None },
-    ]]);
-    let config = AgentConfig {
-        model: crate::model_registry::AUTO_MODEL.to_string(),
-        reasoning_effort: crate::reasoning::ReasoningEffortSetting::Auto,
-        router_enabled: false,
-        ..AgentConfig::builtin()
-    };
-    let runtime = AgentRuntime::with_config(client, ToolRegistry::default(), config);
-
-    let route = runtime
-        .route_turn(AMBIGUOUS_PROMPT, crate::auto_mode::RouteContext::default())
-        .await;
-
-    assert_eq!(route.source, crate::auto_mode::RouteSource::Heuristic);
-    assert_eq!(
-        route.effective_model,
-        crate::model_registry::DEEPSEEK_V4_FLASH
-    );
-}
-
-#[tokio::test]
-async fn flash_router_garbage_falls_back_to_heuristic() {
-    // Unparseable classifier output must not break routing — fall back.
-    let client = ScriptedClient::new(vec![vec![
-        AgentEvent::TextDelta {
-            text: "I cannot decide".to_string(),
-        },
-        AgentEvent::Done { usage: None },
-    ]]);
-    let runtime = auto_runtime(client);
-
-    let route = runtime
-        .route_turn(AMBIGUOUS_PROMPT, crate::auto_mode::RouteContext::default())
-        .await;
-
-    assert_eq!(route.source, crate::auto_mode::RouteSource::Heuristic);
-    assert_eq!(
-        route.effective_model,
-        crate::model_registry::DEEPSEEK_V4_FLASH
-    );
-}
diff --git a/crates/deep-code-agent/src/runtime/persistence.rs b/crates/deep-code-agent/src/runtime/persistence.rs
index 7ee0288..981fbf5 100644
--- a/crates/deep-code-agent/src/runtime/persistence.rs
+++ b/crates/deep-code-agent/src/runtime/persistence.rs
@@ -78,6 +78,8 @@ impl<C: LlmClient + 'static> AgentRuntime<C> {
                 cancel: tokio_util::sync::CancellationToken::new(),
                 session_approved: Default::default(),
                 session_trusted_shell_prefixes: Default::default(),
+                cascade_escalated: false,
+                turn_tool_errors: 0,
             })),
             checkpoints: None,
             workspace: Some(workspace.clone()),
diff --git a/crates/deep-code-agent/src/runtime/routing.rs b/crates/deep-code-agent/src/runtime/routing.rs
deleted file mode 100644
index 4852e83..0000000
--- a/crates/deep-code-agent/src/runtime/routing.rs
+++ /dev/null
@@ -1,353 +0,0 @@
-//! Phase-2 Flash router: when the deterministic heuristic
-//! ([`crate::auto_mode::classify_model`]) can't decide a turn, ask a cheap
-//! `deepseek-v4-flash` thinking-off classifier — leveraging DeepSeek's cheap
-//! Flash tier — to pick the model and thinking level from the recent context.
-//! Bounded and best-effort: it only fires on the ambiguous gray zone, has a
-//! hard timeout, and silently falls back to the heuristic on any failure.
-
-use std::time::Duration;
-
-use futures_util::StreamExt;
-use serde::Deserialize;
-use tokio::time::timeout;
-
-use crate::auto_mode::{
-    ModelClass, RouteContext, RouteSource, TurnRoute, clamp_effort_to_model, classify_model,
-    resolve_turn_route,
-};
-use crate::client::LlmClient;
-use crate::event::AgentEvent;
-use crate::message::{Message, Role};
-use crate::model::ChatRequest;
-use crate::model_registry::{DEEPSEEK_V4_FLASH, DEEPSEEK_V4_PRO};
-use crate::reasoning::ReasoningEffort;
-use crate::runtime::AgentRuntime;
-
-/// Per-message truncation for the context block.
-const ROUTER_CONTEXT_CHARS: usize = 900;
-/// Truncation for the latest prompt handed to the classifier.
-const ROUTER_PROMPT_CHARS: usize = 4000;
-/// Assistant prefix seed for DeepSeek `/beta` prefix completion: forces the
-/// classifier's reply to begin a JSON object whose first key is `model`.
-const ROUTER_JSON_PREFIX: &str = "{\"model\":\"";
-
-impl<C: LlmClient + 'static> AgentRuntime<C> {
-    /// Resolve a turn's route, escalating ambiguous turns to the Flash router.
-    pub(super) async fn route_turn(&self, user_prompt: &str, ctx: RouteContext) -> TurnRoute {
-        let heuristic = || {
-            resolve_turn_route(
-                &self.config,
-                &self.registry,
-                user_prompt,
-                self.is_subagent,
-                ctx,
-            )
-        };
-
-        // Only the auto + online + parent path consults the router.
-        if !self.config.router_enabled
-            || !self.config.auto_model_enabled()
-            || self.is_subagent
-            || self.client.provider_name() == "echo"
-        {
-            return heuristic();
-        }
-        if !matches!(
-            classify_model(user_prompt, &ctx, self.config.auto_cost_saving),
-            ModelClass::Ambiguous { .. }
-        ) {
-            return heuristic();
-        }
-
-        match self.flash_route(user_prompt).await {
-            Some(route) => route,
-            None => heuristic(),
-        }
-    }
-
-    async fn flash_route(&self, user_prompt: &str) -> Option<TurnRoute> {
-        let mut messages = self.router_messages(user_prompt).await;
-        // On `/beta`, seed the assistant reply so the classifier can only emit a
-        // parseable JSON object, and stop at the closing brace. Other endpoints
-        // free-form and are parsed leniently as before.
-        let beta = self.config.uses_beta_endpoint();
-        if beta {
-            messages.push(Message::assistant_prefix(ROUTER_JSON_PREFIX));
-        }
-        let mut request =
-            ChatRequest::streaming(DEEPSEEK_V4_FLASH, messages).with_reasoning_effort("off");
-        if beta {
-            request = request.with_stop(vec!["}".to_string()]);
-        }
-        request.max_tokens = Some(32);
-
-        let router_timeout = Duration::from_millis(self.config.router_timeout_ms);
-        let completion = timeout(router_timeout, collect_text(self.client.as_ref(), request))
-            .await
-            .ok()??;
-        let json = if beta {
-            assemble_prefix_json(&completion)
-        } else {
-            completion
-        };
-        let decision = parse_router_decision(&json)?;
-        Some(self.route_from_router(decision))
-    }
-
-    async fn router_messages(&self, user_prompt: &str) -> Vec<Message> {
-        let context = {
-            let state = self.state.lock().await;
-            recent_context(
-                state.session.messages(),
-                self.config.router_context_turns,
-                ROUTER_CONTEXT_CHARS,
-            )
-        };
-        let prompt = truncate(user_prompt, ROUTER_PROMPT_CHARS);
-        let user = if context.is_empty() {
-            format!("Latest request:\n{prompt}")
-        } else {
-            format!("Recent context (oldest first):\n{context}\n\nLatest request:\n{prompt}")
-        };
-        vec![
-            Message::system(router_system_prompt(self.config.auto_cost_saving)),
-            Message::user(user),
-        ]
-    }
-
-    fn route_from_router(&self, decision: RouterDecision) -> TurnRoute {
-        let model = if decision.is_pro() {
-            DEEPSEEK_V4_PRO
-        } else {
-            DEEPSEEK_V4_FLASH
-        }
-        .to_string();
-
-        // Honor an explicit reasoning setting; only let the router pick the tier
-        // when effort is on Auto.
-        let auto_effort = self.config.reasoning_effort.is_auto();
-        let effort = if auto_effort {
-            decision.thinking_effort()
-        } else {
-            self.config.reasoning_effort.resolve(self.is_subagent, "")
-        };
-        let effective_effort = clamp_effort_to_model(&model, effort);
-
-        let short_model = if model == DEEPSEEK_V4_PRO {
-            "Pro"
-        } else {
-            "Flash"
-        };
-        TurnRoute {
-            requested_model: self.config.model.clone(),
-            effective_model: model,
-            auto_model: true,
-            reasoning_setting: self.config.reasoning_effort,
-            effective_effort,
-            auto_effort,
-            used_model_fallback: false,
-            route_reason: format!(
-                "Flash 路由判定：中等难度任务交由 Flash 分类器，选择 {short_model}"
-            ),
-            fallback_reason: None,
-            source: RouteSource::FlashRouter,
-        }
-    }
-}
-
-fn router_system_prompt(cost_saving: bool) -> String {
-    let mut prompt = "You are deep-code's auto-routing classifier. Reply with ONLY compact JSON: \
-{\"model\":\"flash|pro\",\"thinking\":\"off|low|high|max\"}. \
-Use flash for trivial, conversational, status, or single-step work; \
-use pro for coding, debugging, multi-step, multi-file, high-risk, tool-heavy, or ambiguous work that benefits from deeper reasoning. \
-Use thinking off only for trivial no-tool answers, low for simple lookups, high for ordinary reasoning, and max for agentic/coding/debugging/architecture/security work."
-        .to_string();
-    if cost_saving {
-        prompt.push_str(
-            " Cost-saving mode is ON: resolve ambiguous cases in favour of flash, not pro.",
-        );
-    }
-    prompt
-}
-
-async fn collect_text<C: LlmClient>(client: &C, request: ChatRequest) -> Option<String> {
-    let mut stream = client.stream_chat(request).await.ok()?;
-    let mut text = String::new();
-    while let Some(event) = stream.next().await {
-        match event {
-            Ok(AgentEvent::TextDelta { text: delta }) => text.push_str(&delta),
-            Ok(AgentEvent::Done { .. }) => break,
-            Ok(_) => {}
-            Err(_) => return None,
-        }
-    }
-    let trimmed = text.trim();
-    if trimmed.is_empty() {
-        None
-    } else {
-        Some(trimmed.to_string())
-    }
-}
-
-#[derive(Debug, Deserialize)]
-struct RouterDecision {
-    model: String,
-    #[serde(default)]
-    thinking: Option<String>,
-}
-
-impl RouterDecision {
-    fn is_pro(&self) -> bool {
-        let model = self.model.trim().to_ascii_lowercase();
-        model.contains("pro")
-    }
-
-    fn thinking_effort(&self) -> ReasoningEffort {
-        match self
-            .thinking
-            .as_deref()
-            .map(str::trim)
-            .map(str::to_ascii_lowercase)
-            .as_deref()
-        {
-            Some("off") => ReasoningEffort::Off,
-            Some("low") => ReasoningEffort::Low,
-            Some("medium" | "med") => ReasoningEffort::Medium,
-            Some("max") => ReasoningEffort::Max,
-            _ => ReasoningEffort::High,
-        }
-    }
-}
-
-/// Reassemble a JSON object from a prefix-completion response. The model
-/// returns only the continuation of [`ROUTER_JSON_PREFIX`], so prepend the seed
-/// (unless the reply echoed it) and close the brace if `stop` consumed it.
-fn assemble_prefix_json(completion: &str) -> String {
-    let body = completion.trim();
-    let combined = if body.starts_with('{') {
-        body.to_string()
-    } else {
-        format!("{ROUTER_JSON_PREFIX}{body}")
-    };
-    if combined.contains('}') {
-        combined
-    } else {
-        format!("{combined}}}")
-    }
-}
-
-/// Extract and parse the first JSON object from the classifier's reply, which
-/// may carry stray prose or code fences around it.
-fn parse_router_decision(raw: &str) -> Option<RouterDecision> {
-    let start = raw.find('{')?;
-    let end = raw[start..].find('}')? + start;
-    let json = &raw[start..=end];
-    let decision: RouterDecision = serde_json::from_str(json).ok()?;
-    let model = decision.model.trim().to_ascii_lowercase();
-    // Reject hallucinated models so we fall back to the heuristic.
-    if model.contains("pro") || model.contains("flash") {
-        Some(decision)
-    } else {
-        None
-    }
-}
-
-fn recent_context(messages: &[Message], turns: usize, max_chars: usize) -> String {
-    let relevant: Vec<&Message> = messages
-        .iter()
-        .filter(|message| message.role != Role::System)
-        .collect();
-    // Drop the trailing current user prompt — it's sent separately.
-    let end = relevant.len().saturating_sub(1);
-    let start = end.saturating_sub(turns);
-    relevant[start..end]
-        .iter()
-        .filter(|message| !message.content.trim().is_empty())
-        .map(|message| {
-            let role = match message.role {
-                Role::User => "user",
-                Role::Assistant => "assistant",
-                Role::Tool => "tool",
-                Role::System => "system",
-            };
-            format!("{role}: {}", truncate(&message.content, max_chars))
-        })
-        .collect::<Vec<_>>()
-        .join("\n")
-}
-
-fn truncate(value: &str, max_chars: usize) -> String {
-    if value.chars().count() <= max_chars {
-        value.to_string()
-    } else {
-        let kept: String = value.chars().take(max_chars).collect();
-        format!("{kept}…")
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn parses_plain_json() {
-        let decision = parse_router_decision(r#"{"model":"pro","thinking":"max"}"#).unwrap();
-        assert!(decision.is_pro());
-        assert_eq!(decision.thinking_effort(), ReasoningEffort::Max);
-    }
-
-    #[test]
-    fn parses_json_with_surrounding_prose() {
-        let decision = parse_router_decision(
-            "Here you go:\n```json\n{\"model\": \"flash\", \"thinking\": \"low\"}\n```",
-        )
-        .unwrap();
-        assert!(!decision.is_pro());
-        assert_eq!(decision.thinking_effort(), ReasoningEffort::Low);
-    }
-
-    #[test]
-    fn rejects_hallucinated_model() {
-        assert!(parse_router_decision(r#"{"model":"gpt-9","thinking":"high"}"#).is_none());
-    }
-
-    #[test]
-    fn assembles_completion_only_prefix_response() {
-        // Beta prefix completion returns only the continuation of `{"model":"`.
-        let decision =
-            parse_router_decision(&assemble_prefix_json("pro\",\"thinking\":\"max\"")).unwrap();
-        assert!(decision.is_pro());
-        assert_eq!(decision.thinking_effort(), ReasoningEffort::Max);
-    }
-
-    #[test]
-    fn assemble_passes_through_echoed_full_json() {
-        let decision = parse_router_decision(&assemble_prefix_json(
-            "{\"model\":\"flash\",\"thinking\":\"low\"}",
-        ))
-        .unwrap();
-        assert!(!decision.is_pro());
-        assert_eq!(decision.thinking_effort(), ReasoningEffort::Low);
-    }
-
-    #[test]
-    fn missing_thinking_defaults_to_high() {
-        let decision = parse_router_decision(r#"{"model":"pro"}"#).unwrap();
-        assert_eq!(decision.thinking_effort(), ReasoningEffort::High);
-    }
-
-    #[test]
-    fn recent_context_drops_current_prompt_and_system() {
-        let messages = vec![
-            Message::system("sys"),
-            Message::user("first"),
-            Message::assistant("reply"),
-            Message::user("current prompt"),
-        ];
-        let context = recent_context(&messages, 6, 900);
-        assert!(context.contains("user: first"));
-        assert!(context.contains("assistant: reply"));
-        assert!(!context.contains("current prompt"));
-        assert!(!context.contains("sys"));
-    }
-}
diff --git a/crates/deep-code-agent/src/runtime/state.rs b/crates/deep-code-agent/src/runtime/state.rs
index d892620..8a66004 100644
--- a/crates/deep-code-agent/src/runtime/state.rs
+++ b/crates/deep-code-agent/src/runtime/state.rs
@@ -39,6 +39,13 @@ pub(super) struct RuntimeState {
     /// session-approvable by tool name, so this trusts at command granularity.
     /// In-memory only; compound commands are never matched (they keep prompting).
     pub(super) session_trusted_shell_prefixes: HashSet<String>,
+    /// Cascade routing latch: set once Flash visibly struggles (repeated
+    /// tool-call execution failures within a turn). Sticky for the rest of the
+    /// session, forcing auto mode onto Pro. In-memory only.
+    pub(super) cascade_escalated: bool,
+    /// Tool-call execution failures observed in the current turn; reset at the
+    /// start of each turn. Reaching the cascade threshold latches `cascade_escalated`.
+    pub(super) turn_tool_errors: u32,
 }
 
 pub(super) struct Persistence {
diff --git a/crates/deep-code-agent/src/runtime/tool_result.rs b/crates/deep-code-agent/src/runtime/tool_result.rs
index ead76e4..11f043b 100644
--- a/crates/deep-code-agent/src/runtime/tool_result.rs
+++ b/crates/deep-code-agent/src/runtime/tool_result.rs
@@ -18,6 +18,12 @@ use crate::tool::{
 pub(super) const CANCELLED_TOOL_RESULT: &str =
     "用户取消了本轮，该工具调用未执行 (cancelled by user)";
 
+/// Number of tool-call execution failures within a single turn that latches
+/// cascade escalation (Flash → Pro for the rest of the session). A threshold of
+/// two follows the "2–3 failed self-corrections, then escalate" rule of thumb
+/// without waiting so long that a whole turn is wasted flailing on the weak model.
+const CASCADE_ESCALATE_TOOL_ERRORS: u32 = 2;
+
 /// Whether "approve for the whole session" may be recorded for a tool.
 /// Shell-class tools are excluded: their risk lives in the per-call
 /// arguments, so a blanket session consent would be misleading.
@@ -378,6 +384,17 @@ impl<C: LlmClient + 'static> AgentRuntime<C> {
             if let Some(turn) = state.current_turn.as_mut() {
                 turn.tool_results.push(result.clone());
             }
+            // Cascade signal: a genuine execution failure means the model
+            // fumbled this tool call. Denials carry their own status and user
+            // cancellations carry a known marker, so neither counts. Enough
+            // fumbles in one turn latch escalation onto Pro (sticky for the
+            // session); the latch is read by the next turn's router.
+            if result.status == ToolResultStatus::Error && result.content != CANCELLED_TOOL_RESULT {
+                state.turn_tool_errors += 1;
+                if state.turn_tool_errors >= CASCADE_ESCALATE_TOOL_ERRORS {
+                    state.cascade_escalated = true;
+                }
+            }
         }
         // Persistence and SessionUpdated are flushed once per batch boundary
         // (see process_tool_batch / finish_cancelled_calls), not per call.
diff --git a/crates/deep-code-agent/src/runtime/turn_loop.rs b/crates/deep-code-agent/src/runtime/turn_loop.rs
index 35510d8..cd0b196 100644
--- a/crates/deep-code-agent/src/runtime/turn_loop.rs
+++ b/crates/deep-code-agent/src/runtime/turn_loop.rs
@@ -2,7 +2,7 @@ use std::collections::{HashMap, VecDeque};
 
 use tokio::sync::mpsc;
 
-use crate::auto_mode::RouteContext;
+use crate::auto_mode::{RouteContext, resolve_turn_route};
 use crate::client::LlmClient;
 use crate::compaction::{estimate_token_count, stable_prefix_fingerprint};
 use crate::event::AgentEvent;
@@ -28,20 +28,24 @@ impl<C: LlmClient + 'static> AgentRuntime<C> {
                 RouteContext {
                     context_tokens,
                     context_window: context_window_for_model(DEEPSEEK_V4_PRO),
+                    escalated: state.cascade_escalated,
                 },
             )
         };
         let turn_id = self.current_turn_id().await;
-        // Routing may consult the Flash classifier (a short network call); let a
-        // cancel during that wait abort the turn instead of stalling.
-        let mut route = tokio::select! {
-            biased;
-            () = cancel.cancelled() => {
-                self.finish_turn_cancelled(&turn_id, tx).await;
-                return;
-            }
-            route = self.route_turn(&user_prompt, route_ctx) => route,
-        };
+        if cancel.is_cancelled() {
+            self.finish_turn_cancelled(&turn_id, tx).await;
+            return;
+        }
+        // Routing is deterministic and local (no network): Flash-first unless a
+        // hard rule or difficulty keyword forces Pro, plus the cascade latch.
+        let mut route = resolve_turn_route(
+            &self.config,
+            &self.registry,
+            &user_prompt,
+            self.is_subagent,
+            route_ctx,
+        );
 
         if self.maybe_compact(&route.effective_model, tx).await {
             // compaction event already emitted; continue with trimmed history

From e98f1b93ffc009b4952c8cbf89bc2dc480631889 Mon Sep 17 00:00:00 2001
From: liwenkai <2020583117@qq.com>
Date: Tue, 30 Jun 2026 21:06:36 +0800
Subject: [PATCH 2/2] =?UTF-8?q?refactor(config):=20=E7=A7=BB=E9=99=A4?=
 =?UTF-8?q?=E9=9A=8F=20Flash=20=E8=B7=AF=E7=94=B1=E5=99=A8=E4=B8=80?=
 =?UTF-8?q?=E5=B9=B6=E5=BA=9F=E5=BC=83=E7=9A=84=20router=5F*=20=E9=85=8D?=
 =?UTF-8?q?=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flash 路由器退役后,以下配置已无任何引用,删除:
- AgentConfig 的 router_enabled / router_timeout_ms / router_context_turns 字段
- 对应的默认常量与 DEEP_CODE_AUTO_ROUTER_* 环境变量
- 配置文件的 [auto] 段(AutoSection)及其文件/env 解析与测试
---
 crates/deep-code-agent/src/config/layers.rs | 56 +--------------------
 crates/deep-code-agent/src/config/mod.rs    | 16 ------
 2 files changed, 1 insertion(+), 71 deletions(-)

diff --git a/crates/deep-code-agent/src/config/layers.rs b/crates/deep-code-agent/src/config/layers.rs
index 128884c..fc9f3d3 100644
--- a/crates/deep-code-agent/src/config/layers.rs
+++ b/crates/deep-code-agent/src/config/layers.rs
@@ -10,8 +10,7 @@ use std::time::Duration;
 use serde::{Deserialize, Serialize};
 
 use super::{
-    APPROVAL_AUTO_ALLOW_ENV, AUTO_COST_SAVING_ENV, AUTO_ROUTER_CONTEXT_TURNS_ENV,
-    AUTO_ROUTER_ENABLED_ENV, AUTO_ROUTER_TIMEOUT_MS_ENV, AgentConfig, COMPACTION_THRESHOLD_ENV,
+    APPROVAL_AUTO_ALLOW_ENV, AUTO_COST_SAVING_ENV, AgentConfig, COMPACTION_THRESHOLD_ENV,
     COST_CURRENCY_ENV, DEEPSEEK_API_KEY_ENV, MODEL_ENV, REASONING_EFFORT_ENV,
     STREAM_CHUNK_TIMEOUT_ENV, STREAM_MAX_BYTES_ENV, STREAM_MAX_RETRIES_ENV,
     STREAM_TOTAL_TIMEOUT_ENV,
@@ -172,15 +171,6 @@ struct ConfigFile {
     context: ContextSection,
     stream: StreamSection,
     approval: ApprovalSection,
-    auto: AutoSection,
-}
-
-#[derive(Debug, Default, Deserialize)]
-#[serde(default)]
-struct AutoSection {
-    router_enabled: Option<bool>,
-    router_timeout_ms: Option<u64>,
-    router_context_turns: Option<usize>,
 }
 
 #[derive(Debug, Default, Deserialize)]
@@ -355,16 +345,6 @@ fn apply_file_overlay(
                 .collect();
         }
     }
-
-    if let Some(value) = file.auto.router_enabled {
-        config.router_enabled = value;
-    }
-    if let Some(value) = file.auto.router_timeout_ms {
-        config.router_timeout_ms = value;
-    }
-    if let Some(value) = file.auto.router_context_turns {
-        config.router_context_turns = value;
-    }
 }
 
 pub(super) fn apply_env_overlay(
@@ -416,16 +396,6 @@ pub(super) fn apply_env_overlay(
             .filter(|rule| !rule.is_empty())
             .collect();
     }
-    if let Some(value) = lookup(AUTO_ROUTER_ENABLED_ENV) {
-        config.router_enabled = matches!(value.trim(), "1" | "true" | "yes" | "on");
-    }
-    if let Some(value) = lookup(AUTO_ROUTER_TIMEOUT_MS_ENV).and_then(|value| value.parse().ok()) {
-        config.router_timeout_ms = value;
-    }
-    if let Some(value) = lookup(AUTO_ROUTER_CONTEXT_TURNS_ENV).and_then(|value| value.parse().ok())
-    {
-        config.router_context_turns = value;
-    }
 }
 
 #[cfg(unix)]
@@ -564,30 +534,6 @@ mod tests {
         assert_eq!(loaded.config.stream_chunk_timeout, Duration::from_secs(30));
     }
 
-    #[test]
-    fn auto_router_settings_from_file_and_env() {
-        let global_dir = tempfile::tempdir().unwrap();
-        let global = write_config(
-            global_dir.path(),
-            "[auto]\nrouter_enabled = false\nrouter_timeout_ms = 1500\nrouter_context_turns = 3\n",
-        );
-        let loaded = AgentConfig::load_with(Some(global), None, &no_env);
-        assert!(!loaded.config.router_enabled);
-        assert_eq!(loaded.config.router_timeout_ms, 1500);
-        assert_eq!(loaded.config.router_context_turns, 3);
-
-        // Env overrides the file.
-        let env = |name: &str| match name {
-            super::AUTO_ROUTER_ENABLED_ENV => Some("true".to_string()),
-            super::AUTO_ROUTER_TIMEOUT_MS_ENV => Some("2000".to_string()),
-            _ => None,
-        };
-        let global = write_config(global_dir.path(), "[auto]\nrouter_enabled = false\n");
-        let loaded = AgentConfig::load_with(Some(global), None, &env);
-        assert!(loaded.config.router_enabled);
-        assert_eq!(loaded.config.router_timeout_ms, 2000);
-    }
-
     #[test]
     fn invalid_toml_layer_is_skipped_with_warning_not_panic() {
         let global_dir = tempfile::tempdir().unwrap();
diff --git a/crates/deep-code-agent/src/config/mod.rs b/crates/deep-code-agent/src/config/mod.rs
index 5969530..962c82e 100644
--- a/crates/deep-code-agent/src/config/mod.rs
+++ b/crates/deep-code-agent/src/config/mod.rs
@@ -30,17 +30,11 @@ pub const STREAM_CHUNK_TIMEOUT_ENV: &str = "DEEP_CODE_STREAM_CHUNK_TIMEOUT_SECS"
 pub const STREAM_TOTAL_TIMEOUT_ENV: &str = "DEEP_CODE_STREAM_TOTAL_TIMEOUT_SECS";
 pub const STREAM_MAX_BYTES_ENV: &str = "DEEP_CODE_STREAM_MAX_BYTES";
 pub const APPROVAL_AUTO_ALLOW_ENV: &str = "DEEP_CODE_APPROVAL_AUTO_ALLOW";
-pub const AUTO_ROUTER_ENABLED_ENV: &str = "DEEP_CODE_AUTO_ROUTER_ENABLED";
-pub const AUTO_ROUTER_TIMEOUT_MS_ENV: &str = "DEEP_CODE_AUTO_ROUTER_TIMEOUT_MS";
-pub const AUTO_ROUTER_CONTEXT_TURNS_ENV: &str = "DEEP_CODE_AUTO_ROUTER_CONTEXT_TURNS";
 
 pub const DEFAULT_STREAM_MAX_RETRIES: u32 = 3;
 pub const DEFAULT_STREAM_CHUNK_TIMEOUT_SECS: u64 = 300;
 pub const DEFAULT_STREAM_TOTAL_TIMEOUT_SECS: u64 = 900;
 pub const DEFAULT_STREAM_MAX_BYTES: u64 = 50 * 1024 * 1024;
-pub const DEFAULT_AUTO_ROUTER_ENABLED: bool = true;
-pub const DEFAULT_AUTO_ROUTER_TIMEOUT_MS: u64 = 2500;
-pub const DEFAULT_AUTO_ROUTER_CONTEXT_TURNS: usize = 6;
 
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct AgentConfig {
@@ -65,13 +59,6 @@ pub struct AgentConfig {
     /// these run without prompting. Only env and the global config file may
     /// set this — project files are ignored (a repo must not disarm gates).
     pub approval_auto_allow: Vec<String>,
-    /// Consult the Flash classifier for ambiguous auto-mode turns. When false,
-    /// auto mode is the pure heuristic (no extra model call).
-    pub router_enabled: bool,
-    /// Hard timeout for the Flash router round-trip, in milliseconds.
-    pub router_timeout_ms: u64,
-    /// Recent turns of context handed to the Flash router.
-    pub router_context_turns: usize,
 }
 
 impl Default for AgentConfig {
@@ -103,9 +90,6 @@ impl AgentConfig {
             stream_total_timeout: Duration::from_secs(DEFAULT_STREAM_TOTAL_TIMEOUT_SECS),
             stream_max_bytes: DEFAULT_STREAM_MAX_BYTES,
             approval_auto_allow: Vec::new(),
-            router_enabled: DEFAULT_AUTO_ROUTER_ENABLED,
-            router_timeout_ms: DEFAULT_AUTO_ROUTER_TIMEOUT_MS,
-            router_context_turns: DEFAULT_AUTO_ROUTER_CONTEXT_TURNS,
         }
     }