From d7bd55384176c10bd553a7ec7b7c35bd41874cd3 Mon Sep 17 00:00:00 2001 From: liwenkai <2020583117@qq.com> Date: Tue, 30 Jun 2026 20:55:47 +0800 Subject: [PATCH 1/2] =?UTF-8?q?refactor(auto):=20Flash-first=20=E7=BA=A7?= =?UTF-8?q?=E8=81=94=E8=B7=AF=E7=94=B1,=E7=A7=BB=E9=99=A4=E9=95=BF?= =?UTF-8?q?=E5=BA=A6=E7=8C=9C=E6=B5=8B=E4=B8=8E=20Flash=20=E8=B7=AF?= =?UTF-8?q?=E7=94=B1=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit auto 模式不再靠提示长度或 Flash 分类器预判难度。除上下文压力、 debug/重构/架构/安全等明显难关键词直接走 Pro 外,其余一律 Flash-first; 当一轮内工具调用执行失败累计 ≥2 次(Flash 明显搞不定)时,本会话 粘性升级到 Pro(RouteSource::Cascade),经 telemetry 的 route_source / route_reason 透出以便观测。 - 删除 runtime/routing.rs 整个 Flash LLM 路由器子系统及其测试 - 删除 auto_mode 的 ModelClass/Ambiguous 灰区与 classify_by_length 长度启发式 - classify_model 改为直接返回 (model, reason, source) - run_loop 同步直调 resolve_turn_route(不再 select! 等网络路由) --- crates/deep-code-agent/src/auto_mode.rs | 169 ++++----- crates/deep-code-agent/src/runtime.rs | 6 +- .../src/runtime/integration_tests.rs | 142 ------- .../src/runtime/persistence.rs | 2 + crates/deep-code-agent/src/runtime/routing.rs | 353 ------------------ crates/deep-code-agent/src/runtime/state.rs | 7 + .../src/runtime/tool_result.rs | 17 + .../deep-code-agent/src/runtime/turn_loop.rs | 26 +- 8 files changed, 125 insertions(+), 597 deletions(-) delete mode 100644 crates/deep-code-agent/src/runtime/routing.rs diff --git a/crates/deep-code-agent/src/auto_mode.rs b/crates/deep-code-agent/src/auto_mode.rs index 6397ea0..f858aa6 100644 --- a/crates/deep-code-agent/src/auto_mode.rs +++ b/crates/deep-code-agent/src/auto_mode.rs @@ -8,18 +8,17 @@ use crate::task_class::{TaskWeight, classify_keyword}; /// Force the strong model once the session fills this fraction of the context /// window — long contexts need Pro regardless of how the prompt reads. const CONTEXT_PRESSURE_PERCENT: u64 = 70; -/// Prompts shorter than this (and free of difficulty keywords) default to Flash. 
-const SHORT_PROMPT_CHARS: usize = 100; /// What decided a turn's route, for explainable telemetry. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum RouteSource { /// A non-negotiable rule (sub-agent, fixed model, context pressure). HardRule, - /// The keyword/length heuristic. + /// The keyword heuristic (difficulty keyword → Pro), else Flash-first. Heuristic, - /// The Flash classifier resolved an otherwise-ambiguous turn. - FlashRouter, + /// Cascade escalation: Flash visibly struggled earlier this session, so + /// later turns run on Pro until the session ends. + Cascade, } impl RouteSource { @@ -28,7 +27,7 @@ impl RouteSource { match self { Self::HardRule => "hard-rule", Self::Heuristic => "heuristic", - Self::FlashRouter => "flash-router", + Self::Cascade => "cascade", } } } @@ -40,6 +39,9 @@ pub struct RouteContext { pub context_tokens: u32, /// Context window of the model family (0 disables the pressure rule). pub context_window: u32, + /// Cascade escalation latch: Flash already struggled (repeated tool-call + /// failures) earlier this session, so force Pro for the rest of it. + pub escalated: bool, } impl RouteContext { @@ -120,21 +122,6 @@ pub fn api_fallback_model(route: &TurnRoute) -> Option<&'static str> { } } -/// A model-selection outcome from the deterministic heuristic. -/// -/// `Ambiguous` is the gray zone the Phase-2 Flash router resolves; callers -/// without a router fall back to Flash. -pub(crate) enum ModelClass { - Decisive { - model: String, - reason: String, - source: RouteSource, - }, - Ambiguous { - reason: String, - }, -} - /// Resolve the concrete model + reasoning effort for one user turn. 
#[must_use] pub fn resolve_turn_route( @@ -168,21 +155,7 @@ pub fn resolve_turn_route( } let (effective_model, route_reason, source) = - match classify_model(user_prompt, &ctx, config.auto_cost_saving) { - ModelClass::Decisive { - model, - reason, - source, - } => (model, reason, source), - ModelClass::Ambiguous { reason } => { - // No router yet: the gray zone defaults to Flash. - ( - DEEPSEEK_V4_FLASH.to_string(), - reason, - RouteSource::Heuristic, - ) - } - }; + classify_model(user_prompt, &ctx, config.auto_cost_saving); // Effort and model both derive from `task_class`, so they stay coherent. let effort = config.reasoning_effort.resolve(is_subagent, user_prompt); @@ -211,68 +184,64 @@ pub fn select_auto_model(input: &str, cost_saving: bool) -> String { /// Model + human-readable reason for status surfaces (no session context). #[must_use] pub fn select_auto_model_with_reason(input: &str, cost_saving: bool) -> (String, String) { - match classify_model(input, &RouteContext::default(), cost_saving) { - ModelClass::Decisive { model, reason, .. } => (model, reason), - ModelClass::Ambiguous { reason } => (DEEPSEEK_V4_FLASH.to_string(), reason), - } + let (model, reason, _) = classify_model(input, &RouteContext::default(), cost_saving); + (model, reason) } -/// Deterministic model selection over the shared [`crate::task_class`] table. -/// Priority: context pressure → difficulty keyword → length, with the -/// 100‑to‑threshold gray zone left `Ambiguous` for the Flash router. -pub(crate) fn classify_model(input: &str, ctx: &RouteContext, cost_saving: bool) -> ModelClass { +/// Flash-first model selection over the shared [`crate::task_class`] table. +/// +/// Returns `(model, human-readable reason, source)`. Pro is forced only by hard +/// facts (cascade escalation, context pressure) or an explicit difficulty +/// keyword. 
Everything else starts on Flash — cascade escalation (driven by +/// observed tool-call failures) upgrades later turns when Flash actually +/// struggles, so we no longer guess difficulty from prompt length. +pub(crate) fn classify_model( + input: &str, + ctx: &RouteContext, + cost_saving: bool, +) -> (String, String, RouteSource) { + if ctx.escalated { + return ( + DEEPSEEK_V4_PRO.to_string(), + "级联升级:本会话内 Flash 工具调用反复失败,改用 Pro 接管".to_string(), + RouteSource::Cascade, + ); + } + if ctx.under_pressure() { - return ModelClass::Decisive { - model: DEEPSEEK_V4_PRO.to_string(), - reason: format!( + return ( + DEEPSEEK_V4_PRO.to_string(), + format!( "上下文占用约 {}%(≥{CONTEXT_PRESSURE_PERCENT}% 阈值),使用 Pro 处理长上下文", ctx.usage_percent() ), - source: RouteSource::HardRule, - }; + RouteSource::HardRule, + ); } match classify_keyword(input) { - Some((TaskWeight::Deep, keyword)) => ModelClass::Decisive { - model: DEEPSEEK_V4_PRO.to_string(), - reason: format!("命中调试/报错类关键词“{keyword}”,使用 Pro 配深推理"), - source: RouteSource::Heuristic, - }, - Some((TaskWeight::Heavy, keyword)) => ModelClass::Decisive { - model: DEEPSEEK_V4_PRO.to_string(), - reason: format!("命中复杂任务关键词“{keyword}”,使用 Pro 以获得更强推理和工具规划能力"), - source: RouteSource::Heuristic, - }, - Some((TaskWeight::Borderline, keyword)) if !cost_saving => ModelClass::Decisive { - model: DEEPSEEK_V4_PRO.to_string(), - reason: format!("任务包含“{keyword}”,且未开启成本优先,使用 Pro"), - source: RouteSource::Heuristic, - }, - // Borderline under cost-saving and Light keywords fall through to the - // length check below (Light shouldn't force Flash on a long prompt). 
- _ => classify_by_length(input, cost_saving), - } -} - -fn classify_by_length(input: &str, cost_saving: bool) -> ModelClass { - let len = input.chars().count(); - if len < SHORT_PROMPT_CHARS { - return ModelClass::Decisive { - model: DEEPSEEK_V4_FLASH.to_string(), - reason: "短提示优先使用 Flash,降低延迟和成本".to_string(), - source: RouteSource::Heuristic, - }; - } - let long_threshold = if cost_saving { 1_000 } else { 500 }; - if len > long_threshold { - return ModelClass::Decisive { - model: DEEPSEEK_V4_PRO.to_string(), - reason: format!("输入长度 {len} 超过阈值 {long_threshold},使用 Pro 处理长上下文"), - source: RouteSource::Heuristic, - }; - } - ModelClass::Ambiguous { - reason: format!("中等长度({len} 字)且无明确难度信号,待进一步判定"), + Some((TaskWeight::Deep, keyword)) => ( + DEEPSEEK_V4_PRO.to_string(), + format!("命中调试/报错类关键词“{keyword}”,使用 Pro 配深推理"), + RouteSource::Heuristic, + ), + Some((TaskWeight::Heavy, keyword)) => ( + DEEPSEEK_V4_PRO.to_string(), + format!("命中复杂任务关键词“{keyword}”,使用 Pro 以获得更强推理和工具规划能力"), + RouteSource::Heuristic, + ), + Some((TaskWeight::Borderline, keyword)) if !cost_saving => ( + DEEPSEEK_V4_PRO.to_string(), + format!("任务包含“{keyword}”,且未开启成本优先,使用 Pro"), + RouteSource::Heuristic, + ), + // Everything else (Light keywords, Borderline under cost-saving, no + // keyword) starts on Flash; cascade upgrades it if Flash struggles. + _ => ( + DEEPSEEK_V4_FLASH.to_string(), + "默认先用 Flash(更快更省);若工具调用反复失败,级联会升级到 Pro".to_string(), + RouteSource::Heuristic, + ), } } @@ -409,12 +378,32 @@ mod tests { let ctx = RouteContext { context_tokens: 800_000, context_window: 1_000_000, + escalated: false, }; let route = resolve_turn_route(&config, &ModelRegistry::default(), "hi", false, ctx); assert_eq!(route.effective_model, DEEPSEEK_V4_PRO); assert_eq!(route.source, RouteSource::HardRule); } + #[test] + fn cascade_escalation_forces_pro_on_trivial_prompt() { + // Once Flash has struggled this session, even a short trivial prompt + // that would normally be Flash routes to Pro, tagged as Cascade. 
+ let config = AgentConfig { + model: AUTO_MODEL.to_string(), + reasoning_effort: ReasoningEffortSetting::Auto, + ..AgentConfig::default() + }; + let ctx = RouteContext { + escalated: true, + ..RouteContext::default() + }; + let route = resolve_turn_route(&config, &ModelRegistry::default(), "hi", false, ctx); + assert_eq!(route.effective_model, DEEPSEEK_V4_PRO); + assert_eq!(route.source, RouteSource::Cascade); + assert!(route.route_reason.contains("级联升级")); + } + #[test] fn api_fallback_only_for_auto_pro() { let route = TurnRoute { @@ -440,7 +429,7 @@ mod tests { let (model, reason) = select_auto_model_with_reason("hi", false); assert_eq!(model, DEEPSEEK_V4_FLASH); - assert!(reason.contains("短提示")); + assert!(reason.contains("Flash")); } #[test] diff --git a/crates/deep-code-agent/src/runtime.rs b/crates/deep-code-agent/src/runtime.rs index 9500ccd..700b33b 100644 --- a/crates/deep-code-agent/src/runtime.rs +++ b/crates/deep-code-agent/src/runtime.rs @@ -19,7 +19,6 @@ mod event; mod handle; mod persistence; mod persistence_actor; -mod routing; mod state; mod streaming; mod telemetry; @@ -145,6 +144,8 @@ impl AgentRuntime { cancel: CancellationToken::new(), session_approved: Default::default(), session_trusted_shell_prefixes: Default::default(), + cascade_escalated: false, + turn_tool_errors: 0, })), checkpoints: None, workspace: None, @@ -189,6 +190,9 @@ impl AgentRuntime { state.current_prompt = Some(prompt); state.current_turn_id = Some(turn_id); state.cancel = CancellationToken::new(); + // Per-turn struggle counter resets; the `cascade_escalated` latch + // intentionally persists for the rest of the session. 
+ state.turn_tool_errors = 0; } self.persist().await; } diff --git a/crates/deep-code-agent/src/runtime/integration_tests.rs b/crates/deep-code-agent/src/runtime/integration_tests.rs index e9b58aa..4ffdd39 100644 --- a/crates/deep-code-agent/src/runtime/integration_tests.rs +++ b/crates/deep-code-agent/src/runtime/integration_tests.rs @@ -1937,145 +1937,3 @@ async fn cancel_turn_when_idle_is_silent_noop() { Some(RuntimeEvent::TurnFinished { .. }) )); } - -const AMBIGUOUS_PROMPT: &str = "Walk me through the overall layout of this project and what the different \ -parts are generally for, so I can get my bearings before we begin making changes together."; - -fn auto_runtime(client: ScriptedClient) -> AgentRuntime { - let config = AgentConfig { - model: crate::model_registry::AUTO_MODEL.to_string(), - reasoning_effort: crate::reasoning::ReasoningEffortSetting::Auto, - ..AgentConfig::builtin() - }; - AgentRuntime::with_config(client, ToolRegistry::default(), config) -} - -#[tokio::test] -async fn flash_router_resolves_ambiguous_turn() { - // The ambiguous gray zone escalates to the Flash classifier, which here - // returns Pro/max; the route is sourced from the router. - let client = ScriptedClient::new(vec![vec![ - AgentEvent::TextDelta { - text: r#"{"model":"pro","thinking":"max"}"#.to_string(), - }, - AgentEvent::Done { usage: None }, - ]]); - let runtime = auto_runtime(client); - - let route = runtime - .route_turn(AMBIGUOUS_PROMPT, crate::auto_mode::RouteContext::default()) - .await; - - assert_eq!(route.source, crate::auto_mode::RouteSource::FlashRouter); - assert_eq!( - route.effective_model, - crate::model_registry::DEEPSEEK_V4_PRO - ); - assert_eq!( - route.effective_effort, - crate::reasoning::ReasoningEffort::Max - ); -} - -#[tokio::test] -async fn flash_router_assembles_prefix_completion() { - // /beta prefix completion: the model returns only the JSON continuation - // (the assistant reply was seeded with `{"model":"`). 
- let client = ScriptedClient::new(vec![vec![ - AgentEvent::TextDelta { - text: "pro\",\"thinking\":\"high\"".to_string(), - }, - AgentEvent::Done { usage: None }, - ]]); - let runtime = auto_runtime(client); - - let route = runtime - .route_turn(AMBIGUOUS_PROMPT, crate::auto_mode::RouteContext::default()) - .await; - - assert_eq!(route.source, crate::auto_mode::RouteSource::FlashRouter); - assert_eq!( - route.effective_model, - crate::model_registry::DEEPSEEK_V4_PRO - ); - assert_eq!( - route.effective_effort, - crate::reasoning::ReasoningEffort::High - ); -} - -#[tokio::test] -async fn decisive_turn_skips_flash_router() { - // A keyword-decisive prompt must not consult the router at all. - let client = ScriptedClient::new(vec![vec![ - AgentEvent::TextDelta { - text: r#"{"model":"flash","thinking":"off"}"#.to_string(), - }, - AgentEvent::Done { usage: None }, - ]]); - let runtime = auto_runtime(client); - - let route = runtime - .route_turn( - "refactor the authentication module", - crate::auto_mode::RouteContext::default(), - ) - .await; - - assert_eq!(route.source, crate::auto_mode::RouteSource::Heuristic); - assert_eq!( - route.effective_model, - crate::model_registry::DEEPSEEK_V4_PRO - ); -} - -#[tokio::test] -async fn router_disabled_uses_heuristic_only() { - // With the router off, the ambiguous gray zone stays on the heuristic - // (Flash fallback) and never consults the classifier. 
- let client = ScriptedClient::new(vec![vec![ - AgentEvent::TextDelta { - text: r#"{"model":"pro","thinking":"max"}"#.to_string(), - }, - AgentEvent::Done { usage: None }, - ]]); - let config = AgentConfig { - model: crate::model_registry::AUTO_MODEL.to_string(), - reasoning_effort: crate::reasoning::ReasoningEffortSetting::Auto, - router_enabled: false, - ..AgentConfig::builtin() - }; - let runtime = AgentRuntime::with_config(client, ToolRegistry::default(), config); - - let route = runtime - .route_turn(AMBIGUOUS_PROMPT, crate::auto_mode::RouteContext::default()) - .await; - - assert_eq!(route.source, crate::auto_mode::RouteSource::Heuristic); - assert_eq!( - route.effective_model, - crate::model_registry::DEEPSEEK_V4_FLASH - ); -} - -#[tokio::test] -async fn flash_router_garbage_falls_back_to_heuristic() { - // Unparseable classifier output must not break routing — fall back. - let client = ScriptedClient::new(vec![vec![ - AgentEvent::TextDelta { - text: "I cannot decide".to_string(), - }, - AgentEvent::Done { usage: None }, - ]]); - let runtime = auto_runtime(client); - - let route = runtime - .route_turn(AMBIGUOUS_PROMPT, crate::auto_mode::RouteContext::default()) - .await; - - assert_eq!(route.source, crate::auto_mode::RouteSource::Heuristic); - assert_eq!( - route.effective_model, - crate::model_registry::DEEPSEEK_V4_FLASH - ); -} diff --git a/crates/deep-code-agent/src/runtime/persistence.rs b/crates/deep-code-agent/src/runtime/persistence.rs index 7ee0288..981fbf5 100644 --- a/crates/deep-code-agent/src/runtime/persistence.rs +++ b/crates/deep-code-agent/src/runtime/persistence.rs @@ -78,6 +78,8 @@ impl AgentRuntime { cancel: tokio_util::sync::CancellationToken::new(), session_approved: Default::default(), session_trusted_shell_prefixes: Default::default(), + cascade_escalated: false, + turn_tool_errors: 0, })), checkpoints: None, workspace: Some(workspace.clone()), diff --git a/crates/deep-code-agent/src/runtime/routing.rs 
b/crates/deep-code-agent/src/runtime/routing.rs deleted file mode 100644 index 4852e83..0000000 --- a/crates/deep-code-agent/src/runtime/routing.rs +++ /dev/null @@ -1,353 +0,0 @@ -//! Phase-2 Flash router: when the deterministic heuristic -//! ([`crate::auto_mode::classify_model`]) can't decide a turn, ask a cheap -//! `deepseek-v4-flash` thinking-off classifier — leveraging DeepSeek's cheap -//! Flash tier — to pick the model and thinking level from the recent context. -//! Bounded and best-effort: it only fires on the ambiguous gray zone, has a -//! hard timeout, and silently falls back to the heuristic on any failure. - -use std::time::Duration; - -use futures_util::StreamExt; -use serde::Deserialize; -use tokio::time::timeout; - -use crate::auto_mode::{ - ModelClass, RouteContext, RouteSource, TurnRoute, clamp_effort_to_model, classify_model, - resolve_turn_route, -}; -use crate::client::LlmClient; -use crate::event::AgentEvent; -use crate::message::{Message, Role}; -use crate::model::ChatRequest; -use crate::model_registry::{DEEPSEEK_V4_FLASH, DEEPSEEK_V4_PRO}; -use crate::reasoning::ReasoningEffort; -use crate::runtime::AgentRuntime; - -/// Per-message truncation for the context block. -const ROUTER_CONTEXT_CHARS: usize = 900; -/// Truncation for the latest prompt handed to the classifier. -const ROUTER_PROMPT_CHARS: usize = 4000; -/// Assistant prefix seed for DeepSeek `/beta` prefix completion: forces the -/// classifier's reply to begin a JSON object whose first key is `model`. -const ROUTER_JSON_PREFIX: &str = "{\"model\":\""; - -impl AgentRuntime { - /// Resolve a turn's route, escalating ambiguous turns to the Flash router. - pub(super) async fn route_turn(&self, user_prompt: &str, ctx: RouteContext) -> TurnRoute { - let heuristic = || { - resolve_turn_route( - &self.config, - &self.registry, - user_prompt, - self.is_subagent, - ctx, - ) - }; - - // Only the auto + online + parent path consults the router. 
- if !self.config.router_enabled - || !self.config.auto_model_enabled() - || self.is_subagent - || self.client.provider_name() == "echo" - { - return heuristic(); - } - if !matches!( - classify_model(user_prompt, &ctx, self.config.auto_cost_saving), - ModelClass::Ambiguous { .. } - ) { - return heuristic(); - } - - match self.flash_route(user_prompt).await { - Some(route) => route, - None => heuristic(), - } - } - - async fn flash_route(&self, user_prompt: &str) -> Option { - let mut messages = self.router_messages(user_prompt).await; - // On `/beta`, seed the assistant reply so the classifier can only emit a - // parseable JSON object, and stop at the closing brace. Other endpoints - // free-form and are parsed leniently as before. - let beta = self.config.uses_beta_endpoint(); - if beta { - messages.push(Message::assistant_prefix(ROUTER_JSON_PREFIX)); - } - let mut request = - ChatRequest::streaming(DEEPSEEK_V4_FLASH, messages).with_reasoning_effort("off"); - if beta { - request = request.with_stop(vec!["}".to_string()]); - } - request.max_tokens = Some(32); - - let router_timeout = Duration::from_millis(self.config.router_timeout_ms); - let completion = timeout(router_timeout, collect_text(self.client.as_ref(), request)) - .await - .ok()??; - let json = if beta { - assemble_prefix_json(&completion) - } else { - completion - }; - let decision = parse_router_decision(&json)?; - Some(self.route_from_router(decision)) - } - - async fn router_messages(&self, user_prompt: &str) -> Vec { - let context = { - let state = self.state.lock().await; - recent_context( - state.session.messages(), - self.config.router_context_turns, - ROUTER_CONTEXT_CHARS, - ) - }; - let prompt = truncate(user_prompt, ROUTER_PROMPT_CHARS); - let user = if context.is_empty() { - format!("Latest request:\n{prompt}") - } else { - format!("Recent context (oldest first):\n{context}\n\nLatest request:\n{prompt}") - }; - vec![ - Message::system(router_system_prompt(self.config.auto_cost_saving)), - 
Message::user(user), - ] - } - - fn route_from_router(&self, decision: RouterDecision) -> TurnRoute { - let model = if decision.is_pro() { - DEEPSEEK_V4_PRO - } else { - DEEPSEEK_V4_FLASH - } - .to_string(); - - // Honor an explicit reasoning setting; only let the router pick the tier - // when effort is on Auto. - let auto_effort = self.config.reasoning_effort.is_auto(); - let effort = if auto_effort { - decision.thinking_effort() - } else { - self.config.reasoning_effort.resolve(self.is_subagent, "") - }; - let effective_effort = clamp_effort_to_model(&model, effort); - - let short_model = if model == DEEPSEEK_V4_PRO { - "Pro" - } else { - "Flash" - }; - TurnRoute { - requested_model: self.config.model.clone(), - effective_model: model, - auto_model: true, - reasoning_setting: self.config.reasoning_effort, - effective_effort, - auto_effort, - used_model_fallback: false, - route_reason: format!( - "Flash 路由判定:中等难度任务交由 Flash 分类器,选择 {short_model}" - ), - fallback_reason: None, - source: RouteSource::FlashRouter, - } - } -} - -fn router_system_prompt(cost_saving: bool) -> String { - let mut prompt = "You are deep-code's auto-routing classifier. Reply with ONLY compact JSON: \ -{\"model\":\"flash|pro\",\"thinking\":\"off|low|high|max\"}. \ -Use flash for trivial, conversational, status, or single-step work; \ -use pro for coding, debugging, multi-step, multi-file, high-risk, tool-heavy, or ambiguous work that benefits from deeper reasoning. \ -Use thinking off only for trivial no-tool answers, low for simple lookups, high for ordinary reasoning, and max for agentic/coding/debugging/architecture/security work." 
- .to_string(); - if cost_saving { - prompt.push_str( - " Cost-saving mode is ON: resolve ambiguous cases in favour of flash, not pro.", - ); - } - prompt -} - -async fn collect_text(client: &C, request: ChatRequest) -> Option { - let mut stream = client.stream_chat(request).await.ok()?; - let mut text = String::new(); - while let Some(event) = stream.next().await { - match event { - Ok(AgentEvent::TextDelta { text: delta }) => text.push_str(&delta), - Ok(AgentEvent::Done { .. }) => break, - Ok(_) => {} - Err(_) => return None, - } - } - let trimmed = text.trim(); - if trimmed.is_empty() { - None - } else { - Some(trimmed.to_string()) - } -} - -#[derive(Debug, Deserialize)] -struct RouterDecision { - model: String, - #[serde(default)] - thinking: Option, -} - -impl RouterDecision { - fn is_pro(&self) -> bool { - let model = self.model.trim().to_ascii_lowercase(); - model.contains("pro") - } - - fn thinking_effort(&self) -> ReasoningEffort { - match self - .thinking - .as_deref() - .map(str::trim) - .map(str::to_ascii_lowercase) - .as_deref() - { - Some("off") => ReasoningEffort::Off, - Some("low") => ReasoningEffort::Low, - Some("medium" | "med") => ReasoningEffort::Medium, - Some("max") => ReasoningEffort::Max, - _ => ReasoningEffort::High, - } - } -} - -/// Reassemble a JSON object from a prefix-completion response. The model -/// returns only the continuation of [`ROUTER_JSON_PREFIX`], so prepend the seed -/// (unless the reply echoed it) and close the brace if `stop` consumed it. -fn assemble_prefix_json(completion: &str) -> String { - let body = completion.trim(); - let combined = if body.starts_with('{') { - body.to_string() - } else { - format!("{ROUTER_JSON_PREFIX}{body}") - }; - if combined.contains('}') { - combined - } else { - format!("{combined}}}") - } -} - -/// Extract and parse the first JSON object from the classifier's reply, which -/// may carry stray prose or code fences around it. 
-fn parse_router_decision(raw: &str) -> Option { - let start = raw.find('{')?; - let end = raw[start..].find('}')? + start; - let json = &raw[start..=end]; - let decision: RouterDecision = serde_json::from_str(json).ok()?; - let model = decision.model.trim().to_ascii_lowercase(); - // Reject hallucinated models so we fall back to the heuristic. - if model.contains("pro") || model.contains("flash") { - Some(decision) - } else { - None - } -} - -fn recent_context(messages: &[Message], turns: usize, max_chars: usize) -> String { - let relevant: Vec<&Message> = messages - .iter() - .filter(|message| message.role != Role::System) - .collect(); - // Drop the trailing current user prompt — it's sent separately. - let end = relevant.len().saturating_sub(1); - let start = end.saturating_sub(turns); - relevant[start..end] - .iter() - .filter(|message| !message.content.trim().is_empty()) - .map(|message| { - let role = match message.role { - Role::User => "user", - Role::Assistant => "assistant", - Role::Tool => "tool", - Role::System => "system", - }; - format!("{role}: {}", truncate(&message.content, max_chars)) - }) - .collect::>() - .join("\n") -} - -fn truncate(value: &str, max_chars: usize) -> String { - if value.chars().count() <= max_chars { - value.to_string() - } else { - let kept: String = value.chars().take(max_chars).collect(); - format!("{kept}…") - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn parses_plain_json() { - let decision = parse_router_decision(r#"{"model":"pro","thinking":"max"}"#).unwrap(); - assert!(decision.is_pro()); - assert_eq!(decision.thinking_effort(), ReasoningEffort::Max); - } - - #[test] - fn parses_json_with_surrounding_prose() { - let decision = parse_router_decision( - "Here you go:\n```json\n{\"model\": \"flash\", \"thinking\": \"low\"}\n```", - ) - .unwrap(); - assert!(!decision.is_pro()); - assert_eq!(decision.thinking_effort(), ReasoningEffort::Low); - } - - #[test] - fn rejects_hallucinated_model() { - 
assert!(parse_router_decision(r#"{"model":"gpt-9","thinking":"high"}"#).is_none()); - } - - #[test] - fn assembles_completion_only_prefix_response() { - // Beta prefix completion returns only the continuation of `{"model":"`. - let decision = - parse_router_decision(&assemble_prefix_json("pro\",\"thinking\":\"max\"")).unwrap(); - assert!(decision.is_pro()); - assert_eq!(decision.thinking_effort(), ReasoningEffort::Max); - } - - #[test] - fn assemble_passes_through_echoed_full_json() { - let decision = parse_router_decision(&assemble_prefix_json( - "{\"model\":\"flash\",\"thinking\":\"low\"}", - )) - .unwrap(); - assert!(!decision.is_pro()); - assert_eq!(decision.thinking_effort(), ReasoningEffort::Low); - } - - #[test] - fn missing_thinking_defaults_to_high() { - let decision = parse_router_decision(r#"{"model":"pro"}"#).unwrap(); - assert_eq!(decision.thinking_effort(), ReasoningEffort::High); - } - - #[test] - fn recent_context_drops_current_prompt_and_system() { - let messages = vec![ - Message::system("sys"), - Message::user("first"), - Message::assistant("reply"), - Message::user("current prompt"), - ]; - let context = recent_context(&messages, 6, 900); - assert!(context.contains("user: first")); - assert!(context.contains("assistant: reply")); - assert!(!context.contains("current prompt")); - assert!(!context.contains("sys")); - } -} diff --git a/crates/deep-code-agent/src/runtime/state.rs b/crates/deep-code-agent/src/runtime/state.rs index d892620..8a66004 100644 --- a/crates/deep-code-agent/src/runtime/state.rs +++ b/crates/deep-code-agent/src/runtime/state.rs @@ -39,6 +39,13 @@ pub(super) struct RuntimeState { /// session-approvable by tool name, so this trusts at command granularity. /// In-memory only; compound commands are never matched (they keep prompting). pub(super) session_trusted_shell_prefixes: HashSet, + /// Cascade routing latch: set once Flash visibly struggles (repeated + /// tool-call execution failures within a turn). 
Sticky for the rest of the + /// session, forcing auto mode onto Pro. In-memory only. + pub(super) cascade_escalated: bool, + /// Tool-call execution failures observed in the current turn; reset at the + /// start of each turn. Crossing the cascade threshold latches `cascade_escalated`. + pub(super) turn_tool_errors: u32, } pub(super) struct Persistence { diff --git a/crates/deep-code-agent/src/runtime/tool_result.rs b/crates/deep-code-agent/src/runtime/tool_result.rs index ead76e4..11f043b 100644 --- a/crates/deep-code-agent/src/runtime/tool_result.rs +++ b/crates/deep-code-agent/src/runtime/tool_result.rs @@ -18,6 +18,12 @@ use crate::tool::{ pub(super) const CANCELLED_TOOL_RESULT: &str = "用户取消了本轮,该工具调用未执行 (cancelled by user)"; +/// Tool-call execution failures within a single turn that latch cascade +/// escalation (Flash → Pro for the rest of the session). Two mirrors the +/// "2–3 failed self-corrections, then escalate" rule of thumb without waiting +/// so long that a whole turn is wasted flailing on the weak model. +const CASCADE_ESCALATE_TOOL_ERRORS: u32 = 2; + /// Whether "approve for the whole session" may be recorded for a tool. /// Shell-class tools are excluded: their risk lives in the per-call /// arguments, so a blanket session consent would be misleading. @@ -378,6 +384,17 @@ impl AgentRuntime { if let Some(turn) = state.current_turn.as_mut() { turn.tool_results.push(result.clone()); } + // Cascade signal: a genuine execution failure means the model + // fumbled this tool call. Denials carry their own status and user + // cancellations carry a known marker, so neither counts. Enough + // fumbles in one turn latch escalation onto Pro (sticky for the + // session); the latch is read by the next turn's router. 
+ if result.status == ToolResultStatus::Error && result.content != CANCELLED_TOOL_RESULT { + state.turn_tool_errors += 1; + if state.turn_tool_errors >= CASCADE_ESCALATE_TOOL_ERRORS { + state.cascade_escalated = true; + } + } } // Persistence and SessionUpdated are flushed once per batch boundary // (see process_tool_batch / finish_cancelled_calls), not per call. diff --git a/crates/deep-code-agent/src/runtime/turn_loop.rs b/crates/deep-code-agent/src/runtime/turn_loop.rs index 35510d8..cd0b196 100644 --- a/crates/deep-code-agent/src/runtime/turn_loop.rs +++ b/crates/deep-code-agent/src/runtime/turn_loop.rs @@ -2,7 +2,7 @@ use std::collections::{HashMap, VecDeque}; use tokio::sync::mpsc; -use crate::auto_mode::RouteContext; +use crate::auto_mode::{RouteContext, resolve_turn_route}; use crate::client::LlmClient; use crate::compaction::{estimate_token_count, stable_prefix_fingerprint}; use crate::event::AgentEvent; @@ -28,20 +28,24 @@ impl AgentRuntime { RouteContext { context_tokens, context_window: context_window_for_model(DEEPSEEK_V4_PRO), + escalated: state.cascade_escalated, }, ) }; let turn_id = self.current_turn_id().await; - // Routing may consult the Flash classifier (a short network call); let a - // cancel during that wait abort the turn instead of stalling. - let mut route = tokio::select! { - biased; - () = cancel.cancelled() => { - self.finish_turn_cancelled(&turn_id, tx).await; - return; - } - route = self.route_turn(&user_prompt, route_ctx) => route, - }; + if cancel.is_cancelled() { + self.finish_turn_cancelled(&turn_id, tx).await; + return; + } + // Routing is deterministic and local (no network): Flash-first unless a + // hard rule or difficulty keyword forces Pro, plus the cascade latch. 
+ let mut route = resolve_turn_route( + &self.config, + &self.registry, + &user_prompt, + self.is_subagent, + route_ctx, + ); if self.maybe_compact(&route.effective_model, tx).await { // compaction event already emitted; continue with trimmed history From e98f1b93ffc009b4952c8cbf89bc2dc480631889 Mon Sep 17 00:00:00 2001 From: liwenkai <2020583117@qq.com> Date: Tue, 30 Jun 2026 21:06:36 +0800 Subject: [PATCH 2/2] =?UTF-8?q?refactor(config):=20=E7=A7=BB=E9=99=A4?= =?UTF-8?q?=E9=9A=8F=20Flash=20=E8=B7=AF=E7=94=B1=E5=99=A8=E4=B8=80?= =?UTF-8?q?=E5=B9=B6=E5=BA=9F=E5=BC=83=E7=9A=84=20router=5F*=20=E9=85=8D?= =?UTF-8?q?=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flash 路由器退役后,以下配置已无任何引用,删除: - AgentConfig 的 router_enabled / router_timeout_ms / router_context_turns 字段 - 对应的默认常量与 DEEP_CODE_AUTO_ROUTER_* 环境变量 - 配置文件的 [auto] 段(AutoSection)及其文件/env 解析与测试 --- crates/deep-code-agent/src/config/layers.rs | 56 +-------------------- crates/deep-code-agent/src/config/mod.rs | 16 ------ 2 files changed, 1 insertion(+), 71 deletions(-) diff --git a/crates/deep-code-agent/src/config/layers.rs b/crates/deep-code-agent/src/config/layers.rs index 128884c..fc9f3d3 100644 --- a/crates/deep-code-agent/src/config/layers.rs +++ b/crates/deep-code-agent/src/config/layers.rs @@ -10,8 +10,7 @@ use std::time::Duration; use serde::{Deserialize, Serialize}; use super::{ - APPROVAL_AUTO_ALLOW_ENV, AUTO_COST_SAVING_ENV, AUTO_ROUTER_CONTEXT_TURNS_ENV, - AUTO_ROUTER_ENABLED_ENV, AUTO_ROUTER_TIMEOUT_MS_ENV, AgentConfig, COMPACTION_THRESHOLD_ENV, + APPROVAL_AUTO_ALLOW_ENV, AUTO_COST_SAVING_ENV, AgentConfig, COMPACTION_THRESHOLD_ENV, COST_CURRENCY_ENV, DEEPSEEK_API_KEY_ENV, MODEL_ENV, REASONING_EFFORT_ENV, STREAM_CHUNK_TIMEOUT_ENV, STREAM_MAX_BYTES_ENV, STREAM_MAX_RETRIES_ENV, STREAM_TOTAL_TIMEOUT_ENV, @@ -172,15 +171,6 @@ struct ConfigFile { context: ContextSection, stream: StreamSection, approval: ApprovalSection, - auto: AutoSection, -} - 
-#[derive(Debug, Default, Deserialize)] -#[serde(default)] -struct AutoSection { - router_enabled: Option<bool>, - router_timeout_ms: Option<u64>, - router_context_turns: Option<usize>, } #[derive(Debug, Default, Deserialize)] @@ -355,16 +345,6 @@ fn apply_file_overlay( .collect(); } } - - if let Some(value) = file.auto.router_enabled { - config.router_enabled = value; - } - if let Some(value) = file.auto.router_timeout_ms { - config.router_timeout_ms = value; - } - if let Some(value) = file.auto.router_context_turns { - config.router_context_turns = value; - } } pub(super) fn apply_env_overlay( @@ -416,16 +396,6 @@ pub(super) fn apply_env_overlay( .collect(); } - if let Some(value) = lookup(AUTO_ROUTER_ENABLED_ENV) { - config.router_enabled = matches!(value.trim(), "1" | "true" | "yes" | "on"); - } - if let Some(value) = lookup(AUTO_ROUTER_TIMEOUT_MS_ENV).and_then(|value| value.parse().ok()) { - config.router_timeout_ms = value; - } - if let Some(value) = lookup(AUTO_ROUTER_CONTEXT_TURNS_ENV).and_then(|value| value.parse().ok()) - { - config.router_context_turns = value; - } } #[cfg(unix)] @@ -564,30 +534,6 @@ mod tests { assert_eq!(loaded.config.stream_chunk_timeout, Duration::from_secs(30)); } - #[test] - fn auto_router_settings_from_file_and_env() { - let global_dir = tempfile::tempdir().unwrap(); - let global = write_config( - global_dir.path(), - "[auto]\nrouter_enabled = false\nrouter_timeout_ms = 1500\nrouter_context_turns = 3\n", - ); - let loaded = AgentConfig::load_with(Some(global), None, &no_env); - assert!(!loaded.config.router_enabled); - assert_eq!(loaded.config.router_timeout_ms, 1500); - assert_eq!(loaded.config.router_context_turns, 3); - - // Env overrides the file. 
- let env = |name: &str| match name { - super::AUTO_ROUTER_ENABLED_ENV => Some("true".to_string()), - super::AUTO_ROUTER_TIMEOUT_MS_ENV => Some("2000".to_string()), - _ => None, - }; - let global = write_config(global_dir.path(), "[auto]\nrouter_enabled = false\n"); - let loaded = AgentConfig::load_with(Some(global), None, &env); - assert!(loaded.config.router_enabled); - assert_eq!(loaded.config.router_timeout_ms, 2000); - } - #[test] fn invalid_toml_layer_is_skipped_with_warning_not_panic() { let global_dir = tempfile::tempdir().unwrap(); diff --git a/crates/deep-code-agent/src/config/mod.rs b/crates/deep-code-agent/src/config/mod.rs index 5969530..962c82e 100644 --- a/crates/deep-code-agent/src/config/mod.rs +++ b/crates/deep-code-agent/src/config/mod.rs @@ -30,17 +30,11 @@ pub const STREAM_CHUNK_TIMEOUT_ENV: &str = "DEEP_CODE_STREAM_CHUNK_TIMEOUT_SECS" pub const STREAM_TOTAL_TIMEOUT_ENV: &str = "DEEP_CODE_STREAM_TOTAL_TIMEOUT_SECS"; pub const STREAM_MAX_BYTES_ENV: &str = "DEEP_CODE_STREAM_MAX_BYTES"; pub const APPROVAL_AUTO_ALLOW_ENV: &str = "DEEP_CODE_APPROVAL_AUTO_ALLOW"; -pub const AUTO_ROUTER_ENABLED_ENV: &str = "DEEP_CODE_AUTO_ROUTER_ENABLED"; -pub const AUTO_ROUTER_TIMEOUT_MS_ENV: &str = "DEEP_CODE_AUTO_ROUTER_TIMEOUT_MS"; -pub const AUTO_ROUTER_CONTEXT_TURNS_ENV: &str = "DEEP_CODE_AUTO_ROUTER_CONTEXT_TURNS"; pub const DEFAULT_STREAM_MAX_RETRIES: u32 = 3; pub const DEFAULT_STREAM_CHUNK_TIMEOUT_SECS: u64 = 300; pub const DEFAULT_STREAM_TOTAL_TIMEOUT_SECS: u64 = 900; pub const DEFAULT_STREAM_MAX_BYTES: u64 = 50 * 1024 * 1024; -pub const DEFAULT_AUTO_ROUTER_ENABLED: bool = true; -pub const DEFAULT_AUTO_ROUTER_TIMEOUT_MS: u64 = 2500; -pub const DEFAULT_AUTO_ROUTER_CONTEXT_TURNS: usize = 6; #[derive(Debug, Clone, PartialEq, Eq)] pub struct AgentConfig { @@ -65,13 +59,6 @@ pub struct AgentConfig { /// these run without prompting. Only env and the global config file may /// set this — project files are ignored (a repo must not disarm gates). 
pub approval_auto_allow: Vec<String>, - /// Consult the Flash classifier for ambiguous auto-mode turns. When false, - /// auto mode is the pure heuristic (no extra model call). - pub router_enabled: bool, - /// Hard timeout for the Flash router round-trip, in milliseconds. - pub router_timeout_ms: u64, - /// Recent turns of context handed to the Flash router. - pub router_context_turns: usize, } impl Default for AgentConfig { @@ -103,9 +90,6 @@ impl AgentConfig { stream_total_timeout: Duration::from_secs(DEFAULT_STREAM_TOTAL_TIMEOUT_SECS), stream_max_bytes: DEFAULT_STREAM_MAX_BYTES, approval_auto_allow: Vec::new(), - router_enabled: DEFAULT_AUTO_ROUTER_ENABLED, - router_timeout_ms: DEFAULT_AUTO_ROUTER_TIMEOUT_MS, - router_context_turns: DEFAULT_AUTO_ROUTER_CONTEXT_TURNS, } }