From c96ee72dd3364c952025b84e5938ca51040be5fa Mon Sep 17 00:00:00 2001 From: liwenkai <2020583117@qq.com> Date: Fri, 26 Jun 2026 16:25:05 +0800 Subject: [PATCH] =?UTF-8?q?fix(agent):=20token=20=E4=BC=B0=E7=AE=97?= =?UTF-8?q?=E6=94=B9=E4=B8=BA=20CJK=20=E6=84=9F=E7=9F=A5(=E4=B8=AD?= =?UTF-8?q?=E6=96=87=E2=89=881=20token/=E5=AD=97),=E4=BF=AE=E6=AD=A3?= =?UTF-8?q?=E4=B8=AD=E6=96=87=E4=B8=8A=E4=B8=8B=E6=96=87=E4=BD=8E=E4=BC=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/deep-code-agent/src/compaction.rs | 55 +++++++++++++++++++++--- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/crates/deep-code-agent/src/compaction.rs b/crates/deep-code-agent/src/compaction.rs index be5e772..d968f37 100644 --- a/crates/deep-code-agent/src/compaction.rs +++ b/crates/deep-code-agent/src/compaction.rs @@ -41,13 +41,40 @@ pub struct CompactionResult { pub archived_count: usize, } +/// Rough token estimate. ASCII/Latin text is ~4 chars per token, but CJK text +/// is closer to ~1 token per character — counting it as `chars/4` (the old +/// behavior) underestimated Chinese context by 3–4×, tripping compaction far +/// too late and skewing the cost/usage display for DeepSeek's main audience. #[must_use] pub fn estimate_token_count(messages: &[Message]) -> u32 { - let chars = messages - .iter() - .map(|message| message.content.chars().count()) - .sum::(); - (chars / 4).max(1) as u32 + let mut cjk = 0usize; + let mut other = 0usize; + for message in messages { + for ch in message.content.chars() { + if is_cjk(ch) { + cjk += 1; + } else { + other += 1; + } + } + } + // CJK ≈ 1 token/char; other text ≈ 4 chars/token. + (cjk + other / 4).max(1) as u32 +} + +/// Whether a character is CJK-ish (Chinese/Japanese/Korean script or wide +/// punctuation), for the per-character token estimate. 
/// Whether a character is CJK-ish (Chinese/Japanese/Korean script or wide
/// punctuation), for the per-character token estimate.
///
/// Besides the main ideograph, kana and Hangul-syllable blocks this also covers
/// the phonetic and component blocks of those scripts (Hangul Jamo, Bopomofo,
/// radicals, Katakana extensions) and the supplementary ideographic planes, so
/// such text is not silently treated as ordinary 4-chars-per-token text.
fn is_cjk(ch: char) -> bool {
    matches!(
        ch,
        '\u{1100}'..='\u{11FF}' // Hangul Jamo
        | '\u{2E80}'..='\u{2FDF}' // CJK Radicals Supplement + Kangxi Radicals
        | '\u{3000}'..='\u{303F}' // CJK symbols & punctuation
        | '\u{3040}'..='\u{30FF}' // Hiragana + Katakana
        | '\u{3100}'..='\u{318F}' // Bopomofo + Hangul Compatibility Jamo
        | '\u{31A0}'..='\u{31BF}' // Bopomofo Extended
        | '\u{31F0}'..='\u{31FF}' // Katakana Phonetic Extensions
        | '\u{3400}'..='\u{4DBF}' // CJK Unified Ideographs Ext A
        | '\u{4E00}'..='\u{9FFF}' // CJK Unified Ideographs
        | '\u{AC00}'..='\u{D7AF}' // Hangul syllables
        | '\u{F900}'..='\u{FAFF}' // CJK compatibility ideographs
        | '\u{FF00}'..='\u{FFEF}' // Halfwidth/Fullwidth forms
        | '\u{20000}'..='\u{3FFFF}' // Supplementary + Tertiary Ideographic Planes (Ext B–I)
    )
}