From fc80a01f566f7fe66de9ed1127494bb40fe37c98 Mon Sep 17 00:00:00 2001 From: caraka Date: Sat, 13 Jun 2026 13:07:15 -0400 Subject: [PATCH] feat(Inference): disable extended-thinking budget for fast/snap-judgment calls Every `claude --print` subprocess silently spends an extended-thinking budget, adding ~10s+ of latency to snap-judgment calls regardless of model tier (measured ~14.9s vs ~3.2s on a classification payload). Add a thinking?: boolean option to inference() and a --no-thinking CLI flag that set MAX_THINKING_TOKENS=0; level: 'fast' implies it. Other levels unchanged unless they opt out. --- .../v5.0.0/.claude/PAI/TOOLS/Inference.ts | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/Releases/v5.0.0/.claude/PAI/TOOLS/Inference.ts b/Releases/v5.0.0/.claude/PAI/TOOLS/Inference.ts index 22f348def9..d2fc24021f 100755 --- a/Releases/v5.0.0/.claude/PAI/TOOLS/Inference.ts +++ b/Releases/v5.0.0/.claude/PAI/TOOLS/Inference.ts @@ -25,9 +25,10 @@ * --auto-state v3.24 P5: Auto-synthesize state from current ISA + recent activity (advisor mode only, 2 positional args: task, question) * --json Expect and parse JSON response * --timeout Custom timeout (default varies by level) + * --no-thinking Disable the extended-thinking budget (snap-judgment latency); implied by --level fast * * DEFAULTS BY LEVEL: - * fast: model=haiku, timeout=15s + * fast: model=haiku, timeout=15s, thinking=off * standard: model=sonnet, timeout=30s * smart: model=opus, timeout=90s * advisor: model=opus, timeout=120s @@ -71,6 +72,12 @@ export interface InferenceOptions { * are prepended to the user prompt as @-references so Claude reads them as * image attachments. Routes through subscription like all other inference. */ imagePaths?: string[]; + /** Set to `false` to disable the extended-thinking budget for this call (sets + * MAX_THINKING_TOKENS=0 in the subprocess env). Every `claude --print` call + * otherwise silently spends a thinking budget — measured ~14.9s vs ~3.2s on a + * snap-judgment classification payload, with model tier nearly irrelevant. + * Implicitly disabled for `level: 'fast'`. */ + thinking?: boolean; } export interface InferenceResult { @@ -116,6 +123,15 @@ export async function inference(options: InferenceOptions): Promise 0; const args = [ '--print', @@ -414,6 +430,7 @@ async function main() { let expectJson = false; let timeout: number | undefined; let level: InferenceLevel = 'standard'; + let noThinking = false; let mode: 'inference' | 'advisor' = 'inference'; let autoState = false; // v3.24 P5 const positionalArgs: string[] = []; @@ -444,6 +461,8 @@ async function main() { } else if (args[i] === '--timeout' && args[i + 1]) { timeout = parseInt(args[i + 1], 10); i++; + } else if (args[i] === '--no-thinking') { + noThinking = true; } else { positionalArgs.push(args[i]); } @@ -483,7 +502,7 @@ async function main() { } if (positionalArgs.length < 2) { - console.error('Usage: bun Inference.ts [--level fast|standard|smart] [--json] [--timeout ] '); + console.error('Usage: bun Inference.ts [--level fast|standard|smart] [--json] [--timeout ] [--no-thinking] '); process.exit(1); } @@ -495,6 +514,7 @@ async function main() { level, expectJson, timeout, + thinking: noThinking ? false : undefined, }); if (result.success) {