From 6b440382805a92691ccdbb89baf917e63bfc685e Mon Sep 17 00:00:00 2001 From: Harness Loop Bot Date: Sun, 26 Apr 2026 08:51:16 +0200 Subject: [PATCH 1/2] feat(harness): detect redundant tool-call loops (3 consecutive identical outputs) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production trace 2026-04-26: Slack DM "explore the codebase structure deeply" hit max_iterations_reached after sage's harness called workspace_list with the same path 7 times in a row, getting back byte-identical 12,604-char responses every time. The harness had no way to detect "same tool + same output again — make a decision", so it burned the budget and produced no useful reply. This adds a 5-entry ring buffer of (toolName, outputHash) per turn. When the last 3 entries match on BOTH toolName and outputHash, the harness exits with outcome=failed, stopReason=redundant_tool_loop — failed (not deferred) because more budget will not help; the model is dead-stuck on the same tool call and needs a different prompt or tool surface to make progress. User-facing copy added to stopReasonToUserMessage so consumers (sage, specialist surfaces) automatically pick up a useful error message instead of the generic "could not complete" fallback. Self-reviewed and peer-reviewed. Tests + typecheck green (163/163). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/harness.redundant-loop.test.ts | 218 ++++++++++++++++++ packages/harness/src/harness.ts | 59 ++++- .../harness/src/stop-reason-message.test.ts | 6 + packages/harness/src/stop-reason-message.ts | 4 + packages/harness/src/types.ts | 2 + packages/harness/vitest.config.ts | 22 ++ 6 files changed, 303 insertions(+), 8 deletions(-) create mode 100644 packages/harness/src/harness.redundant-loop.test.ts create mode 100644 packages/harness/vitest.config.ts diff --git a/packages/harness/src/harness.redundant-loop.test.ts b/packages/harness/src/harness.redundant-loop.test.ts new file mode 100644 index 0000000..747e84a --- /dev/null +++ b/packages/harness/src/harness.redundant-loop.test.ts @@ -0,0 +1,218 @@ +import { describe, expect, it, vi } from 'vitest'; + +import { createHarness, type HarnessModelOutput, type HarnessToolResult } from './index.js'; + +function createInput() { + return { + assistantId: 'assistant-1', + turnId: 'turn-1', + workspaceId: 'workspace-1', + sessionId: 'session-1', + userId: 'user-1', + threadId: 'thread-1', + message: { + id: 'msg-1', + text: 'help me', + receivedAt: '2026-04-13T12:00:00.000Z', + }, + instructions: { + systemPrompt: 'You are helpful.', + }, + }; +} + +function createClock(times: number[]) { + let index = 0; + return { + now: () => times[Math.min(index++, times.length - 1)] ?? 0, + nowIso: () => new Date(times[Math.min(index, times.length - 1)] ?? 0).toISOString(), + }; +} + +function createToolRequest( + id: string, + name: string, + input: Record = {}, +): HarnessModelOutput { + return { + type: 'tool_request', + calls: [{ id, name, input }], + }; +} + +function createHarnessForResults(results: HarnessToolResult[], finalAnswer = 'Done') { + const toolNames = [...new Set(results.map((result) => result.toolName))]; + const steps: HarnessModelOutput[] = results.map((result) => + createToolRequest(result.callId, result.toolName, { query: `input-for-${result.callId}` }), + ); + steps.push({ type: 'final_answer', text: finalAnswer }); + + let resultIndex = 0; + + return createHarness({ + model: { + nextStep: vi.fn(async () => steps.shift() as HarnessModelOutput), + }, + tools: { + listAvailable: async () => toolNames.map((name) => ({ name, description: `${name} tool` })), + execute: async () => results[resultIndex++] as HarnessToolResult, + }, + clock: createClock(Array.from({ length: 32 }, (_, index) => index)), + }); +} + +describe('createHarness redundant tool loop detection', () => { + it('fails with redundant_tool_loop after 3 identical outputs from the same tool', async () => { + const harness = createHarnessForResults([ + { callId: 'call-1', toolName: 'lookup', status: 'success', output: 'same-result' }, + { callId: 'call-2', toolName: 'lookup', status: 'success', output: 'same-result' }, + { callId: 'call-3', toolName: 'lookup', status: 'success', output: 'same-result' }, + ]); + + const result = await harness.runTurn(createInput()); + + expect(result.outcome).toBe('failed'); + expect(result.stopReason).toBe('redundant_tool_loop'); + }); + + it('does not fire when the same tool returns different outputs', async () => { + const harness = createHarnessForResults([ + { callId: 'call-1', toolName: 'lookup', status: 'success', output: 'result-a' }, + { callId: 'call-2', toolName: 'lookup', status: 'success', output: 'result-b' }, + { callId: 'call-3', toolName: 'lookup', status: 'success', output: 'result-c' }, + ]); + + const result = await harness.runTurn(createInput()); + + expect(result.outcome).toBe('completed'); + expect(result.stopReason).toBe('answer_finalized'); + expect(result.assistantMessage?.text).toBe('Done'); + }); + + it('does not fire when identical outputs come from different tools', async () => { + const harness = createHarnessForResults([ + { callId: 'call-1', toolName: 'lookup', status: 'success', output: 'shared' }, + { callId: 'call-2', toolName: 'lookup', status: 'success', output: 'shared' }, + { callId: 'call-3', toolName: 'search', status: 'success', output: 'shared' }, + { callId: 'call-4', toolName: 'search', status: 'success', output: 'shared' }, + ]); + + const result = await harness.runTurn(createInput()); + + expect(result.outcome).toBe('completed'); + expect(result.stopReason).toBe('answer_finalized'); + }); + + it('does not fire when a different output breaks the repeated run', async () => { + const harness = createHarnessForResults([ + { callId: 'call-1', toolName: 'lookup', status: 'success', output: 'X' }, + { callId: 'call-2', toolName: 'lookup', status: 'success', output: 'X' }, + { callId: 'call-3', toolName: 'lookup', status: 'success', output: 'Y' }, + { callId: 'call-4', toolName: 'lookup', status: 'success', output: 'X' }, + { callId: 'call-5', toolName: 'lookup', status: 'success', output: 'X' }, + ]); + + const result = await harness.runTurn(createInput()); + + expect(result.outcome).toBe('completed'); + expect(result.stopReason).toBe('answer_finalized'); + }); + + it('fires even when identical outputs came from different inputs', async () => { + const steps: HarnessModelOutput[] = [ + createToolRequest('call-1', 'lookup', { query: 'alpha' }), + createToolRequest('call-2', 'lookup', { query: 'beta' }), + createToolRequest('call-3', 'lookup', { query: 'gamma' }), + { type: 'final_answer', text: 'Done' }, + ]; + const execute = vi + .fn() + .mockResolvedValueOnce({ + callId: 'call-1', + toolName: 'lookup', + status: 'success', + output: 'same-result', + } satisfies HarnessToolResult) + .mockResolvedValueOnce({ + callId: 'call-2', + toolName: 'lookup', + status: 'success', + output: 'same-result', + } satisfies HarnessToolResult) + .mockResolvedValueOnce({ + callId: 'call-3', + toolName: 'lookup', + status: 'success', + output: 'same-result', + } satisfies HarnessToolResult); + + const harness = createHarness({ + model: { + nextStep: vi.fn(async () => steps.shift() as HarnessModelOutput), + }, + tools: { + listAvailable: async () => [{ name: 'lookup', description: 'lookup tool' }], + execute, + }, + clock: createClock(Array.from({ length: 16 }, (_, index) => index)), + }); + + const result = await harness.runTurn(createInput()); + + expect(execute).toHaveBeenNthCalledWith( + 1, + expect.objectContaining({ input: { query: 'alpha' } }), + expect.anything(), + ); + expect(execute).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ input: { query: 'beta' } }), + expect.anything(), + ); + expect(execute).toHaveBeenNthCalledWith( + 3, + expect.objectContaining({ input: { query: 'gamma' } }), + expect.anything(), + ); + expect(result.outcome).toBe('failed'); + expect(result.stopReason).toBe('redundant_tool_loop'); + }); + + it('ignores tool errors when counting a later 3-success redundant loop', async () => { + const harness = createHarnessForResults([ + { + callId: 'call-1', + toolName: 'lookup', + status: 'error', + error: { code: 'temporary', message: 'retry me', retryable: true }, + }, + { callId: 'call-2', toolName: 'lookup', status: 'success', output: 'same-result' }, + { callId: 'call-3', toolName: 'lookup', status: 'success', output: 'same-result' }, + { callId: 'call-4', toolName: 'lookup', status: 'success', output: 'same-result' }, + ]); + + const result = await harness.runTurn(createInput()); + + expect(result.outcome).toBe('failed'); + expect(result.stopReason).toBe('redundant_tool_loop'); + }); + + it('treats an error between successes as breaking consecutiveness', async () => { + const harness = createHarnessForResults([ + { callId: 'call-1', toolName: 'lookup', status: 'success', output: 'same-result' }, + { + callId: 'call-2', + toolName: 'lookup', + status: 'error', + error: { code: 'temporary', message: 'retry me', retryable: true }, + }, + { callId: 'call-3', toolName: 'lookup', status: 'success', output: 'same-result' }, + { callId: 'call-4', toolName: 'lookup', status: 'success', output: 'same-result' }, + ]); + + const result = await harness.runTurn(createInput()); + + expect(result.outcome).toBe('completed'); + expect(result.stopReason).toBe('answer_finalized'); + }); +}); diff --git a/packages/harness/src/harness.ts b/packages/harness/src/harness.ts index 3329a79..9173732 100644 --- a/packages/harness/src/harness.ts +++ b/packages/harness/src/harness.ts @@ -41,6 +41,7 @@ type MutableState = { transcript: HarnessTranscriptItem[]; modelCalls: HarnessModelCallRecord[]; usage: HarnessAggregateUsage; + recentToolResultHashes?: { toolName: string; outputHash: number }[]; consecutiveInvalidOutputs: number; finalEventType: string; }; @@ -138,6 +139,7 @@ async function runTurn(config: NormalizedConfig, input: HarnessTurnInput): Promi modelCalls: 0, toolCalls: 0, }, + recentToolResultHashes: [], consecutiveInvalidOutputs: 0, finalEventType: 'turn_started', }; @@ -326,6 +328,7 @@ async function runTurn(config: NormalizedConfig, input: HarnessTurnInput): Promi state.transcript.push({ type: 'tool_result', iteration, result }); if (result.status === 'error') { + state.recentToolResultHashes = []; await emit(config, input, state, { type: 'tool_failed', result }); await config.hooks?.onToolError?.(result, executionState(input, state, startedAt, config)); if (result.error?.retryable !== true) { @@ -338,6 +341,29 @@ async function runTurn(config: NormalizedConfig, input: HarnessTurnInput): Promi } } else { await emit(config, input, state, { type: 'tool_finished', result }); + const hash = djb2Hash(result.output ?? JSON.stringify(result.structuredOutput ?? {})); + state.recentToolResultHashes ??= []; + state.recentToolResultHashes.push({ toolName: result.toolName, outputHash: hash }); + if (state.recentToolResultHashes.length > 5) state.recentToolResultHashes.shift(); + + const lastThree = state.recentToolResultHashes.slice(-3); + const [first] = lastThree; + if ( + first && + lastThree.length === 3 && + lastThree.every( + (entry) => + entry.toolName === first.toolName && entry.outputHash === first.outputHash, + ) + ) { + finalResult = await buildLimitResult( + config, + input, + state, + 'redundant_tool_loop', + ); + return finalResult; + } } finalResult = await checkLimits(config, input, state, startedAt); @@ -446,18 +472,29 @@ async function buildLimitResult( config: NormalizedConfig, input: HarnessTurnInput, state: MutableState, - stopReason: Extract, + stopReason: Extract< + HarnessStopReason, + | 'max_iterations_reached' + | 'max_tool_calls_reached' + | 'timeout_reached' + | 'budget_reached' + | 'redundant_tool_loop' + >, ): Promise { await emit(config, input, state, { type: 'limit_reached', stopReason }); + const outcome = stopReason === 'redundant_tool_loop' ? 'failed' : 'deferred'; return buildResult(input, state, { - outcome: 'deferred', + outcome, stopReason, - continuation: createContinuation(config, input, 'deferred', { - stopReason, - transcript: summarizeTranscript(state.transcript), - iteration: state.iteration, - toolCallCount: state.toolCallCount, - }), + continuation: + outcome === 'deferred' + ? createContinuation(config, input, 'deferred', { + stopReason, + transcript: summarizeTranscript(state.transcript), + iteration: state.iteration, + toolCallCount: state.toolCallCount, + }) + : undefined, }); } @@ -619,6 +656,12 @@ function getElapsedMs(config: NormalizedConfig, startedAt: number): number { return Math.max(0, config.clock.now() - startedAt); } +function djb2Hash(s: string): number { + let hash = 5381; + for (let i = 0; i < s.length; i++) hash = ((hash * 33) ^ s.charCodeAt(i)) | 0; + return hash; +} + async function emit( config: NormalizedConfig, input: HarnessTurnInput, diff --git a/packages/harness/src/stop-reason-message.test.ts b/packages/harness/src/stop-reason-message.test.ts index 3e4f6cc..94c10a7 100644 --- a/packages/harness/src/stop-reason-message.test.ts +++ b/packages/harness/src/stop-reason-message.test.ts @@ -9,6 +9,7 @@ const ALL_STOP_REASONS: HarnessStopReason[] = [ 'approval_required', 'max_iterations_reached', 'max_tool_calls_reached', + 'redundant_tool_loop', 'timeout_reached', 'budget_reached', 'tool_unavailable', @@ -72,4 +73,9 @@ describe('stopReasonToUserMessage', () => { it('signals cancellation on cancelled', () => { expect(stopReasonToUserMessage('cancelled').toLowerCase()).toContain('cancel'); }); + + it('uses loop-oriented copy for redundant_tool_loop', () => { + const message = stopReasonToUserMessage('redundant_tool_loop').toLowerCase(); + expect(['loop', 'stuck', 'repeating'].some((signal) => message.includes(signal))).toBe(true); + }); }); diff --git a/packages/harness/src/stop-reason-message.ts b/packages/harness/src/stop-reason-message.ts index 066d236..df9d488 100644 --- a/packages/harness/src/stop-reason-message.ts +++ b/packages/harness/src/stop-reason-message.ts @@ -38,6 +38,10 @@ export function stopReasonToUserMessage( return canRetry ? "I got stuck looping on tools and didn't reach an answer. Retrying with a simpler path." : "I got stuck looping on tools and didn't reach an answer. Try rephrasing or breaking the request into smaller pieces."; + case 'redundant_tool_loop': + return canRetry + ? 'I got stuck in a loop on that tool call — let me retry with a different approach.' + : 'I got stuck repeating the same tool call without making progress. Try rephrasing or asking about a more specific entity.'; case 'timeout_reached': return canRetry ? "That took too long to gather. Retrying with a simpler path." diff --git a/packages/harness/src/types.ts b/packages/harness/src/types.ts index 66f4b33..a1a10ab 100644 --- a/packages/harness/src/types.ts +++ b/packages/harness/src/types.ts @@ -324,6 +324,7 @@ export type HarnessStopReason = | 'approval_required' | 'max_iterations_reached' | 'max_tool_calls_reached' + | 'redundant_tool_loop' | 'timeout_reached' | 'budget_reached' | 'tool_unavailable' @@ -439,6 +440,7 @@ export interface HarnessLimitReachedEvent extends HarnessBaseTraceEvent { stopReason: | 'max_iterations_reached' | 'max_tool_calls_reached' + | 'redundant_tool_loop' | 'timeout_reached' | 'budget_reached'; } diff --git a/packages/harness/vitest.config.ts b/packages/harness/vitest.config.ts new file mode 100644 index 0000000..1780d50 --- /dev/null +++ b/packages/harness/vitest.config.ts @@ -0,0 +1,22 @@ +import { fileURLToPath } from 'node:url'; + +import { defineConfig } from 'vitest/config'; + +const workspacePackageRoot = (name: string) => + fileURLToPath(new URL(`../${name}/src/index.ts`, import.meta.url)); + +export default defineConfig({ + resolve: { + alias: { + '@agent-assistant/connectivity': workspacePackageRoot('connectivity'), + '@agent-assistant/core': workspacePackageRoot('core'), + '@agent-assistant/coordination': workspacePackageRoot('coordination'), + '@agent-assistant/traits': workspacePackageRoot('traits'), + '@agent-assistant/turn-context': workspacePackageRoot('turn-context'), + '@agent-assistant/vfs': workspacePackageRoot('vfs'), + }, + }, + test: { + environment: 'node', + }, +}); From e897c090eca67cff7c319e3665e78ecbb728214e Mon Sep 17 00:00:00 2001 From: Harness Loop Bot Date: Sun, 26 Apr 2026 09:05:35 +0200 Subject: [PATCH 2/2] fix(harness): skip redundant-loop detection on empty payloads + safe stringify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two codex review issues on PR #63: P1 — false-positive on side-effect tools. Pre-fix the detector hashed `JSON.stringify({})` whenever both `result.output` and `result.structuredOutput` were absent. Three successful calls from a side-effect tool (writes/notifications/etc) all hashed identically and falsely tripped redundant_tool_loop after the third call, even though each call had different inputs and made real progress. Fix: skip the detector entirely when the result has no payload to compare. P2 — JSON.stringify can throw on non-serializable values. structuredOutput is typed as Record and may carry BigInt or circular refs. A throw inside the detector bubbled to the outer harness catch and converted a successful tool execution into a runtime_error that failed the whole turn. Fix: wrap stringify in try/catch; treat serialization failures as "no comparable signature" — the detector skips, the turn continues. Both behaviors live in a new `computeToolResultSignature()` helper that returns null when there's no comparable payload (or the payload can't be serialized) and a number otherwise. The redundant-loop check is gated on the helper returning a non-null signature. Two new regression tests covering each case. 165/165 harness suite passes; typecheck clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/harness.redundant-loop.test.ts | 56 +++++++++++ packages/harness/src/harness.ts | 92 ++++++++++++++----- 2 files changed, 126 insertions(+), 22 deletions(-) diff --git a/packages/harness/src/harness.redundant-loop.test.ts b/packages/harness/src/harness.redundant-loop.test.ts index 747e84a..1ea80ca 100644 --- a/packages/harness/src/harness.redundant-loop.test.ts +++ b/packages/harness/src/harness.redundant-loop.test.ts @@ -215,4 +215,60 @@ describe('createHarness redundant tool loop detection', () => { expect(result.outcome).toBe('completed'); expect(result.stopReason).toBe('answer_finalized'); }); + + // Codex P1 review on PR #63: side-effect tools that return no payload would + // all hash to "{}" and false-trigger the redundant_tool_loop detector after + // 3 calls, even though each call had different inputs and made progress. + it('does not detect a loop when consecutive results have no payload (side-effect tools)', async () => { + const harness = createHarnessForResults([ + // No `output`, no `structuredOutput` — empty success, common for + // side-effect tools (writes, notifications, etc). + { callId: 'call-1', toolName: 'notify', status: 'success' }, + { callId: 'call-2', toolName: 'notify', status: 'success' }, + { callId: 'call-3', toolName: 'notify', status: 'success' }, + { callId: 'call-4', toolName: 'notify', status: 'success' }, + ]); + + const result = await harness.runTurn(createInput()); + + // The detector should have stayed inert. Final outcome is the model + // emitting answer_finalized after the side-effect chain. + expect(result.outcome).toBe('completed'); + expect(result.stopReason).toBe('answer_finalized'); + }); + + // Codex P2 review on PR #63: JSON.stringify on structuredOutput could throw + // on non-serializable values (BigInt at root, circular refs). Pre-fix this + // bubbled to the outer harness catch and converted a successful tool + // execution into a runtime_error. Now: serialization failures are treated + // as "no comparable signature" — the detector skips, the turn continues. + it('handles non-serializable structuredOutput without crashing the turn', async () => { + const cyclicObject: Record = { kind: 'cyclic' }; + cyclicObject.self = cyclicObject; + + const harness = createHarnessForResults([ + // BigInt inside the structuredOutput — JSON.stringify would normally + // throw "Do not know how to serialize a BigInt". + { + callId: 'call-1', + toolName: 'lookup', + status: 'success', + structuredOutput: { count: 9007199254740993n as unknown as number }, + }, + // Circular reference — JSON.stringify throws "Converting circular structure to JSON". + { + callId: 'call-2', + toolName: 'lookup', + status: 'success', + structuredOutput: cyclicObject, + }, + ]); + + const result = await harness.runTurn(createInput()); + + // Critical: must NOT be runtime_error. The successful tool calls survive + // the serialization failure and the harness continues to model finalization. + expect(result.outcome).toBe('completed'); + expect(result.stopReason).toBe('answer_finalized'); + }); }); diff --git a/packages/harness/src/harness.ts b/packages/harness/src/harness.ts index 9173732..c7cb941 100644 --- a/packages/harness/src/harness.ts +++ b/packages/harness/src/harness.ts @@ -341,28 +341,43 @@ async function runTurn(config: NormalizedConfig, input: HarnessTurnInput): Promi } } else { await emit(config, input, state, { type: 'tool_finished', result }); - const hash = djb2Hash(result.output ?? JSON.stringify(result.structuredOutput ?? {})); - state.recentToolResultHashes ??= []; - state.recentToolResultHashes.push({ toolName: result.toolName, outputHash: hash }); - if (state.recentToolResultHashes.length > 5) state.recentToolResultHashes.shift(); - - const lastThree = state.recentToolResultHashes.slice(-3); - const [first] = lastThree; - if ( - first && - lastThree.length === 3 && - lastThree.every( - (entry) => - entry.toolName === first.toolName && entry.outputHash === first.outputHash, - ) - ) { - finalResult = await buildLimitResult( - config, - input, - state, - 'redundant_tool_loop', - ); - return finalResult; + // Compute a hash signature of the tool result so we can detect + // the model dead-looping on the same tool call. Two safety + // guards (codex P1 + P2 review on PR #63): + // + // 1. Skip the detector when the tool returned no payload (no + // `output`, no `structuredOutput`). Side-effect tools that + // intentionally return nothing would otherwise all hash to + // `"{}"` and trip the detector after 3 calls even though + // each call had different inputs and made real progress. + // + // 2. Wrap JSON.stringify in try/catch so non-serializable values + // (BigInt, circular refs) cannot throw and convert a + // successful tool execution into a runtime_error. + const signature = computeToolResultSignature(result); + if (signature !== null) { + state.recentToolResultHashes ??= []; + state.recentToolResultHashes.push({ toolName: result.toolName, outputHash: signature }); + if (state.recentToolResultHashes.length > 5) state.recentToolResultHashes.shift(); + + const lastThree = state.recentToolResultHashes.slice(-3); + const [first] = lastThree; + if ( + first && + lastThree.length === 3 && + lastThree.every( + (entry) => + entry.toolName === first.toolName && entry.outputHash === first.outputHash, + ) + ) { + finalResult = await buildLimitResult( + config, + input, + state, + 'redundant_tool_loop', + ); + return finalResult; + } } } @@ -662,6 +677,39 @@ function djb2Hash(s: string): number { return hash; } +/** + * Compute a stable signature for a tool result, used by the redundant-tool-loop + * detector. Returns null when the result has no payload to compare — side-effect + * tools that intentionally return nothing should NOT trigger loop detection + * just because their empty results all hash identically. + * + * Wraps JSON.stringify in try/catch so non-serializable values (BigInt, + * circular references) in structuredOutput cannot throw. A serialization + * failure is treated as "no comparable signature" — the detector skips this + * call rather than risk converting a successful tool execution into a + * runtime_error via the outer catch. + */ +function computeToolResultSignature(result: HarnessToolResult): number | null { + if (typeof result.output === 'string' && result.output.length > 0) { + return djb2Hash(result.output); + } + if (result.structuredOutput !== undefined) { + try { + const serialized = JSON.stringify(result.structuredOutput); + // JSON.stringify returns undefined for values that cannot be + // serialized (e.g. plain `BigInt` at the root). Treat that as no + // comparable signature too. + if (typeof serialized !== 'string' || serialized.length === 0 || serialized === '{}') { + return null; + } + return djb2Hash(serialized); + } catch { + return null; + } + } + return null; +} + async function emit( config: NormalizedConfig, input: HarnessTurnInput,