diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..52a7daf --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,41 @@ +# Changelog + +All notable changes to `ai-consensus-core` will be documented here. +Format: [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), [SemVer](https://semver.org/spec/v2.0.0.html). + +## [0.11.0] — 2026-04-30 + +### Added — tool calling + +The engine can now drive a per-participant tool-call loop. Hosts plug in a `toolExecutor`; participants declare a `tools` list; the engine dispatches each `ToolCall` from the model, feeds the results back, and re-invokes the caller until the model returns final content or `maxToolIterations` is hit. + +- `Participant.tools?: ToolDefinition[]` — per-participant tool inventory +- `ConsensusOptions.toolExecutor?: ToolExecutor` — host-supplied dispatcher +- `ConsensusOptions.maxToolIterations?: number` — loop cap (default 8, clamped to [1, 32]) +- `ModelCallRequest.tools?: ToolDefinition[]` — forwarded verbatim to the caller +- `ModelCallRequest.toolCallTurns?: ToolCallTurn[]` — accumulated history on follow-up calls +- `ModelCallResponse.toolCalls?: ToolCall[]` — caller may return tool-call requests +- New types: `ToolDefinition`, `ToolCall`, `ToolCallTurn`, `ToolCallContext`, `ToolExecutionResult`, `ToolExecutor` +- New events: `toolCallStart`, `toolCallComplete`, `toolError` +- Schema export: `ToolDefinitionSchema` +- Constant export: `MAX_TOOL_ITERATIONS_CAP` (= 32) + +See README "Tool calling" section for the full contract and integration recipe. + +### Backward compatibility + +100% backward compatible with 0.10.x. Every new field is optional; absence of `toolExecutor` ⇒ engine behaviour is byte-identical to 0.10. Existing tests pass without modification (130/130) plus 12 new tool-calling tests (142 total). 
+ +### Internal + +- New private engine helper: `#runParticipantTurn` (drives the tool loop) +- New private engine helper: `#dispatchToolCalls` (per-iteration dispatch + events) +- Token usage is summed across loop iterations; the final response's `content` is the participant turn's output + +## [0.10.0] — 2026-04-24 + +- Removed the seven Roundtable personas from the library; they live in `docs/personas.md` for callers to copy. Only `JUDGE_PERSONA` remains in code. +- Added `ConsensusOptions.judge.systemPrompt` to override the synthesis prompt (with documented contract on the `## Majority Position` / `## Synthesis Confidence` markers). +- Replaced ReDoS-prone regex parsers with linear string scans. + +(Earlier history is in git; this CHANGELOG starts at 0.10.0.) diff --git a/README.md b/README.md index 3f56a83..ba03ece 100644 --- a/README.md +++ b/README.md @@ -233,6 +233,73 @@ export interface ModelCallResponse { 4. **Don't swallow other errors.** Throw. The engine captures the error into `ParticipantResponse` and keeps running. 5. **Return the full content verbatim.** Do not strip the trailing `CONFIDENCE:` line — the parser needs it. +## Tool calling + +Participants can invoke tools mid-turn. The library never executes a tool itself — it plumbs the request from the model to a host-supplied `ToolExecutor`, feeds the results back, and re-invokes the caller until the model returns final content (or `maxToolIterations` is exhausted). 
+ +```ts +import { ConsensusEngine, type ToolDefinition, type ToolExecutor } from "ai-consensus-core"; + +const READ_FILE_TOOL: ToolDefinition = { + name: "read_file", + description: "Read a file by absolute path.", + parameters: { type: "object", properties: { path: { type: "string" } }, required: ["path"] }, +}; + +const participants = [ + { + id: "domain", + modelId: "claude-sonnet-4-6", + persona: domainExpertPersona, + tools: [READ_FILE_TOOL], // declared per-participant + }, + // … +]; + +const toolExecutor: ToolExecutor = async (call, ctx) => { + // ctx: { participantId, round, phase, signal? } + if (call.name === "read_file") { + const args = call.arguments as { path: string }; + return { content: await readFile(args.path, "utf8") }; + // …or { error: "permission denied" } to feed an error back into the conversation + } + return { error: `unknown tool ${call.name}` }; +}; + +const engine = new ConsensusEngine(modelCaller); +const result = await engine.run({ + question: "What does the build output say?", + participants, + toolExecutor, + maxToolIterations: 8, // optional, default 8, clamped to [1, 32] +}); +``` + +**ModelCaller responsibilities** (when `tools` is present on the request): + +- Translate the `tools` array into whatever the underlying provider expects (OpenAI's `tools`, Anthropic's `tools`, etc.). +- Translate `toolCallTurns` (when present, on follow-up calls) into the conversation history the provider expects — typically: assistant message with tool_calls, then tool messages with results, in order. +- Parse the model's response and surface `toolCalls` on `ModelCallResponse` if the model wants to dispatch tools. Each `ToolCall` carries `{ id, name, arguments }` where `arguments` is **already JSON-parsed**. The library never parses the model's raw argument string. + +**Engine guarantees:** + +- The tool loop runs **per participant turn**, separately for each model call. Tool history does not leak between participants or between rounds. 
+ - The executor receives a fresh `ToolCallContext` (with `participantId`, `round`, `phase`, `signal`) per call. + - An exception thrown by the executor is captured as a `{ error: message }` result and forwarded back into the conversation — the participant turn does not abort. + - `AbortError` thrown by the executor (or `signal` triggered) propagates up and aborts the whole run with `stopReason: "aborted"`. + - Hitting `maxToolIterations` breaks the loop and uses the last response's `content` — even if the model still wants more tools. + - Without `toolExecutor`, the engine ignores any `toolCalls` on the response: 0.10 behaviour preserved exactly. + + **Events:** + + ```ts + engine.on("toolCallStart", (e: ToolCallStartEvent) => void); + engine.on("toolCallComplete", (e: ToolCallCompleteEvent) => void); // ok: boolean, durationMs, preview (truncated to 200 chars + ellipsis) + engine.on("toolError", (e: ToolErrorEvent) => void); // fires when ok === false + ``` + + `iteration` (1-based) on these events disambiguates round-trips within a single participant turn. 
+ ## Events ```ts @@ -248,6 +315,9 @@ engine.on("synthesisStart", (e: SynthesisStartEvent) => void); engine.on("synthesisToken", (e: SynthesisTokenEvent) => void); engine.on("synthesisComplete", (e: SynthesisCompleteEvent) => void); engine.on("finalResult", (e: FinalResultEvent) => void); +engine.on("toolCallStart", (e: ToolCallStartEvent) => void); // per dispatch +engine.on("toolCallComplete", (e: ToolCallCompleteEvent) => void); +engine.on("toolError", (e: ToolErrorEvent) => void); // ok === false engine.on("error", (err: Error) => void); ``` diff --git a/package.json b/package.json index 86f8b70..1a818a3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "ai-consensus-core", - "version": "0.10.0", + "version": "0.11.0", "description": "Dependency-light TypeScript implementation of the Consensus Validation Protocol (CVP): multi-model debate with confidence-weighted scoring, disagreement detection, and optional judge synthesis.", "keywords": [ "consensus", diff --git a/src/__tests__/tool-calling.test.ts b/src/__tests__/tool-calling.test.ts new file mode 100644 index 0000000..9f23700 --- /dev/null +++ b/src/__tests__/tool-calling.test.ts @@ -0,0 +1,418 @@ +import { describe, expect, it, vi } from "vitest"; +import { ConsensusEngine } from "../engine.js"; +import { TEST_PERSONAS } from "./_fixtures.js"; +import type { + ConsensusOptions, + ModelCallRequest, + ModelCallResponse, + ModelCaller, + Participant, + ToolCall, + ToolCallContext, + ToolCallTurn, + ToolDefinition, + ToolExecutionResult, + ToolExecutor, +} from "../types.js"; + +// ───────────────────────────────────────────────────────────── +// Tool-calling: a participant declares a tool, the model emits a +// tool-call request on its first turn, the engine dispatches it +// through the host's executor, then the model returns final content +// on the second turn — including the standard CONFIDENCE marker. 
+// ───────────────────────────────────────────────────────────── + +const READ_FILE_TOOL: ToolDefinition = { + name: "read_file", + description: "Read the contents of a file by absolute path.", + parameters: { type: "object", properties: { path: { type: "string" } } }, +}; + +function buildParticipant(id: string, tools?: readonly ToolDefinition[]): Participant { + const persona = TEST_PERSONAS[0]!; + return tools ? { id, modelId: `model-${id}`, persona, tools: [...tools] } : { id, modelId: `model-${id}`, persona }; +} + +function baseOptions(overrides: Partial = {}): ConsensusOptions { + return { + question: "What does the build output say?", + participants: [buildParticipant("p1", [READ_FILE_TOOL]), buildParticipant("p2")], + maxRounds: 1, + earlyStop: false, + blindFirstRound: true, + randomizeOrder: false, + randomSeed: 1, + ...overrides, + }; +} + +/** + * Build a scripted ModelCaller for tool-calling tests. `script[participantId]` + * is an array of responses; each engine call to that participant pulls the + * next entry. The judge isn't used in these tests. + */ +function scriptedCaller( + script: Record, +): { caller: ModelCaller; calls: ModelCallRequest[] } { + const counters = new Map(); + const calls: ModelCallRequest[] = []; + const caller: ModelCaller = (req) => { + calls.push(req); + const seq = script[req.participantId]; + if (!seq) { + return Promise.resolve({ content: `[no script for ${req.participantId}]\nCONFIDENCE: 50` }); + } + const idx = counters.get(req.participantId) ?? 
0; + counters.set(req.participantId, idx + 1); + const entry = seq[Math.min(idx, seq.length - 1)]!; + return Promise.resolve(entry); + }; + return { caller, calls }; +} + +describe("tool calling — happy path", () => { + it("dispatches a tool call, feeds results back, and uses the second-turn content", async () => { + const fileContent = "build OK — 0 errors, 1 warning"; + const { caller, calls } = scriptedCaller({ + p1: [ + // First turn: model wants to call read_file + { content: "", toolCalls: [{ id: "call_1", name: "read_file", arguments: { path: "/build.log" } }] }, + // Second turn: model has the result, gives final answer + confidence + { content: `The build log says: ${fileContent}\nCONFIDENCE: 80` }, + ], + p2: [{ content: "agreed.\nCONFIDENCE: 75" }], + }); + + const executor = vi.fn<(call: ToolCall, ctx: ToolCallContext) => Promise>(); + executor.mockResolvedValue({ content: fileContent }); + + const events: { name: string; payload: unknown }[] = []; + const engine = new ConsensusEngine(caller); + engine.on("toolCallStart", (e) => events.push({ name: "toolCallStart", payload: e })); + engine.on("toolCallComplete", (e) => events.push({ name: "toolCallComplete", payload: e })); + engine.on("toolError", (e) => events.push({ name: "toolError", payload: e })); + + const result = await engine.run(baseOptions({ toolExecutor: executor })); + + // Executor called exactly once with the right call + context. + expect(executor).toHaveBeenCalledTimes(1); + const callArgs = executor.mock.calls[0]!; + expect(callArgs[0]).toEqual({ id: "call_1", name: "read_file", arguments: { path: "/build.log" } }); + expect(callArgs[1].participantId).toBe("p1"); + expect(callArgs[1].round).toBe(1); + + // p1 was invoked twice: first turn (gets toolCalls), second turn (final). 
+ const p1Calls = calls.filter((c) => c.participantId === "p1"); + expect(p1Calls).toHaveLength(2); + expect(p1Calls[0]!.tools).toEqual([READ_FILE_TOOL]); + expect(p1Calls[0]!.toolCallTurns).toBeUndefined(); + expect(p1Calls[1]!.tools).toEqual([READ_FILE_TOOL]); + expect(p1Calls[1]!.toolCallTurns).toHaveLength(1); + const turn = p1Calls[1]!.toolCallTurns![0]!; + expect(turn.toolCalls[0]!.name).toBe("read_file"); + expect(turn.toolResults[0]).toEqual({ content: fileContent }); + + // The final response uses the second-turn content + CONFIDENCE. + const p1Response = result.rounds[0]!.responses.find((r) => r.participantId === "p1")!; + expect(p1Response.content).toContain("build OK"); + expect(p1Response.confidence).toBe(80); + expect(p1Response.error).toBeUndefined(); + + // Events fired with the right shapes. + const startEvents = events.filter((e) => e.name === "toolCallStart"); + const completeEvents = events.filter((e) => e.name === "toolCallComplete"); + expect(startEvents).toHaveLength(1); + expect(completeEvents).toHaveLength(1); + expect((completeEvents[0]!.payload as { ok: boolean }).ok).toBe(true); + }); + + it("ignores toolCalls when no toolExecutor is configured (0.10 backward compat)", async () => { + const { caller } = scriptedCaller({ + p1: [ + // Model returns a stray toolCalls field even though host has no executor. 
+ { + content: "Direct answer despite the tool-call hint.\nCONFIDENCE: 65", + toolCalls: [{ id: "call_x", name: "ignored", arguments: {} }], + }, + ], + p2: [{ content: "yep.\nCONFIDENCE: 70" }], + }); + + const engine = new ConsensusEngine(caller); + const observed: string[] = []; + engine.on("toolCallStart", () => observed.push("start")); + + const result = await engine.run(baseOptions()); // no toolExecutor + + expect(observed).toEqual([]); // no events fired + const p1 = result.rounds[0]!.responses.find((r) => r.participantId === "p1")!; + expect(p1.content).toContain("Direct answer"); + expect(p1.confidence).toBe(65); + }); + + it("does not pass `tools` to the caller when the participant has none", async () => { + const { caller, calls } = scriptedCaller({ + p1: [{ content: "answer.\nCONFIDENCE: 60" }], + p2: [{ content: "agreed.\nCONFIDENCE: 60" }], + }); + + const engine = new ConsensusEngine(caller); + await engine.run( + baseOptions({ + participants: [buildParticipant("p1"), buildParticipant("p2")], + toolExecutor: vi.fn(), + }), + ); + for (const c of calls) { + expect(c.tools).toBeUndefined(); + } + }); +}); + +describe("tool calling — error handling", () => { + it("captures executor exceptions as { error } results and continues", async () => { + const { caller } = scriptedCaller({ + p1: [ + { content: "", toolCalls: [{ id: "call_1", name: "read_file", arguments: {} }] }, + { content: "I couldn't read it but here's my best guess.\nCONFIDENCE: 50" }, + ], + p2: [{ content: "ok.\nCONFIDENCE: 55" }], + }); + + const executor: ToolExecutor = () => { + throw new Error("filesystem blew up"); + }; + + const engine = new ConsensusEngine(caller); + const errors: { call: ToolCall; error: string }[] = []; + engine.on("toolError", (e) => errors.push({ call: e.call, error: e.error })); + + const result = await engine.run(baseOptions({ toolExecutor: executor })); + + expect(errors).toHaveLength(1); + expect(errors[0]!.error).toBe("filesystem blew up"); + // The 
participant kept going and produced final content. + const p1 = result.rounds[0]!.responses.find((r) => r.participantId === "p1")!; + expect(p1.error).toBeUndefined(); + expect(p1.content).toContain("best guess"); + expect(p1.confidence).toBe(50); + }); + + it("forwards executor-returned { error } as a toolError event with ok:false", async () => { + const { caller } = scriptedCaller({ + p1: [ + { content: "", toolCalls: [{ id: "call_1", name: "read_file", arguments: {} }] }, + { content: "moving on.\nCONFIDENCE: 60" }, + ], + p2: [{ content: "ok.\nCONFIDENCE: 60" }], + }); + + const executor: ToolExecutor = () => Promise.resolve({ error: "permission denied" }); + + const engine = new ConsensusEngine(caller); + let completeOk: boolean | undefined; + let errorPayload: string | undefined; + engine.on("toolCallComplete", (e) => { + completeOk = e.ok; + }); + engine.on("toolError", (e) => { + errorPayload = e.error; + }); + + await engine.run(baseOptions({ toolExecutor: executor })); + + expect(completeOk).toBe(false); + expect(errorPayload).toBe("permission denied"); + }); +}); + +describe("tool calling — iteration cap", () => { + it("breaks after maxToolIterations even if model keeps requesting tools", async () => { + // Model never gives up — every response has a tool call. + const looping: ModelCallResponse = { + content: "", + toolCalls: [{ id: `call_x`, name: "read_file", arguments: {} }], + }; + // Caller returns the looping response forever for p1. + const calls: ModelCallRequest[] = []; + const caller: ModelCaller = (req) => { + calls.push(req); + if (req.participantId === "p2") { + return Promise.resolve({ content: "ok.\nCONFIDENCE: 50" }); + } + return Promise.resolve(looping); + }; + + let executorCalls = 0; + const executor: ToolExecutor = () => { + executorCalls += 1; + return Promise.resolve({ content: "..." 
}); + }; + + const engine = new ConsensusEngine(caller); + const result = await engine.run(baseOptions({ toolExecutor: executor, maxToolIterations: 3 })); + + expect(executorCalls).toBe(3); + // p1 was called maxToolIterations+1 times: one initial + 3 loop iterations, + // each followed by a re-invocation. After the 3rd dispatch the engine re-invokes + // once more, then breaks before dispatching again: total p1 calls = 1 + 3 = 4. + const p1Calls = calls.filter((c) => c.participantId === "p1"); + expect(p1Calls).toHaveLength(4); + // The final response uses the LAST caller response's content (empty). + const p1 = result.rounds[0]!.responses.find((r) => r.participantId === "p1")!; + expect(p1.error).toBeUndefined(); + expect(p1.content).toBe(""); + }); + + it("clamps maxToolIterations to [1, 32]", async () => { + // Caller always asks for tools; if cap weren't clamped to 1, executor + // would run more times. We verify the cap by setting 0 (clamped to 1). + const calls: ModelCallRequest[] = []; + const caller: ModelCaller = (req) => { + calls.push(req); + if (req.participantId === "p2") return Promise.resolve({ content: "ok.\nCONFIDENCE: 50" }); + return Promise.resolve({ + content: "", + toolCalls: [{ id: "x", name: "read_file", arguments: {} }], + }); + }; + let execs = 0; + const executor: ToolExecutor = () => { + execs += 1; + return Promise.resolve({ content: "x" }); + }; + const engine = new ConsensusEngine(caller); + await engine.run(baseOptions({ toolExecutor: executor, maxToolIterations: 0 })); + expect(execs).toBe(1); // clamped to 1 + }); +}); + +describe("tool calling — abort", () => { + it("propagates AbortError from the executor and finalizes with stopReason=aborted", async () => { + const ac = new AbortController(); + const { caller } = scriptedCaller({ + p1: [ + { content: "", toolCalls: [{ id: "call_1", name: "read_file", arguments: {} }] }, + { content: "shouldn't get here.\nCONFIDENCE: 50" }, + ], + p2: [{ content: "ok.\nCONFIDENCE: 50" }], + }); + + const executor: 
ToolExecutor = () => { + ac.abort(); + const err = new Error("Aborted"); + err.name = "AbortError"; + throw err; + }; + + const engine = new ConsensusEngine(caller); + const result = await engine.run(baseOptions({ toolExecutor: executor, signal: ac.signal })); + expect(result.stopReason).toBe("aborted"); + }); +}); + +describe("tool calling — turn isolation", () => { + it("does not leak toolCallTurns across separate participant turns", async () => { + const { caller, calls } = scriptedCaller({ + p1: [ + { content: "", toolCalls: [{ id: "c1", name: "read_file", arguments: {} }] }, + { content: "p1 done.\nCONFIDENCE: 70" }, + ], + p2: [{ content: "p2 done.\nCONFIDENCE: 75" }], + }); + const executor: ToolExecutor = () => Promise.resolve({ content: "value" }); + + const engine = new ConsensusEngine(caller); + await engine.run(baseOptions({ toolExecutor: executor })); + + // p2's only call must have an empty/absent toolCallTurns (it didn't call tools). + const p2Calls = calls.filter((c) => c.participantId === "p2"); + expect(p2Calls).toHaveLength(1); + expect(p2Calls[0]!.toolCallTurns).toBeUndefined(); + }); + + it("emits monotonically increasing iteration counters within a turn", async () => { + const looping = (cid: string): ModelCallResponse => ({ + content: "", + toolCalls: [{ id: cid, name: "read_file", arguments: {} }], + }); + let p1Calls = 0; + const caller: ModelCaller = (req) => { + if (req.participantId === "p2") return Promise.resolve({ content: "ok.\nCONFIDENCE: 50" }); + p1Calls += 1; + // Two iterations of looping, then final content. 
+ if (p1Calls <= 2) return Promise.resolve(looping(`c${p1Calls}`)); + return Promise.resolve({ content: "done.\nCONFIDENCE: 80" }); + }; + const executor: ToolExecutor = () => Promise.resolve({ content: "ok" }); + + const engine = new ConsensusEngine(caller); + const iterations: number[] = []; + engine.on("toolCallStart", (e) => iterations.push(e.iteration)); + + await engine.run(baseOptions({ toolExecutor: executor })); + expect(iterations).toEqual([1, 2]); + }); +}); + +describe("tool calling — turn payload integrity", () => { + it("usage from each iteration is summed, not replaced", async () => { + let n = 0; + const caller: ModelCaller = (req) => { + if (req.participantId === "p2") { + return Promise.resolve({ + content: "ok.\nCONFIDENCE: 50", + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + }); + } + n += 1; + if (n === 1) { + return Promise.resolve({ + content: "", + toolCalls: [{ id: "c1", name: "read_file", arguments: {} }], + usage: { inputTokens: 100, outputTokens: 10, totalTokens: 110 }, + }); + } + return Promise.resolve({ + content: "done.\nCONFIDENCE: 70", + usage: { inputTokens: 50, outputTokens: 30, totalTokens: 80 }, + }); + }; + const executor: ToolExecutor = () => Promise.resolve({ content: "x" }); + + const engine = new ConsensusEngine(caller); + const result = await engine.run(baseOptions({ toolExecutor: executor })); + const p1 = result.rounds[0]!.responses.find((r) => r.participantId === "p1")!; + expect(p1.usage).toEqual({ inputTokens: 150, outputTokens: 40, totalTokens: 190 }); + }); + + it("the toolCallTurns forwarded to the caller mirror what the executor actually returned", async () => { + const { caller, calls } = scriptedCaller({ + p1: [ + { + content: "", + toolCalls: [ + { id: "c1", name: "read_file", arguments: { p: "/a" } }, + { id: "c2", name: "read_file", arguments: { p: "/b" } }, + ], + }, + { content: "done.\nCONFIDENCE: 70" }, + ], + p2: [{ content: "ok.\nCONFIDENCE: 50" }], + }); + const executor: ToolExecutor 
= (call) => + Promise.resolve({ content: `result-for-${(call.arguments as { p: string }).p}` }); + + const engine = new ConsensusEngine(caller); + await engine.run(baseOptions({ toolExecutor: executor })); + + const followUp = calls.filter((c) => c.participantId === "p1")[1]!; + const turns: readonly ToolCallTurn[] = followUp.toolCallTurns!; + expect(turns).toHaveLength(1); + expect(turns[0]!.toolCalls).toHaveLength(2); + expect(turns[0]!.toolResults).toEqual([ + { content: "result-for-/a" }, + { content: "result-for-/b" }, + ]); + }); +}); diff --git a/src/engine.ts b/src/engine.ts index d27f172..6461e46 100644 --- a/src/engine.ts +++ b/src/engine.ts @@ -34,6 +34,8 @@ import type { ConsensusOptions, ConsensusResult, Disagreement, + ModelCallRequest, + ModelCallResponse, ModelCaller, Participant, ParticipantResponse, @@ -41,6 +43,12 @@ import type { RoundResult, StopReason, SynthesisResult, + ToolCall, + ToolCallContext, + ToolCallTurn, + ToolDefinition, + ToolExecutionResult, + ToolExecutor, } from "./types.js"; // ── Defaults ─────────────────────────────────────────────── @@ -56,10 +64,13 @@ const DEFAULTS = { maxOutputTokens: 1500, judgeTemperature: 0.3, judgeMaxOutputTokens: 1500, + maxToolIterations: 8, } as const; const MAX_ROUNDS_CAP = 10; const MIN_PARTICIPANTS = 2; +const MAX_TOOL_ITERATIONS_CAP = 32; +const TOOL_RESULT_PREVIEW_CHARS = 200; // ── Public engine ────────────────────────────────────────── @@ -123,6 +134,8 @@ export class ConsensusEngine extends TypedEventEmitter { temperature: opts.participantTemperature, maxOutputTokens: opts.maxOutputTokens, signal: opts.signal, + toolExecutor: opts.toolExecutor, + maxToolIterations: opts.maxToolIterations, }); const roundCompletedAt = Date.now(); @@ -276,6 +289,8 @@ export class ConsensusEngine extends TypedEventEmitter { temperature: number; maxOutputTokens: number; signal: AbortSignal | undefined; + toolExecutor: ToolExecutor | undefined; + maxToolIterations: number; }): Promise { const { round, @@ 
-288,6 +303,8 @@ export class ConsensusEngine extends TypedEventEmitter { temperature, maxOutputTokens, signal, + toolExecutor, + maxToolIterations, } = args; if (blind) { @@ -303,6 +320,8 @@ export class ConsensusEngine extends TypedEventEmitter { maxOutputTokens, signal, runningConfidences: [], + toolExecutor, + maxToolIterations, }), ); return Promise.all(promises); @@ -312,9 +331,7 @@ export class ConsensusEngine extends TypedEventEmitter { for (const participant of order) { throwIfAborted(signal); const visible = [...previousResponses, ...collected]; - const confidencesSoFar = collected - .filter((r) => !r.error) - .map((r) => r.confidence); + const confidencesSoFar = collected.filter((r) => !r.error).map((r) => r.confidence); const response = await this.#callParticipant({ participant, round, @@ -326,6 +343,8 @@ export class ConsensusEngine extends TypedEventEmitter { maxOutputTokens, signal, runningConfidences: confidencesSoFar, + toolExecutor, + maxToolIterations, }); collected.push(response); } @@ -345,6 +364,8 @@ export class ConsensusEngine extends TypedEventEmitter { maxOutputTokens: number; signal: AbortSignal | undefined; runningConfidences: readonly number[]; + toolExecutor: ToolExecutor | undefined; + maxToolIterations: number; }): Promise { const { participant, @@ -357,6 +378,8 @@ export class ConsensusEngine extends TypedEventEmitter { maxOutputTokens, signal, runningConfidences, + toolExecutor, + maxToolIterations, } = args; const system = buildParticipantSystemPrompt({ @@ -381,26 +404,20 @@ export class ConsensusEngine extends TypedEventEmitter { let usage: ParticipantResponse["usage"]; try { - const result = await this.#caller({ - participantId: participant.id, - modelId: participant.modelId, + const turn = await this.#runParticipantTurn({ + participant, round, phase, system, - user: question, + question, temperature, maxOutputTokens, signal, - onToken: (token) => { - this.emit("participantToken", { - round, - participantId: participant.id, - 
token, - }); - }, + toolExecutor, + maxToolIterations, }); - content = result.content; - usage = result.usage; + content = turn.content; + usage = turn.usage; } catch (err) { if (isAbortError(err)) throw err; error = err instanceof Error ? err.message : String(err); @@ -440,6 +457,189 @@ export class ConsensusEngine extends TypedEventEmitter { return response; } + // ── Participant turn (handles the tool-call loop) ──────── + + /** + * Runs a participant's turn end-to-end. When `toolExecutor` is provided + * AND the participant declares tools AND the model returns tool-call + * requests, the engine loops: + * 1. Dispatch each tool call to the executor. + * 2. Append the (calls, results) pair to `toolCallTurns`. + * 3. Re-invoke the caller with the accumulated history. + * 4. Repeat until the response carries no tool calls or `maxToolIterations` + * is exceeded. + * + * Without an executor, this runs exactly one model call — preserving the + * 0.10 single-call behaviour byte-for-byte. + * + * Token usage from each iteration is summed; the final response's + * `content` is what becomes the participant's turn output. + */ + async #runParticipantTurn(args: { + participant: Participant; + round: number; + phase: Phase; + system: string; + question: string; + temperature: number; + maxOutputTokens: number; + signal: AbortSignal | undefined; + toolExecutor: ToolExecutor | undefined; + maxToolIterations: number; + }): Promise<{ content: string; usage: ParticipantResponse["usage"] }> { + const { + participant, + round, + phase, + system, + question, + temperature, + maxOutputTokens, + signal, + toolExecutor, + maxToolIterations, + } = args; + + const tools: readonly ToolDefinition[] | undefined = + participant.tools && participant.tools.length > 0 ? 
participant.tools : undefined; + const useToolLoop = Boolean(toolExecutor) && tools !== undefined; + + const toolCallTurns: ToolCallTurn[] = []; + let mergedUsage: ParticipantResponse["usage"]; + let lastResponse: ModelCallResponse | undefined; + let iter = 0; + + while (true) { + const req: ModelCallRequest = { + participantId: participant.id, + modelId: participant.modelId, + round, + phase, + system, + user: question, + temperature, + maxOutputTokens, + signal, + onToken: (token) => { + this.emit("participantToken", { + round, + participantId: participant.id, + token, + }); + }, + ...(tools !== undefined ? { tools } : {}), + ...(toolCallTurns.length > 0 + ? { toolCallTurns: toolCallTurns.map((t) => ({ ...t })) } + : {}), + }; + + lastResponse = await this.#caller(req); + mergedUsage = mergeUsage(mergedUsage, lastResponse.usage); + + const calls: readonly ToolCall[] = lastResponse.toolCalls ?? []; + if (!useToolLoop || calls.length === 0) break; + if (iter >= maxToolIterations) break; + iter += 1; + + const results = await this.#dispatchToolCalls({ + participant, + round, + phase, + iteration: iter, + calls, + toolExecutor: toolExecutor!, + signal, + }); + toolCallTurns.push({ toolCalls: calls, toolResults: results }); + } + + return { + content: lastResponse?.content ?? "", + usage: mergedUsage, + }; + } + + /** + * Dispatch the tool calls of a single iteration through the host executor. + * Errors thrown by the executor are caught and converted into + * `{ error: message }` results so the conversation can continue. Aborts + * propagate up so the engine's outer abort handling can finalise the run. 
+ */ + async #dispatchToolCalls(args: { + participant: Participant; + round: number; + phase: Phase; + iteration: number; + calls: readonly ToolCall[]; + toolExecutor: ToolExecutor; + signal: AbortSignal | undefined; + }): Promise { + const { participant, round, phase, iteration, calls, toolExecutor, signal } = args; + const ctx: ToolCallContext = { + participantId: participant.id, + round, + phase, + ...(signal ? { signal } : {}), + }; + const results: ToolExecutionResult[] = []; + + for (const call of calls) { + throwIfAborted(signal); + this.emit("toolCallStart", { + participantId: participant.id, + round, + phase, + iteration, + call, + }); + const startedAt = Date.now(); + let result: ToolExecutionResult; + try { + result = await toolExecutor(call, ctx); + } catch (err) { + if (isAbortError(err)) throw err; + const message = err instanceof Error ? err.message : String(err); + result = { error: message }; + } + results.push(result); + + const ok = !("error" in result); + const previewSource = ok + ? "content" in result + ? result.content + : "" + : "error" in result + ? result.error + : ""; + const preview = truncate(previewSource, TOOL_RESULT_PREVIEW_CHARS); + const durationMs = Date.now() - startedAt; + + this.emit("toolCallComplete", { + participantId: participant.id, + round, + phase, + iteration, + call, + durationMs, + ok, + preview, + }); + if (!ok) { + const error = "error" in result ? 
result.error : "unknown"; + this.emit("toolError", { + participantId: participant.id, + round, + phase, + iteration, + call, + error, + }); + } + } + + return results; + } + // ── Judge synthesizer ──────────────────────────────────── async #runJudge(args: { @@ -541,6 +741,8 @@ interface NormalizedOptions { judge: ConsensusOptions["judge"]; randomSeed: number | undefined; signal: AbortSignal | undefined; + toolExecutor: ToolExecutor | undefined; + maxToolIterations: number; } function normalizeOptions(options: ConsensusOptions): NormalizedOptions { @@ -563,6 +765,11 @@ function normalizeOptions(options: ConsensusOptions): NormalizedOptions { } const maxRounds = clampInt(options.maxRounds ?? DEFAULTS.maxRounds, 1, MAX_ROUNDS_CAP); + const maxToolIterations = clampInt( + options.maxToolIterations ?? DEFAULTS.maxToolIterations, + 1, + MAX_TOOL_ITERATIONS_CAP, + ); return { question: options.question, @@ -573,12 +780,13 @@ function normalizeOptions(options: ConsensusOptions): NormalizedOptions { disagreementThreshold: options.disagreementThreshold ?? DEFAULTS.disagreementThreshold, blindFirstRound: options.blindFirstRound ?? DEFAULTS.blindFirstRound, randomizeOrder: options.randomizeOrder ?? DEFAULTS.randomizeOrder, - participantTemperature: - options.participantTemperature ?? DEFAULTS.participantTemperature, + participantTemperature: options.participantTemperature ?? DEFAULTS.participantTemperature, maxOutputTokens: options.maxOutputTokens ?? 
DEFAULTS.maxOutputTokens, judge: options.judge, randomSeed: options.randomSeed, signal: options.signal, + toolExecutor: options.toolExecutor, + maxToolIterations, }; } @@ -596,10 +804,27 @@ function throwIfAborted(signal: AbortSignal | undefined): void { function isAbortError(err: unknown): boolean { return ( - err instanceof DOMException && err.name === "AbortError" - ) || ( - err instanceof Error && err.name === "AbortError" + (err instanceof DOMException && err.name === "AbortError") || + (err instanceof Error && err.name === "AbortError") ); } -export { DEFAULTS as CONSENSUS_DEFAULTS, MAX_ROUNDS_CAP }; +function mergeUsage( + acc: ParticipantResponse["usage"], + next: ParticipantResponse["usage"], +): ParticipantResponse["usage"] { + if (!next) return acc; + if (!acc) return next; + return { + inputTokens: acc.inputTokens + next.inputTokens, + outputTokens: acc.outputTokens + next.outputTokens, + totalTokens: acc.totalTokens + next.totalTokens, + }; +} + +function truncate(s: string, n: number): string { + if (typeof s !== "string") return ""; + return s.length <= n ? s : `${s.slice(0, n)}…`; +} + +export { DEFAULTS as CONSENSUS_DEFAULTS, MAX_ROUNDS_CAP, MAX_TOOL_ITERATIONS_CAP }; diff --git a/src/index.ts b/src/index.ts index 1ebd676..f50ace2 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,7 +2,12 @@ // ai-consensus-core — public API // ───────────────────────────────────────────────────────────── -export { ConsensusEngine, CONSENSUS_DEFAULTS, MAX_ROUNDS_CAP } from "./engine.js"; +export { + ConsensusEngine, + CONSENSUS_DEFAULTS, + MAX_ROUNDS_CAP, + MAX_TOOL_ITERATIONS_CAP, +} from "./engine.js"; export { JUDGE_PERSONA } from "./personas.js"; @@ -34,7 +39,7 @@ export { TypedEventEmitter } from "./events.js"; export type { ConsensusEmitter } from "./events.js"; // Schemas (zod) — exported for callers that want boundary validation. 
-export { PersonaSchema, ParticipantSchema, PHASES } from "./types.js"; +export { PersonaSchema, ParticipantSchema, ToolDefinitionSchema, PHASES } from "./types.js"; // Types export type { @@ -52,6 +57,13 @@ export type { ConsensusResult, ConsensusOptions, StopReason, + // Tool calling + ToolDefinition, + ToolCall, + ToolCallTurn, + ToolCallContext, + ToolExecutionResult, + ToolExecutor, // Events ConsensusEventMap, ConsensusEventName, @@ -67,4 +79,7 @@ export type { SynthesisTokenEvent, SynthesisCompleteEvent, FinalResultEvent, + ToolCallStartEvent, + ToolCallCompleteEvent, + ToolErrorEvent, } from "./types.js"; diff --git a/src/types.ts b/src/types.ts index 1fffe93..7bb2ca6 100644 --- a/src/types.ts +++ b/src/types.ts @@ -24,6 +24,13 @@ export const ParticipantSchema = z.object({ modelId: z.string().min(1), persona: PersonaSchema, label: z.string().optional(), + /** + * Tools this participant is allowed to invoke during its turn. The library + * forwards the list to the ModelCaller verbatim — semantics (dispatch, + * loop, error handling) live in the engine when `ConsensusOptions.toolExecutor` + * is provided. Empty/undefined ⇒ classic text-only debate (0.10 behaviour). + */ + tools: z.array(z.lazy(() => ToolDefinitionSchema)).optional(), }); export type Participant = z.infer<typeof ParticipantSchema>; @@ -51,6 +58,83 @@ export interface TokenUsage { totalTokens: number; } +// ───────────────────────────────────────────────────────────── +// Tool calling +// ───────────────────────────────────────────────────────────── +// Engine-orchestrated tool calling sits between the ModelCaller and the +// host. The library never parses tool arguments, never invokes a tool, and +// never decides what tools a participant has — it just plumbs: +// +// 1. `Participant.tools` flows into each `ModelCallRequest.tools`. +// 2. 
If the response carries `toolCalls`, the engine dispatches each one +// to `ConsensusOptions.toolExecutor` (host-supplied) and feeds results +// back into a follow-up call via `ModelCallRequest.toolCallTurns`. +// 3. The loop terminates when the model returns a response with no +// `toolCalls`, or when `maxToolIterations` is hit. +// +// Hosts that don't supply a `toolExecutor` see no behaviour change — every +// new field is optional and the engine's flow degrades to 0.10 verbatim. + +/** OpenAI-style tool definition (function-call shape). */ +export const ToolDefinitionSchema = z.object({ + name: z.string().min(1), + description: z.string(), + /** + * JSON Schema (object). The library does not validate or interpret the + * schema — it forwards verbatim to the ModelCaller, which is responsible + * for translating it into whatever the underlying provider expects. + */ + parameters: z.unknown(), +}); + +export type ToolDefinition = z.infer<typeof ToolDefinitionSchema>; + +/** A tool-call request emitted by an assistant turn. */ +export interface ToolCall { + /** Unique id assigned by the model — round-trip back in tool results. */ + id: string; + /** Tool name; must match a `ToolDefinition.name` from the request. */ + name: string; + /** + * Already JSON-parsed arguments. Callers MUST parse the model's raw + * argument string before populating this; the library never parses. + */ + arguments: unknown; +} + +/** Result of executing a tool call. Either a content string or an error. */ +export type ToolExecutionResult = { content: string } | { error: string }; + +/** + * One turn of tool-call dispatch. The engine appends one entry per iteration + * of the tool loop, in order, and forwards the accumulated history on each + * follow-up `ModelCallRequest`. + */ +export interface ToolCallTurn { + /** The tool calls the assistant requested in this turn. */ + toolCalls: readonly ToolCall[]; + /** Results, in the same order as `toolCalls`. 
*/ + toolResults: readonly ToolExecutionResult[]; +} + +/** Context passed to the host's `ToolExecutor` so it knows what's running. */ +export interface ToolCallContext { + participantId: string; + round: number; + phase: Phase; + signal?: AbortSignal; +} + +/** + * Host-supplied tool executor. The engine awaits this once per tool call. + * Throw on unrecoverable errors; return `{ error }` to feed an error string + * back into the conversation as a normal tool result (model can recover). + */ +export type ToolExecutor = ( + call: ToolCall, + ctx: ToolCallContext, +) => Promise<ToolExecutionResult>; + // ───────────────────────────────────────────────────────────── // ModelCaller — the one extension point of the library // ───────────────────────────────────────────────────────────── @@ -76,6 +160,22 @@ export interface ModelCallRequest { signal?: AbortSignal; /** Optional streaming sink; callers MAY call this with partial tokens. */ onToken?: (token: string) => void; + /** + * Tools available for this turn. Forwarded verbatim from `Participant.tools` + * (and only for participant calls — judge calls never carry tools). Absent + * when the participant declares no tools. + */ + tools?: readonly ToolDefinition[]; + /** + * Tool-call history for this single participant turn, populated by the + * engine when re-invoking the caller after dispatching tool calls. Each + * entry is one round-trip through the tool loop. Absent on the first call + * of a turn. + * + * Callers MUST translate this into whatever the provider expects (e.g. + * for OpenAI, append assistant + tool messages to the conversation). + */ + toolCallTurns?: readonly ToolCallTurn[]; } export interface ModelCallResponse { @@ -83,6 +183,18 @@ export interface ModelCallResponse { content: string; /** Optional token usage, if the provider surfaces it. */ usage?: TokenUsage; + /** + * Tool calls the model wants to dispatch this turn. 
If non-empty AND the + * engine has a `toolExecutor`, the engine runs each call and re-invokes + * the caller with the results in `ModelCallRequest.toolCallTurns`. If empty + * or absent, the engine treats `content` as the participant's final turn. + * + * If the engine has no `toolExecutor` configured but the response carries + * `toolCalls`, they are ignored and `content` is used as-is — preserves + * 0.10 backward compatibility for callers that opt into tool streaming + * but don't wire an executor. + */ + toolCalls?: readonly ToolCall[]; } export type ModelCaller = (request: ModelCallRequest) => Promise<ModelCallResponse>; @@ -239,6 +351,24 @@ export interface ConsensusOptions { randomSeed?: number; /** Propagates cancellation to every ModelCaller and aborts the loop. */ signal?: AbortSignal; + /** + * Host-supplied tool executor. When set, the engine drives the tool-call + * loop for participants whose `tools` list is non-empty: dispatches each + * `ToolCall` returned by the model, feeds results back via + * `ModelCallRequest.toolCallTurns`, and emits `toolCallStart` / + * `toolCallComplete` / `toolError` events. + * + * When omitted, the engine ignores any `toolCalls` in `ModelCallResponse` + * and treats `content` as the participant's final turn — exact 0.10 behaviour. + */ + toolExecutor?: ToolExecutor; + /** + * Maximum tool-loop iterations per participant turn. After this many + * round-trips through the executor, the engine breaks out and uses the + * last response's `content` as the participant's turn — even if the model + * still wants to call more tools. Defaults to 8. Bounded to [1, 32]. 
+ */ + maxToolIterations?: number; } // ───────────────────────────────────────────────────────────── @@ -319,6 +449,39 @@ export interface FinalResultEvent { result: ConsensusResult; } +// ── Tool-calling events ───────────────────────────────────── + +export interface ToolCallStartEvent { + participantId: string; + round: number; + phase: Phase; + /** 1-based iteration counter within this participant's tool loop. */ + iteration: number; + call: ToolCall; +} + +export interface ToolCallCompleteEvent { + participantId: string; + round: number; + phase: Phase; + iteration: number; + call: ToolCall; + durationMs: number; + /** True when the executor returned `{ content }`; false when it returned `{ error }`. */ + ok: boolean; + /** Truncated preview of the result payload (first 200 chars). */ + preview: string; +} + +export interface ToolErrorEvent { + participantId: string; + round: number; + phase: Phase; + iteration: number; + call: ToolCall; + error: string; +} + // ───────────────────────────────────────────────────────────── // Event map (for typed EventEmitter) // ───────────────────────────────────────────────────────────── @@ -336,6 +499,9 @@ export interface ConsensusEventMap { synthesisToken: (event: SynthesisTokenEvent) => void; synthesisComplete: (event: SynthesisCompleteEvent) => void; finalResult: (event: FinalResultEvent) => void; + toolCallStart: (event: ToolCallStartEvent) => void; + toolCallComplete: (event: ToolCallCompleteEvent) => void; + toolError: (event: ToolErrorEvent) => void; error: (error: Error) => void; }