diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..4276912 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,40 @@ +# scripts/ + +One-off diagnostic and verification scripts that don't fit inside the +test suite — usually because they talk to a real Anthropic endpoint and +cost tokens, or they exist to debug a specific proxy/gateway quirk. + +These scripts are deliberately kept outside `packages/*/test/` so they +don't run under `npm test`. Run them manually with `node scripts/<script>.mjs`. + +## Current scripts + +| Script | Purpose | +|---|---| +| [verify-prompt-cache.mjs](verify-prompt-cache.mjs) | Issues three back-to-back non-streaming requests with `cache_control: ephemeral` and reports whether `cache_creation_input_tokens` / `cache_read_input_tokens` flip as expected. Useful for distinguishing "myagent isn't sending cache_control" from "the configured endpoint silently ignores cache_control." | + +## Gateway quirks discovered + +If you're using a non-official Anthropic-compatible proxy and `myagent +usage` is reporting all zeros, suspect one of these (both +observed on `claude.proai.love` during M1.5b verification): + +1. **Streaming usage returns zeros.** Some proxies report + `input_tokens: 0`, `output_tokens: 0`, etc. in `message_start` / + `message_delta` even though the non-streaming endpoint returns real + numbers for the same request. myagent uses streaming exclusively in + the agent loop, so it will display zeros in `myagent usage` on such + proxies. `verify-prompt-cache.mjs` uses non-streaming on purpose to + sidestep this and reveal real numbers. + +2. **`cache_control` is silently dropped.** A proxy may accept the + marker (no error response) but not pass it through to Anthropic, so + `cache_creation_input_tokens` / `cache_read_input_tokens` stay zero + on every call regardless of payload size or repetition. + `verify-prompt-cache.mjs` prints a `[probe] WARNING …` line in this + case. 
+ +myagent's request-side caching plumbing is correct against Anthropic's +official `api.anthropic.com` endpoint. If a proxy zeros things out, the +agent will still work — there will just be no cost savings and no +visibility into cache hits. diff --git a/scripts/verify-prompt-cache.mjs b/scripts/verify-prompt-cache.mjs new file mode 100644 index 0000000..ea4d958 --- /dev/null +++ b/scripts/verify-prompt-cache.mjs @@ -0,0 +1,140 @@ +// Verify Anthropic prompt caching against the configured endpoint. +// +// Reads ANTHROPIC_API_KEY and ANTHROPIC_BASE_URL from .env, falling back to +// process.env (mirroring the CLI's loader, so it also works under a wrapper +// that sets these vars), applies the same /v1-stripping normalization the CLI does, +// and issues three back-to-back non-streaming requests with identical +// system content and a `cache_control: ephemeral` marker. The expected +// pattern when the endpoint actually supports prompt caching is: +// +// call 1: cache_creation_input_tokens > 0, cache_read_input_tokens = 0 +// call 2: cache_creation_input_tokens = 0, cache_read_input_tokens > 0 +// call 3: same as call 2 +// +// Why non-streaming: at least one popular proxy (claude.proai.love at the +// time of writing) returns zeroed usage in streaming responses but real +// numbers in non-streaming. Caching works the same way at the model end, +// but with streaming we'd see all zeros even on a successful hit. Use +// this script (non-streaming) to verify the cache itself; rely on +// `myagent usage` for the agent loop's per-turn breakdown +// once you have a backend that returns usage in streaming events. 
//
// Usage:
//   node scripts/verify-prompt-cache.mjs
import Anthropic from "@anthropic-ai/sdk";

import { readFileSync, existsSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";

/**
 * Parse a dotenv-style file into a plain `{ NAME: value }` object.
 *
 * Only lines of the form `NAME = value` with an upper-case/underscore name
 * are recognised; everything else (comments, blank lines) is skipped. A value
 * wrapped in one matching pair of single or double quotes is unquoted.
 *
 * @param {string} path - Absolute path of the .env file.
 * @returns {Record<string, string>} Parsed variables; empty when the file is absent.
 */
function loadDotEnv(path) {
  const vars = {};
  if (!existsSync(path)) return vars;

  for (const line of readFileSync(path, "utf8").split(/\r?\n/)) {
    const m = /^\s*([A-Z_]+)\s*=\s*(.*)$/.exec(line);
    if (!m) continue;
    let value = m[2].trim();
    const quote = value[0];
    // Require at least two characters so a lone quote character is not
    // mistaken for an (empty) quoted string.
    if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1);
    }
    vars[m[1]] = value;
  }
  return vars;
}

const here = dirname(fileURLToPath(import.meta.url));
const envPath = join(here, "..", ".env");
const envVars = loadDotEnv(envPath);

/**
 * Look a setting up in .env first, then in process.env.
 *
 * An empty string counts as "not set", so a blank `NAME=` line in .env does
 * not shadow a real value exported by the surrounding shell or wrapper.
 *
 * @param {string} name - Variable name.
 * @returns {string | undefined} The configured value, or undefined when unset.
 */
function readSetting(name) {
  return envVars[name] || process.env[name] || undefined;
}

const apiKey = readSetting("ANTHROPIC_API_KEY");
const rawBaseURL = readSetting("ANTHROPIC_BASE_URL");

// Mirror packages/core/src/anthropic.ts normalizeAnthropicBaseURL — strip a
// trailing /v1 so the SDK doesn't double it.
function normalize(baseURL) {
  if (!baseURL) return undefined;
  const trimmed = baseURL.replace(/\/+$/, "");
  return trimmed.endsWith("/v1") ? trimmed.slice(0, -"/v1".length) : trimmed;
}
const baseURL = normalize(rawBaseURL);

if (!apiKey) {
  console.error("[probe] ANTHROPIC_API_KEY missing (checked .env and process.env)");
  process.exit(1);
}
console.log("[probe] baseURL =", baseURL || "(SDK default)");

const client = new Anthropic({ apiKey, baseURL });

// Pad the system prompt past Anthropic's minimum cache write threshold
// (1024 tokens for Sonnet/Opus; 2048 for Haiku). ~22k chars of repeated
// filler is reliably above that.
const filler = Array.from(
  { length: 60 },
  (_, i) =>
    `Paragraph ${i + 1}: This is filler context describing operational policy for the agent. ` +
    `The agent should remain concise, deferential, and grounded. It must not hallucinate. ` +
    `It should prefer to ask clarifying questions when input is ambiguous. ` +
    `It should always report uncertainty rather than fabricate. ` +
    `It should keep responses under fifty words unless otherwise requested.`
).join(" ");

// The same request object is sent on every call: the cached prefix must be
// identical across calls for a cache read to be possible.
const request = {
  // Resolved like the credentials: .env first, then process.env.
  model: readSetting("MYAGENT_MODEL") ?? "claude-sonnet-4-6",
  max_tokens: 60,
  system: [
    {
      type: "text",
      text: `You are a concise assistant. ${filler}`,
      cache_control: { type: "ephemeral" }
    }
  ],
  messages: [{ role: "user", content: "Say hi in 4 words." }]
};

/**
 * Send the shared request once and print a one-line usage summary.
 *
 * @param {string} label - Prefix for the log line, e.g. "call 1".
 * @returns {Promise<object>} The response's `usage` object ({} when absent).
 */
async function callOnce(label) {
  const startedAt = Date.now();
  const response = await client.messages.create(request);
  const elapsedMs = Date.now() - startedAt;
  const usage = response.usage ?? {};
  console.log(
    `[${label}] ${elapsedMs}ms ` +
      `in=${usage.input_tokens ?? "?"} ` +
      `out=${usage.output_tokens ?? "?"} ` +
      `cache_w=${usage.cache_creation_input_tokens ?? 0} ` +
      `cache_r=${usage.cache_read_input_tokens ?? 0}`
  );
  return usage;
}

/**
 * Turn the usage objects of the three calls into the verdict line to print.
 *
 * @param {object[]} usages - Usage objects in call order (call 1 first).
 * @returns {string} Human-readable verdict.
 */
function describeCacheOutcome(usages) {
  const wrote = (usage) => (usage?.cache_creation_input_tokens ?? 0) > 0;
  const read = (usage) => (usage?.cache_read_input_tokens ?? 0) > 0;

  const [first, ...followUps] = usages;
  const cacheHit = followUps.some(read);

  if (wrote(first) && cacheHit) {
    return "[probe] OK — prompt caching is working: call 1 wrote, calls 2-3 read.";
  }
  if (cacheHit) {
    return "[probe] OK — calls 2-3 hit a pre-existing cache entry.";
  }
  if (usages.some(wrote)) {
    // A write without any subsequent read is a different failure from
    // "cache_control ignored"; reporting it as "no tokens observed" would
    // contradict the per-call lines printed just above.
    return (
      "[probe] WARNING — cache_creation tokens were reported but no later call reported " +
      "cache_read tokens. The endpoint accepted cache_control and wrote an entry, but " +
      "follow-up requests did not hit it (the entry may have expired, or requests may be " +
      "routed to backends that do not share a cache)."
    );
  }
  return (
    "[probe] WARNING — no cache_creation or cache_read tokens observed on any call. " +
    "The endpoint is silently ignoring cache_control. Anthropic's official endpoint " +
    "supports prompt caching; some proxies do not pass the markers through."
  );
}

try {
  console.log("[probe] system prompt length:", request.system[0].text.length, "chars");
  // Sequential on purpose: each call must finish before the next one starts
  // so the later calls can observe the entry written by the first.
  const usage1 = await callOnce("call 1");
  const usage2 = await callOnce("call 2");
  const usage3 = await callOnce("call 3");

  console.log("");
  console.log(describeCacheOutcome([usage1, usage2, usage3]));
} catch (error) {
  console.error("[probe] FAILED");
  console.error("  name:    ", error?.constructor?.name);
  console.error("  message: ", error?.message);
  console.error("  status:  ", error?.status);
  console.error("  cause:   ", error?.cause?.message ?? error?.cause);
  process.exit(1);
}