Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ jobs:
- name: Install
run: npm ci

- name: Checkout Python fixture source
# TS 0.5.2 fixtures include cases added after Python v0.5.2; pin to v0.6.0
# to keep fixture drift checks aligned with the current TS fixture corpus.
run: git clone --depth 1 --branch v0.6.0 https://github.com/rlippmann/context-compiler ../context-compiler

- name: Check fixture drift
run: npm run fixtures:check

- name: Build
run: npm run build

Expand Down
24 changes: 24 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,30 @@ A change is correct only if all fixtures pass.

Do not modify fixtures to make tests pass.

Fixture updates are allowed only when syncing from the authoritative Python source for the targeted compatibility line.

If synced fixtures introduce failures, fix TypeScript behavior rather than editing fixture expectations.

## Test Coverage Expectations

Before opening a PR, consider:

- Does this change affect any user-facing behavior?
- If so, is that behavior covered by tests?

User-facing behavior includes:

- Engine decision outcomes (`kind`, `prompt_to_user`, and returned `state`)
- Checkpoint export/import and continuation behavior
- Clarify/confirmation flows (`yes` / `no`)
- Transcript replay behavior and compaction-related behavior
- Integration behavior (examples, demo runner, and integration scripts)
- Integration error-path normalization

If a user-facing behavior is changed or introduced, add or update tests to cover it.

Do not rely solely on coverage metrics.

## Workflow

Typical workflow:
Expand Down
20 changes: 20 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,26 @@ All changes must:
- Exact match when a fixture specifies a string
- Non-empty string when a fixture uses `null`

## Test Coverage Expectations

Before opening a PR, consider:

- Does this change affect any user-facing behavior?
- If so, is that behavior covered by tests?

User-facing behavior includes:

- Engine decision outcomes (`kind`, `prompt_to_user`, and returned `state`)
- Checkpoint export/import and continuation behavior
- Clarify/confirmation flows (`yes` / `no`)
- Transcript replay behavior and compaction-related behavior
- Integration behavior (examples, demo runner, and integration scripts)
- Integration error-path normalization

If a user-facing behavior is changed or introduced, add or update tests to cover it.

Do not rely solely on coverage metrics.

## What Not to Do

Do not:
Expand Down
149 changes: 149 additions & 0 deletions demos/llm_client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ export type LLMConfig = {
model: string;
};

const DEMO_MOCK_ENV_VAR = 'CONTEXT_COMPILER_DEMO_MOCK';

export class MissingDemoConfigError extends Error {
readonly missing: string[];
readonly baseUrl: string | null;
Expand Down Expand Up @@ -85,6 +87,149 @@ function endpointFor(baseUrl: string | null): string {
return `${root}/chat/completions`;
}

/**
 * Whether demo mock mode is enabled via the mock environment variable.
 * Accepts the usual truthy spellings ("1", "true", "yes", "on"),
 * case-insensitively and ignoring surrounding whitespace.
 */
function isDemoMockEnabled(): boolean {
  const truthyValues = ['1', 'true', 'yes', 'on'];
  const normalized = (process.env[DEMO_MOCK_ENV_VAR] ?? '').trim().toLowerCase();
  return truthyValues.includes(normalized);
}

/**
 * Parse a comma-separated policy list into normalized item names.
 * Each entry is trimmed and lowercased; empty entries and the literal
 * "(none)" placeholder are dropped.
 */
function splitItems(raw: string): string[] {
  const items: string[] = [];
  for (const piece of raw.split(',')) {
    const normalized = piece.trim().toLowerCase();
    if (normalized !== '' && normalized !== '(none)') {
      items.push(normalized);
    }
  }
  return items;
}

/**
 * Extract the compiled-state fields (premise plus use/prohibit policy
 * item lists) from a system prompt rendered by the context compiler.
 * Fields absent from the prompt yield null / empty lists.
 */
function parseCompiledState(systemPrompt: string): {
  premise: string | null;
  useItems: string[];
  prohibitItems: string[];
} {
  // Pull the captured value of a "- label: value" line, or null if absent.
  const extract = (pattern: RegExp): string | null => {
    const match = systemPrompt.match(pattern);
    return match ? match[1] : null;
  };

  const premiseRaw = extract(/^- premise:\s*(.+)$/im);
  const useRaw = extract(/^- use policy items:\s*(.+)$/im);
  const prohibitRaw = extract(/^- prohibited policy items:\s*(.+)$/im);

  return {
    premise: premiseRaw === null ? null : premiseRaw.trim(),
    useItems: useRaw === null ? [] : splitItems(useRaw),
    prohibitItems: prohibitRaw === null ? [] : splitItems(prohibitRaw)
  };
}

/**
 * Scan user messages for premise directives ("set premise X" or
 * "change premise to X") and report both the first and the most recent
 * value seen, or null when no directive is present.
 */
function parseDirectivePremises(messages: Message[]): { first: string | null; latest: string | null } {
  const directiveRe = /^\s*(?:set premise|change premise to)\s+(.+?)\s*$/i;

  const premises = messages
    .filter((message) => message.role === 'user')
    .map((message) => message.content.match(directiveRe))
    .filter((match): match is RegExpMatchArray => match !== null)
    .map((match) => match[1].trim());

  return {
    first: premises.length > 0 ? premises[0] : null,
    latest: premises.length > 0 ? premises[premises.length - 1] : null
  };
}

/**
 * Pick the premise the mock completion should echo. A compiled premise
 * (when set) wins outright. Otherwise "strong" directive-prioritizing
 * prompts honor the most recent user directive, while weaker prompts
 * drift toward whatever the user mentioned, then the first directive.
 */
function chooseMockPremise(messages: Message[], systemPrompt: string, compiledPremise: string | null): string {
  if (compiledPremise && compiledPremise !== '(unset)') {
    return compiledPremise;
  }

  const directives = parseDirectivePremises(messages);
  const userText = messages
    .filter((message) => message.role === 'user')
    .map((message) => message.content)
    .join('\n')
    .toLowerCase();
  const strongPromptRe = /prioritize explicit user directives|careful assistant|first line must be exactly premise:/i;

  if (strongPromptRe.test(systemPrompt)) {
    // Strong prompts are expected to track the latest directive.
    return directives.latest ?? directives.first ?? 'vegan curry';
  }

  // Weak prompts get pulled by incidental mentions before directives.
  if (userText.includes('beef stew')) {
    return 'beef stew';
  }
  return directives.first ?? directives.latest ?? 'vegetarian curry';
}

/**
 * Produce a deterministic, offline stand-in for an LLM chat completion.
 *
 * The reply is selected by matching literal instruction patterns in the
 * accumulated user text (e.g. "first line must be TOOL:<docker|kubectl>")
 * and by the compiled policy state parsed from the system prompt. Branch
 * order matters: earlier, more specific instruction formats take
 * precedence over the generic recipe/plan fallback.
 */
function mockCompletion(messages: Message[]): string {
  // Gather the system prompt and all user text; matching below is done
  // against the joined user messages, not individual turns.
  const systemPrompt = messages.find((message) => message.role === 'system')?.content ?? '';
  const allUserText = messages
    .filter((message) => message.role === 'user')
    .map((message) => message.content)
    .join('\n');
  const lowered = allUserText.toLowerCase();
  const compiled = parseCompiledState(systemPrompt);
  const premise = chooseMockPremise(messages, systemPrompt, compiled.premise);
  // "Compiled" mode means the prompt carries authoritative compiled state;
  // several branches behave more strictly when it is set.
  const isCompiled = /follow authoritative compiled state exactly\./i.test(systemPrompt);

  // Trivial echo instruction used by smoke checks.
  if (/reply with exactly:\s*ok/i.test(allUserText)) {
    return 'OK';
  }

  // Tool-selection format: prefer kubectl only when compiled state exists
  // and does not prohibit it; otherwise fall back to docker.
  if (/first line must be tool:<docker\|kubectl>/i.test(allUserText)) {
    const tool = isCompiled && !compiled.prohibitItems.includes('kubectl') ? 'kubectl' : 'docker';
    return [`TOOL:${tool}`, `ACTION:Use ${tool} to deploy the service.`].join('\n');
  }

  // Clarify-vs-proceed format: only compiled mode detects the
  // contradictory peanut directives and asks for clarification.
  if (/first line must be action:<clarify\|proceed>/i.test(allUserText)) {
    const contradictoryPeanutDirectives = lowered.includes('prohibit peanuts') && lowered.includes('use peanuts');
    if (isCompiled && contradictoryPeanutDirectives) {
      return 'ACTION:clarify\nRequest is contradictory; please confirm policy.';
    }
    return 'ACTION:proceed\nProceeding with a best-effort interpretation.';
  }

  // Generic content requests: detect which output shape the prompt wants.
  const wantsPremiseTag = /first line must be premise:<value>/i.test(allUserText);
  const wantsRecipe = /\b(recipe|ingredients|steps|curry)\b/i.test(allUserText);
  const wantsPlan = /\b(plan|shopping list|dinner)\b/i.test(allUserText);
  const blocksPeanuts = compiled.prohibitItems.includes('peanuts') || compiled.prohibitItems.includes('peanut');

  // Policy-compliant refusal: recipe requested while peanuts are
  // prohibited — substitute a fixed peanut-free recipe.
  if (wantsRecipe && blocksPeanuts) {
    const prefix = wantsPremiseTag ? [`PREMISE:${premise}`] : [];
    return [
      ...prefix,
      'I cannot provide a peanut recipe because it conflicts with policy.',
      'Ingredients:',
      '- chickpeas',
      '- coconut milk',
      '- garlic',
      'Steps:',
      '1. Saute garlic.',
      '2. Simmer chickpeas in coconut milk.',
      '3. Serve hot.'
    ].join('\n');
  }

  // Default plan/recipe reply built around the chosen premise; the
  // PREMISE: tag line is only emitted when explicitly requested.
  if (wantsPremiseTag || wantsPlan || wantsRecipe) {
    const lines: string[] = [];
    if (wantsPremiseTag) {
      lines.push(`PREMISE:${premise}`);
    }
    lines.push('Shopping list:');
    lines.push('- onions');
    lines.push('- tomatoes');
    lines.push(`- ${premise}`);
    lines.push('Steps:');
    lines.push(`1. Prepare a ${premise} base.`);
    lines.push('2. Simmer until flavors combine.');
    lines.push('3. Serve warm.');
    return lines.join('\n');
  }

  // Nothing matched: minimal acknowledgment.
  return 'OK';
}

function parseRetryAfterSeconds(headers: Headers): number | null {
const raw = headers.get('retry-after') ?? headers.get('Retry-After');
if (!raw) {
Expand Down Expand Up @@ -200,6 +345,10 @@ export async function completeMessages(
delaySeconds?: number;
}
): Promise<string> {
if (isDemoMockEnabled()) {
return mockCompletion(messages);
}

const config = loadConfig();
const targetModel = options?.model ?? config.model;
const configuredDelay = options?.delaySeconds && options.delaySeconds > 0 ? options.delaySeconds : defaultLlmDelaySeconds;
Expand Down
7 changes: 4 additions & 3 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 20 additions & 9 deletions tests/demos-smoke.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ import { beforeAll, describe, expect, it } from 'vitest';
const ROOT = resolve(process.cwd());
const DIST_DEMOS = resolve(ROOT, 'dist', 'demos');

const HAS_DEMO_ENV = Boolean(process.env.OPENAI_API_KEY && process.env.MODEL);

function runNodeScript(file: string, args: string[] = [], envOverride?: Record<string, string | undefined>) {
const script = resolve(DIST_DEMOS, file);
const run = spawnSync(process.execPath, [script, ...args], {
Expand All @@ -25,6 +23,16 @@ function runNodeScript(file: string, args: string[] = [], envOverride?: Record<s
};
}

function runNodeScriptWithMock(file: string, args: string[] = []) {
  // Run a demo script in deterministic mock mode so CI smoke runs stay
  // stable, fast, and offline (no real LLM calls). Existing credentials
  // are passed through; blank/unset ones get placeholder values.
  const keepOrDefault = (value: string | undefined, fallback: string): string =>
    value && value.trim() !== '' ? value : fallback;

  return runNodeScript(file, args, {
    CONTEXT_COMPILER_DEMO_MOCK: '1',
    OPENAI_API_KEY: keepOrDefault(process.env.OPENAI_API_KEY, 'mock-key'),
    MODEL: keepOrDefault(process.env.MODEL, 'mock-model')
  });
}

describe('demos smoke', () => {
beforeAll(() => {
const build = spawnSync('npm', ['run', 'build'], {
Expand All @@ -42,13 +50,12 @@ describe('demos smoke', () => {
MODEL: ''
});
expect(run.status).toBe(2);
expect(run.stderr.trim()).toBe('');
expect(run.stdout).toContain('Unable to run LLM demos: missing model configuration.');
expect(run.stdout).toContain('Missing variables: OPENAI_API_KEY, MODEL');
});

const describeWhenConfigured = HAS_DEMO_ENV ? describe : describe.skip;

describeWhenConfigured('with configured llm env', () => {
describe('with configured llm env or demo mock', () => {
it('runs scored demos with comparative markers', () => {
const demos = [
['01_llm_contradiction_clarify.js', '01_contradiction_block'],
Expand All @@ -60,8 +67,9 @@ describe('demos smoke', () => {
] as const;

for (const [file, marker] of demos) {
const run = runNodeScript(file);
const run = runNodeScriptWithMock(file);
expect(run.status).toBe(0);
expect(run.stderr.trim()).toBe('');
expect(run.stdout).toContain(marker);
expect(run.stdout).toContain('baseline:');
expect(run.stdout).toContain('compiler:');
Expand All @@ -73,8 +81,9 @@ describe('demos smoke', () => {
}, 180_000);

it('runs informational demo 06 with compaction markers', () => {
const run = runNodeScript('06_llm_context_compaction.js');
const run = runNodeScriptWithMock('06_llm_context_compaction.js');
expect(run.status).toBe(0);
expect(run.stderr.trim()).toBe('');
expect(run.stdout).toContain('06_context_compaction');
expect(run.stdout).toContain('context scaling:');
expect(run.stdout).toContain('compacted transcript:');
Expand All @@ -85,14 +94,16 @@ describe('demos smoke', () => {
}, 120_000);

it('runs demo runner for single and all with summary markers', () => {
const single = runNodeScript('run_demo.js', ['1']);
const single = runNodeScriptWithMock('run_demo.js', ['1']);
expect(single.status).toBe(0);
expect(single.stderr.trim()).toBe('');
expect(single.stdout).toContain('01_contradiction_block');
expect(single.stdout).toContain('baseline:');
expect(single.stdout).toContain('compiler:');

const all = runNodeScript('run_demo.js', ['all']);
const all = runNodeScriptWithMock('run_demo.js', ['all']);
expect(all.status).toBe(0);
expect(all.stderr.trim()).toBe('');
expect(all.stdout).toContain('Summary:');
expect(all.stdout).toContain('Evaluative demos:');
expect(all.stdout).toContain('Baseline results:');
Expand Down
Loading
Loading