diff --git a/.gemini/settings.json b/.gemini/settings.json new file mode 100644 index 00000000..022fc294 --- /dev/null +++ b/.gemini/settings.json @@ -0,0 +1,39 @@ +{ + "modelConfigs": { + "customAliases": { + "shipwright-gemini-low": { + "extends": "chat-base-2.5", + "modelConfig": { + "model": "gemini-2.5-flash-lite", + "generateContentConfig": { + "thinkingConfig": { + "thinkingBudget": 512 + } + } + } + }, + "shipwright-gemini-medium": { + "extends": "chat-base-2.5", + "modelConfig": { + "model": "gemini-2.5-flash-lite", + "generateContentConfig": { + "thinkingConfig": { + "thinkingBudget": 512 + } + } + } + }, + "shipwright-gemini-high": { + "extends": "chat-base-2.5", + "modelConfig": { + "model": "gemini-2.5-flash-lite", + "generateContentConfig": { + "thinkingConfig": { + "thinkingBudget": 2048 + } + } + } + } + } + } +} diff --git a/.gitignore b/.gitignore index cddc7ffe..46e0ef9f 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,6 @@ shipwright-sync.sh node_modules/ dist/ .DS_Store +docs/outreach/ +slack-agent/*.log +tmp-*.txt diff --git a/AGENTS.md b/AGENTS.md index b6259918..01fe3ea2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,6 +2,32 @@ Use these instructions when Codex is being used as Shipwright inside this repository. +## Shipwright Identity + +Shipwright is a product-management and business-analysis system. Its job is to produce decision-ready artifacts such as market research briefs, pricing analyses, PRDs, strategy memos, launch plans, customer intelligence syntheses, and executive updates. + +Shipwright is not a generic brainstorming toy. Favor evidence, tradeoffs, and explicit recommendations over vague advice, generic startup tropes, or motivational filler. 
+ +When the user is asking for PM, strategy, pricing, discovery, research, or business-analysis help, optimize for: + +- decision quality +- evidence quality +- explicit tradeoffs +- clarity about uncertainty +- a useful next action or next artifact + +## Quality Bar + +A good Shipwright artifact should usually do most of the following: + +- name the decision or question directly +- distinguish evidence from inference +- make tradeoffs and alternatives explicit +- identify the biggest unknowns or assumptions +- recommend a next step, not just describe the situation + +Default to direct, professional prose. Do not pad the work with generic framing, product-marketing language, or content that merely sounds strategic. + ## When Shipwright Mode Applies Treat plain-language PM and business requests as Shipwright work. Common examples: @@ -13,6 +39,20 @@ Treat plain-language PM and business requests as Shipwright work. Common example If the user is modifying Shipwright itself or asking an ordinary software-engineering question about this repo, stay in normal coding mode. +## Repo Map + +Use the repo structure to ground the work before inventing a new approach: + +- `skills/` contains the authoritative Shipwright frameworks and methods +- `.codex/skills/shipwright-concierge/` is the default entry point for plain-language Shipwright requests +- `.codex/skills/shipwright-research-brief/` is the default companion for fresh public-web research work +- `manifest.json` and `skills-map.md` help with routing across Shipwright capabilities +- `schemas/` contains artifact and benchmark validation contracts +- `benchmarks/` contains benchmark scenarios, fixtures, baselines, and run outputs +- `docs/` contains specs, scoring references, and review exchanges + +If a relevant Shipwright framework already exists in this repo, prefer it over inventing a new structure from scratch. + ## Conversational Routing - Do not require slash commands. Plain English should work. 
@@ -23,6 +63,22 @@ If the user is modifying Shipwright itself or asking an ordinary software-engine - For Shipwright-style PM requests, first load `.codex/skills/shipwright-concierge/SKILL.md`. - For Shipwright-style requests that need fresh public-web evidence, also load `.codex/skills/shipwright-research-brief/SKILL.md`. +## Routing Heuristics + +Use the smallest credible framework that fits the ask. Helpful defaults: + +- market sizing, TAM/SAM/SOM, attractiveness: `market-sizing` +- market/competitor research: `competitive-landscape` +- pricing or packaging: `pricing-strategy` +- build vs buy or vendor comparison: `build-vs-buy-analysis` +- strategy memo or strategic options: `product-strategy-session` +- executive memo or board-ready brief: `executive-briefing` +- PRD or detailed requirements: `prd-development` +- prioritization tradeoffs: `prioritization-advisor` +- customer research synthesis: `user-research-synthesis` + +If the user asks in plain English, route silently. Do not force them to speak in framework names. + ## Public-Web Research Protocol When fresh public-web evidence is needed, this protocol is mandatory: @@ -54,6 +110,16 @@ When fresh public-web evidence is needed, this protocol is mandatory: - Return findings inline unless the user explicitly asks for a saved file. - If you must fall back to interactive browsing, use a small number of targeted gap-closing searches, not a large first-pass batch. +## Domain Guardrails + +- Do not present unsupported claims as facts. +- Do not blur sourced facts with your own synthesis; mark the difference clearly. +- Do not default to generic advice when repo-native frameworks or evidence are available. +- Do not skip the local research collector when fresh public-web evidence is required and the collector is usable. +- Do not invent customer quotes, market data, pricing, or competitor capabilities. 
+- Do not produce “balanced” summaries that avoid making a recommendation when the user is clearly asking for a decision. +- Do not overfit to a framework if the user’s actual question is narrower; use only the parts that help. + ## Helpful Default Mappings - Business attractiveness / market viability: @@ -78,3 +144,10 @@ For substantial Shipwright artifacts, preserve the Shipwright closing blocks: - `Unknowns & Evidence Gaps` - `Pass/Fail Readiness` - `Recommended Next Artifact` + +When they fit the task, these blocks should be substantive rather than ceremonial: + +- `Decision Frame`: the actual choice or judgment call +- `Unknowns & Evidence Gaps`: what would most change the recommendation +- `Pass/Fail Readiness`: what conditions make the recommendation actionable now +- `Recommended Next Artifact`: the specific next memo, analysis, plan, or experiment that should exist diff --git a/agents/orchestrator.md b/agents/orchestrator.md index 33283aa9..c64d9d50 100644 --- a/agents/orchestrator.md +++ b/agents/orchestrator.md @@ -26,6 +26,45 @@ You are Shipwright's concierge — the first point of contact for product manage - **Fast:** Direct execution for high-confidence obvious asks that map cleanly to one workflow or one skill, require no external research, and do not trigger escalation rules. - **Rigorous:** Planning-first execution for high-stakes, research-heavy, cross-workflow, or externally-facing work. +## Judge Escalation Awareness + +When a workflow uses evaluators or judges, treat judge outputs as routing signals rather than universal truth. + +- Default to the lightest judge path that still protects decision quality. +- Do not default to triple-panel judging for every artifact. +- Escalate from one judge to more judges only when ambiguity, contradiction risk, or disagreement is itself valuable signal. + +Use the following practical policy: + +- Stay on a single judge when the verdict is high-confidence, low-stakes, and unflagged. 
+- Escalate to a second judge when the verdict is a tie, low-confidence, needs human review, or the artifact is contradiction-heavy / boundary-heavy. +- Escalate to a triple panel when: + - two judges disagree + - the case is materially high-stakes or benchmark-defining + - the disagreement itself is important evidence + +Use the following default model-routing policy: + +- Default single runtime judge: `GPT` +- Default two-judge contrast panel: `Claude + GPT` +- Default triple panel: `Claude + GPT + Gemini` +- Treat `Gemini` primarily as an escalation judge, ambiguity detector, or third-panel perspective rather than the default solo runtime judge. + +Recommended model choice by case: + +- Low-stakes or routine screening: start with `GPT` +- Contradiction-heavy or boundary-heavy artifacts: start with `GPT`, then add `Gemini` and a contrast judge if needed +- Strategy-heavy or leadership-facing artifacts: prefer `Claude + GPT`, add `Gemini` when disagreement is informative +- Benchmark or judge-behavior research: use `Claude + GPT + Gemini` + +If a judge returns tie or low confidence, prefer asking: + +- what evidence is missing +- what questions would resolve uncertainty +- what next artifact should be produced + +Do not treat a tie as "done" when it can instead be routed into evidence-gathering, a lighter precursor artifact, or targeted human review. + If `scripts/route-request.mjs` exists, use it with Bash before deciding whether the request qualifies for Fast mode: ```bash diff --git a/benchmarks/AGENTS.md b/benchmarks/AGENTS.md new file mode 100644 index 00000000..9762a102 --- /dev/null +++ b/benchmarks/AGENTS.md @@ -0,0 +1,79 @@ +# Benchmarks Area Guidance + +Use these instructions when working anywhere under `benchmarks/`. + +## Purpose + +This directory holds benchmark scenarios, fixtures, baselines, review artifacts, and run outputs for Shipwright evaluation work. 
+ +Optimize for: + +- experimental clarity +- reproducibility +- minimal hidden variance +- accurate bookkeeping + +Benchmark work is methodology work. Small inconsistencies in naming, orientation, inputs, or summary logic can invalidate conclusions. + +## Directory Roles + +- `benchmarks/scenarios/`: canonical scenario definitions +- `benchmarks/fixtures/`: fixture artifacts and expected packet inputs +- `benchmarks/baselines/`: baseline prompts, baseline artifacts, and reference runs +- `benchmarks/results/`: generated run outputs and summaries +- `benchmarks/reviews/`: benchmark-specific review notes if present + +Treat `scenarios/` as the source of truth. Treat `results/` as generated evidence. + +## Working Rules + +- Prefer replaying or rejudging existing completed runs when the goal is to compare judges. Do not rerun both sides unless generation variance is part of the experiment. +- Make role assignment explicit. Side A, Side B, judge family, and orientation should never be implicit in analysis writeups. +- Preserve run artifacts. Do not rewrite or “clean up” generated run outputs unless the user explicitly asks for regeneration. +- When adding summaries, clearly separate completed cells, partial cells, and failed cells. +- Fail closed on unknown scenario IDs, missing comparisons, or incomplete judge matrices. +- Treat new metrics conservatively until they are validated. Heuristic metrics should be labeled as heuristic in code or analysis. + +## Analysis Guardrails + +- Do not present single-run outcomes as stable findings when rerun variance is unmeasured. +- Distinguish: + - generation variance + - judge variance + - position/orientation effects + - family/model effects +- If a matrix is incomplete, say so plainly and avoid strong publishability claims. +- Prefer matched comparisons over aggregate storytelling when the sample is still small. +- If scenario counts, tables, and narrative claims disagree, fix bookkeeping before interpretation. 
+ +## Judge Principles + +When acting as a judge in the conflict harness, follow the protocol already encoded in the judge prompt and schemas. Do not invent a new evaluation philosophy on the fly. + +Useful default principles: + +- Judge the artifacts that were actually produced, not the solution you wish either side had written. +- Judge relatively, not absolutely. One side can win even if both are imperfect. +- Reward evidence discipline, internal consistency, responsiveness to critique, and decision usefulness. +- Penalize unsupported certainty, hidden contradictions, and arguments that sound confident without earning that confidence. +- Treat small margins as genuinely uncertain. Use `needs_human_review` when the result is close or noisy, or when both sides are weak in different ways. +- Do not infer provider identity from tone, stylistic quirks, formatting habits, or priors about model families. +- If both sides miss the core decision or both artifacts are materially weak, reflect that in margin and confidence rather than forcing a theatrical verdict. + +Do not add extra hidden criteria in analysis after the fact. If the judging standard needs to change, version the prompt or protocol explicitly. + +## Scenario Authoring + +When adding or editing scenarios: + +- keep the decision crisp +- keep the evidence packet bounded +- avoid vague “what should the company do?” framing when a narrower board/product decision is available +- prefer evidence-rich cases over lore-heavy cases +- note whether the scenario is synthetic, historical real-world, or current-event real-world + +## Result Hygiene + +- Generated run directories should remain inspectable and diffable. +- Preserve prompt files, input packets, raw outputs, parsed JSON, and summaries together. +- Do not delete failed runs unless the user explicitly asks; failure artifacts are part of the evidence trail. 
diff --git a/benchmarks/scenarios/bayer-breakup-not-now.json b/benchmarks/scenarios/bayer-breakup-not-now.json new file mode 100644 index 00000000..26b52d01 --- /dev/null +++ b/benchmarks/scenarios/bayer-breakup-not-now.json @@ -0,0 +1,38 @@ +{ + "id": "bayer-breakup-not-now", + "title": "Bayer breakup: split now or fix operations first", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Bayer's supervisory board in March 2024. Investors have been pressing Bayer to break itself up by separating its pharmaceuticals, consumer health, and crop science businesses, arguing that the conglomerate structure is destroying value. At the same time, CEO Bill Anderson has concluded that a breakup is not the right move yet. Bayer is dealing with heavy debt, ongoing Roundup / glyphosate litigation inherited from Monsanto, pressure on its pharmaceuticals pipeline, weak crop-science conditions, and the need for major internal simplification.\n\nThe decision is whether Bayer should move ahead with a breakup now to unlock value, or delay any split for 24 to 36 months while focusing on litigation, debt reduction, operating improvement, and management simplification.\n\nWrite a strategic recommendation memo. Take a clear position and defend it with evidence. 
Acknowledge the strongest counterarguments and explain why your position is still correct.\n\nKey evidence available:\n- In March 2024 Bayer said its answer on a breakup was 'not now' rather than 'never'\n- Management said the next 24 to 36 months should focus on improving operating performance, reducing debt, strengthening the pharma pipeline, and addressing litigation\n- Bayer's net debt at the end of 2023 was about 34.5 billion euros, up roughly 8.5%\n- Bayer expected annual cost savings of about 2 billion euros from 2026 through its restructuring efforts\n- Bayer's 2024 EBITDA guidance was lower than 2023 levels, reflecting continued pressure on the business\n- The company was still dealing with tens of thousands of unresolved glyphosate / Roundup cases as well as other Monsanto-related liabilities\n- Bayer's equity value had been badly damaged since the 2018 Monsanto acquisition, prompting renewed investor calls for breakup or asset sales\n- A breakup could reduce conglomerate discount and sharpen capital allocation, but may be difficult while litigation and debt remain unresolved\n- Creditors and execution risk could make a breakup harder or less value-accretive in the near term\n- The labor organization context in Germany and management simplification plans also complicated a near-term separation\n- A 'fix first, split later' approach could improve bargaining position and reduce forced-sale risk, but could also entrench delay and destroy more shareholder trust if operations do not improve", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/bayer-breakup-not-now/first-pass.md", + "final_pass_artifact": 
"../fixtures/bayer-breakup-not-now/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/blockbuster-total-access.json b/benchmarks/scenarios/blockbuster-total-access.json new file mode 100644 index 00000000..7a99fbde --- /dev/null +++ b/benchmarks/scenarios/blockbuster-total-access.json @@ -0,0 +1,38 @@ +{ + "id": "blockbuster-total-access", + "title": "Blockbuster Total Access: continue or kill the hybrid strategy", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Blockbuster's board in Q3 2007. CEO Jim Keyes has just replaced John Antioco and is reviewing the Total Access program — a hybrid online-rental + in-store-return strategy launched in late 2006.\n\nTotal Access is working competitively: Blockbuster added 2 million online subscribers in under a year, Netflix growth stalled for the first time, and Netflix CEO Reed Hastings privately told colleagues he was 'ichiban scared' of the program. Blockbuster's online subscriber count reached ~3 million.\n\nHowever, Total Access is expensive. Each in-store return costs Blockbuster ~$2 in handling and lost rental revenue from the returned disc being re-rented free. The program is burning roughly $400M/year against Blockbuster's already-leveraged balance sheet ($1.1B long-term debt). Blockbuster lost $85M in Q2 2007. Franchisees are hostile — the program cannibalizes their store traffic economics. Activist investor Carl Icahn, who controls 3 board seats, views the online losses as unsustainable.\n\nMeanwhile, Netflix is investing aggressively in streaming infrastructure (launched Watch Instantly in January 2007), betting that physical disc rental is a transitional business. 
Netflix has 7.5M subscribers to Blockbuster's 3M online + 47M store-visit customers.\n\nThe board must decide: should Blockbuster continue funding Total Access at current levels to press the competitive advantage, or scale it back to reduce losses and refocus on store profitability?\n\nWrite a strategic recommendation memo for the board. Your memo must take a clear position — continue or kill — and defend it with evidence. Acknowledge the strongest counterarguments and explain why your position is still correct.\n\nKey evidence available:\n- Total Access added 2M subscribers in <12 months (late 2006 to mid-2007)\n- Netflix growth stalled for the first time during Total Access's peak\n- Program cost: ~$400M/year incremental burn\n- Blockbuster long-term debt: $1.1B; Q2 2007 net loss: $85M\n- Netflix streaming launched January 2007 with 1,000 titles\n- Netflix total subscribers: 7.5M; Blockbuster online: ~3M\n- Blockbuster store footprint: ~5,700 US locations (asset or liability?)\n- Franchisee resistance: independent operators threatened by cannibalization\n- Carl Icahn controls 3 board seats; views online investment as value-destroying\n- DVD-by-mail market projected to peak 2010-2012 then decline\n- Broadband penetration in US: ~50% of households (2007), projected 70%+ by 2010\n- Blockbuster had attempted a streaming deal with Enron Broadband in 2000 (failed)\n- Redbox kiosk expansion accelerating in grocery/convenience stores ($1/night rentals)", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/blockbuster-total-access/first-pass.md", + "final_pass_artifact": "../fixtures/blockbuster-total-access/final-pass.md", + "related_artifacts": [], + 
"blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/board-update-ambiguity.json b/benchmarks/scenarios/board-update-ambiguity.json index ec77ab71..a65418a8 100644 --- a/benchmarks/scenarios/board-update-ambiguity.json +++ b/benchmarks/scenarios/board-update-ambiguity.json @@ -1,6 +1,9 @@ { "id": "board-update-ambiguity", "title": "Board update under ambiguity", + "taxonomy": { + "scenario_type": "executive_ambiguity" + }, "inputs": { "prompt": "Write a strategy update for a board audience under ambiguity.", "context_files": [], diff --git a/benchmarks/scenarios/churn-conflicting-signals.json b/benchmarks/scenarios/churn-conflicting-signals.json index de180f94..e3918950 100644 --- a/benchmarks/scenarios/churn-conflicting-signals.json +++ b/benchmarks/scenarios/churn-conflicting-signals.json @@ -1,6 +1,9 @@ { "id": "churn-conflicting-signals", "title": "Churn diagnosis with conflicting signals", + "taxonomy": { + "scenario_type": "executive_ambiguity" + }, "inputs": { "prompt": "Write a churn reduction PRD when signals conflict with strategy targets.", "context_files": [], diff --git a/benchmarks/scenarios/event-automation-boundary.json b/benchmarks/scenarios/event-automation-boundary.json index 6412b49a..09ddcb33 100644 --- a/benchmarks/scenarios/event-automation-boundary.json +++ b/benchmarks/scenarios/event-automation-boundary.json @@ -1,6 +1,9 @@ { "id": "event-automation-boundary", "title": "Event automation platform with human-in-the-loop boundary constraints", + "taxonomy": { + "scenario_type": "contradiction_or_boundary_prd" + }, "inputs": { "prompt": "Write a Phase 1 PRD for an event management automation platform with a WhatsApp assistant, an internal console, human approval for pricing commitments, and strict deterministic 
boundaries around LLM usage.", "context_files": [], diff --git a/benchmarks/scenarios/feature-weak-evidence.json b/benchmarks/scenarios/feature-weak-evidence.json index 7846bf89..80d11fa4 100644 --- a/benchmarks/scenarios/feature-weak-evidence.json +++ b/benchmarks/scenarios/feature-weak-evidence.json @@ -1,6 +1,9 @@ { "id": "feature-weak-evidence", "title": "New feature with weak evidence", + "taxonomy": { + "scenario_type": "evidence_fragile_prd" + }, "inputs": { "prompt": "Draft a PRD for a new feature with weak supporting evidence.", "context_files": [], diff --git a/benchmarks/scenarios/google-adtech-breakup-remedies.json b/benchmarks/scenarios/google-adtech-breakup-remedies.json new file mode 100644 index 00000000..f21e825b --- /dev/null +++ b/benchmarks/scenarios/google-adtech-breakup-remedies.json @@ -0,0 +1,38 @@ +{ + "id": "google-adtech-breakup-remedies", + "title": "Google ad-tech remedies: fight for conduct fixes or accept structural separation", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Alphabet and Google leadership in May 2025 after a U.S. federal judge found Google illegally maintained monopoly power in two online advertising technology markets and the Department of Justice proposed structural remedies. The DOJ is seeking divestiture of AdX, Google's ad exchange, and DFP / Google Ad Manager, Google's publisher ad-serving platform. Google argues that structural separation goes beyond the court's findings, would be legally excessive, and would harm publishers and advertisers that depend on integrated tooling. 
The company says behavioral remedies such as better interoperability and bid transparency are the right response.\n\nThe strategic question is whether Google should fight aggressively for behavioral remedies only, or proactively move toward structural separation of key ad-tech assets to reduce regulatory overhang and preserve broader strategic freedom.\n\nWrite a strategic recommendation memo. Take a clear position and defend it with evidence. Acknowledge the strongest counterarguments and explain why your position is still correct.\n\nKey evidence available:\n- In April 2025 Judge Leonie Brinkema found Google illegally maintained monopoly power in publisher ad servers and ad exchanges\n- In May 2025 the U.S. DOJ proposed that Google divest AdX and DFP / Google Ad Manager as remedies\n- The DOJ argued that divestitures were necessary to terminate the monopolies and restore competition\n- Google argued that forced divestiture would go beyond the court's findings, have weak legal grounding, and harm publishers and advertisers that rely on integrated tools\n- Google instead supported behavioral remedies such as greater interoperability and access to bidding information\n- A remedies trial was scheduled for later in 2025\n- The ad-tech case arrived on top of broader antitrust pressure on Google, including major search-monopoly proceedings and app-store / platform scrutiny\n- Proactively separating assets could reduce legal overhang, demonstrate good-faith restructuring, and preserve management focus on AI and search competition\n- Fighting for conduct remedies could preserve valuable integration, data advantages, and customer relationships while avoiding a costly breakup of a still-profitable business stack\n- Structural separation could create execution risk, strategic leakage, and near-term revenue pressure\n- But a prolonged legal fight could also extend uncertainty, embolden regulators, and worsen the perception that Google will resist any meaningful reform", + 
"context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/google-adtech-breakup-remedies/first-pass.md", + "final_pass_artifact": "../fixtures/google-adtech-breakup-remedies/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/handoff-contradiction.json b/benchmarks/scenarios/handoff-contradiction.json index 46b5c5fb..30f338c9 100644 --- a/benchmarks/scenarios/handoff-contradiction.json +++ b/benchmarks/scenarios/handoff-contradiction.json @@ -1,6 +1,9 @@ { "id": "handoff-contradiction", "title": "Handoff artifact with cross-document contradictions", + "taxonomy": { + "scenario_type": "contradiction_or_boundary_prd" + }, "inputs": { "prompt": "Write a technical handoff PRD aligned to a platform strategy and challenge review.", "context_files": [], diff --git a/benchmarks/scenarios/intel-foundry-separation.json b/benchmarks/scenarios/intel-foundry-separation.json new file mode 100644 index 00000000..996778c6 --- /dev/null +++ b/benchmarks/scenarios/intel-foundry-separation.json @@ -0,0 +1,38 @@ +{ + "id": "intel-foundry-separation", + "title": "Intel foundry separation: split for capital access or stay integrated", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Intel's board in September 2024 after Intel announced that it would turn its foundry business into a standalone subsidiary with its own operating board and 
the potential to raise outside capital. Intel is trying to execute an expensive turnaround while losing share in key markets and trailing Nvidia in AI acceleration. The foundry strategy is central to CEO Pat Gelsinger's revival plan, but it is also a major drag on profitability and investor confidence.\n\nThe board must decide whether Intel should keep pushing toward structural separation and outside funding for the foundry business, or preserve tighter integration and avoid further complexity while the turnaround is still fragile.\n\nWrite a strategic recommendation memo. Take a clear position and defend it with evidence. Acknowledge the strongest counterarguments and explain why your position is still correct.\n\nKey evidence available:\n- In September 2024 Intel said it would create a separate entity for its foundry business with its own operating board and the ability to evaluate outside funding\n- Intel had spent roughly $25 billion per year on the foundry business in each of the previous two years\n- Intel's stock had lost nearly 60% of its value during 2024 before the announcement\n- Intel disclosed in April 2024 that the foundry business lost about $7 billion in operating income in 2023 on roughly $18.9 billion in sales\n- Intel had said foundry losses would likely peak in 2024 and not break even until sometime before 2030\n- Intel was also pursuing layoffs and a roughly $10 billion cost-reduction plan\n- Intel paused or delayed parts of its fabrication expansion in Poland and Germany while keeping key U.S. projects moving\n- Intel received major U.S. 
government support under the CHIPS Act and additional Secure Enclave funding, making the foundry strategy geopolitically important\n- Intel also announced an expanded custom-chip relationship with Amazon Web Services, supporting the foundry narrative\n- A more independent foundry structure could unlock outside capital, cleaner economics, and strategic optionality for a spin or carve-out\n- But further separation could also add governance complexity, weaken execution focus, and raise questions about whether Intel can still integrate design and manufacturing as a differentiated model", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/intel-foundry-separation/first-pass.md", + "final_pass_artifact": "../fixtures/intel-foundry-separation/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/meta-muse-spark.json b/benchmarks/scenarios/meta-muse-spark.json new file mode 100644 index 00000000..b9cc0c16 --- /dev/null +++ b/benchmarks/scenarios/meta-muse-spark.json @@ -0,0 +1,38 @@ +{ + "id": "meta-muse-spark", + "title": "Meta Muse Spark: keep the frontier model closed or return to the open strategy", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Meta's senior leadership in April 2026, immediately after the April 8, 2026 launch of Muse Spark.\n\nMuse Spark is the first model released by Meta Superintelligence 
Labs (MSL), the new unit built after Meta's 2025 AI reset. Meta says MSL rebuilt its AI stack over the prior nine months and that Muse Spark is now its most powerful model. The model already powers the Meta AI app and website, with rollout planned over the coming weeks to WhatsApp, Instagram, Facebook, Messenger, and Meta's AI glasses. Meta is also offering Muse Spark in private-preview API access to selected partners.\n\nThe strategic break is obvious: Muse Spark is not open-source. Meta says it hopes to open-source future versions, but for now the company's newest frontier model is closed and tightly integrated into Meta products. That is a sharp departure from the Llama strategy, which helped Meta build developer goodwill and ecosystem reach.\n\nLeadership believes the move may be necessary. Meta has spent billions trying to reassert itself in AI, including its June 2025 $14.3B investment in Scale AI and the hiring of Alexandr Wang to help lead its superintelligence push. Fortune reported that Meta's published benchmarks suggest Muse Spark is competitive with leading models from OpenAI, Anthropic, and Google across many tasks, though not clearly superior across the board. Internally, this launch is being treated as proof that Meta is back in the race after Llama 4 (April 2025) was widely seen as disappointing.\n\nBut the closed-model move carries real tradeoffs. Meta's open strategy gave it differentiation against closed rivals, drove broad adoption of Llama, and supported an ecosystem narrative that fit the company's platform DNA. 
Keeping Muse Spark closed may improve product control, monetization, safety, and frontier secrecy, but it may also weaken Meta's developer credibility, blur its strategic identity, and intensify the capex race against firms that are already ahead on flagship closed models.\n\nThe leadership team must decide: should Meta stay the course with a closed, product-first Muse strategy for its frontier models, or should it commit to reopening the roadmap and release open-weight successors quickly even if that slows product integration and monetization?\n\nWrite a strategic recommendation memo. Take a clear position — stay closed or reopen — and defend it with evidence. Acknowledge the strongest counterarguments and explain why your position is still correct.\n\nKey evidence available:\n- Muse Spark launched on April 8, 2026 as the first model from Meta Superintelligence Labs\n- Meta says MSL rebuilt its AI stack over the prior 9 months\n- Muse Spark currently powers the Meta AI app and website\n- Planned rollout in coming weeks: WhatsApp, Instagram, Facebook, Messenger, and AI glasses\n- Muse Spark is available in private-preview API access to selected partners\n- Meta says larger Muse-family models are already in development\n- Meta says it hopes to open-source future versions, but Muse Spark itself is closed\n- Meta positions Muse Spark as purpose-built for Meta products and personal superintelligence use cases\n- Fortune reported Meta's published benchmarks indicate Muse Spark is competitive with frontier models, but not clearly best-in-class across the board\n- Muse Spark is the first major Meta model release since the June 2025 Scale AI deal and Alexandr Wang's hiring into Meta's superintelligence effort\n- Meta's previous differentiator in AI was the open Llama strategy rather than a closed frontier product stack\n- Closed-source frontier models may improve product control, speed of monetization, partner leverage, and model secrecy\n- Open-weight releases may 
strengthen developer trust, ecosystem adoption, and Meta's strategic differentiation versus other frontier labs\n- If Meta keeps the frontier roadmap closed, it is competing more directly on product execution and capital intensity against OpenAI, Anthropic, and Google", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/meta-muse-spark/first-pass.md", + "final_pass_artifact": "../fixtures/meta-muse-spark/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/netflix-qwikster.json b/benchmarks/scenarios/netflix-qwikster.json new file mode 100644 index 00000000..c71ce94b --- /dev/null +++ b/benchmarks/scenarios/netflix-qwikster.json @@ -0,0 +1,38 @@ +{ + "id": "netflix-qwikster", + "title": "Netflix Qwikster: split streaming from DVD or reverse course", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Netflix's leadership team in September 2011. CEO Reed Hastings has just announced Qwikster — a plan to split Netflix into two separate companies: Netflix for streaming and Qwikster for DVD-by-mail. Each would have its own website, billing, and queue.\n\nThe rationale: DVD-by-mail is a declining business (peaked at ~20M subscribers), while streaming is the future. Bundling them creates organizational drag — product decisions for streaming are constrained by DVD logistics. 
Separating them lets each business optimize independently.\n\nHowever, the market reaction has been catastrophic. Netflix stock dropped 77% from its July 2011 peak of $298 to ~$65. The company lost 800,000 subscribers in Q3 2011 after a 60% price increase for the combined plan ($7.99 each, or $15.98 for both, vs $9.99 bundled). Customer anger is at an all-time high — the Netflix blog post announcing Qwikster received 27,000+ comments, overwhelmingly negative. The Qwikster Twitter handle is owned by someone else posting inappropriate content.\n\nKey evidence available:\n- Netflix subscribers: ~23.8M total (Q3 2011), down from 24.6M in Q2 2011\n- DVD-by-mail subscribers declining ~5% per quarter organically\n- Streaming content costs surging: licensing jumped from $180M (2010) to projected $2B (2012)\n- Netflix streaming library: ~20,000 titles vs ~100,000 on DVD\n- Competitors entering streaming: Amazon Prime Video, Hulu Plus, HBO GO all launched 2010-2011\n- International expansion underway: launched in Canada (2010), Latin America and UK (2011)\n- Price increase backlash: 60% increase for combo plan, announced July 2011\n- Customer satisfaction scores dropped from 79 to 52 (ACSI) in one quarter\n- Qwikster would require separate logins, separate queues, separate reviews\n- DVD business still generating significant cash flow (~$1B revenue, high margins)\n- Streaming business operating at a loss, subsidized by DVD profits\n- Hastings publicly apologized for the price increase but defended the strategic logic of separation\n- Wall Street analysts split: some see strategic clarity, most see execution disaster\n\nThe leadership team must decide: proceed with the Qwikster split as announced, or reverse course and keep both services under the Netflix brand?\n\nWrite a strategic recommendation memo. Take a clear position — split or reverse — and defend it with evidence. 
Acknowledge the strongest counterarguments and explain why your position is still correct.", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/netflix-qwikster/first-pass.md", + "final_pass_artifact": "../fixtures/netflix-qwikster/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/nissan-honda-merger-collapse.json b/benchmarks/scenarios/nissan-honda-merger-collapse.json new file mode 100644 index 00000000..feedb53c --- /dev/null +++ b/benchmarks/scenarios/nissan-honda-merger-collapse.json @@ -0,0 +1,38 @@ +{ + "id": "nissan-honda-merger-collapse", + "title": "Nissan-Honda merger collapse: accept subordination or stay independent", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Nissan's board in February 2025, immediately after Nissan and Honda terminated merger talks that could have created the world's third-largest automaker by sales. Nissan is under severe pressure: it has already announced plans to cut 9,000 jobs and reduce global production capacity by 20%, its first-half operating profit fell roughly 90%, and it has been struggling in the U.S. hybrid market and in China. Honda is financially stronger and pushed to change the structure from a joint holding company to one in which Honda would become the parent and Nissan a subsidiary. 
Nissan resisted that proposal because it would effectively concede independence and control.\n\nThe board now faces a difficult question: should Nissan have accepted a Honda-led combination even as a junior partner, or was walking away and preserving independence the right move?\n\nWrite a strategic recommendation memo. Take a clear position and defend it with evidence. Acknowledge the strongest counterarguments and explain why your position is still correct.\n\nKey evidence available:\n- Nissan and Honda announced merger talks in December 2024 and terminated them in February 2025\n- The proposed combination was valued at roughly $60 billion and would have created the world's third-largest automaker by sales\n- Honda proposed shifting the structure from a joint holding company to one with Honda as parent and Nissan as subsidiary\n- Nissan opposed the subsidiary structure on autonomy and control grounds\n- Nissan had already announced 9,000 job cuts and a 20% reduction in global production capacity\n- Nissan's first-half fiscal 2024 operating profit fell about 90% year over year and net income fell about 94%\n- Nissan has struggled in the U.S. 
and China and was seen as needing a turnaround lifeline\n- Honda remained profitable and reported stronger recent results than Nissan\n- Both companies argued they still needed to collaborate on EVs, software, and intelligent vehicle technologies\n- The global auto industry remains under pressure from Chinese EV competitors and high software / battery investment requirements\n- The merger fell apart in part over balance-of-power disputes, Nissan's resistance to deeper cuts, and Honda's demand for more control\n- Analysts viewed the tie-up as a scale and survival play, not a merger of equals\n- Nissan may still need a partner, investor, or sharper turnaround plan after the collapse\n- Accepting the deal might have improved scale, purchasing leverage, and technology sharing but at the cost of strategic control", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/nissan-honda-merger-collapse/first-pass.md", + "final_pass_artifact": "../fixtures/nissan-honda-merger-collapse/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/openai-nonprofit-control.json b/benchmarks/scenarios/openai-nonprofit-control.json new file mode 100644 index 00000000..374dfff5 --- /dev/null +++ b/benchmarks/scenarios/openai-nonprofit-control.json @@ -0,0 +1,38 @@ +{ + "id": "openai-nonprofit-control", + "title": "OpenAI governance: preserve nonprofit control or simplify for capital access", + "taxonomy": 
{ + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to OpenAI's board in May 2025, after OpenAI announced that its nonprofit would remain in control even as the business restructures into a public benefit corporation. OpenAI had faced strong pressure from civic groups, former employees, and AI safety critics who argued that a move to a more conventional for-profit structure would undermine its mission. At the same time, OpenAI needs extraordinary capital, computing capacity, and operational flexibility to compete with Google, Anthropic, Meta, and other frontier-model rivals. Microsoft remains a critical commercial and infrastructure partner, and the company's scale requirements keep rising.\n\nThe board now faces a strategic governance question: should OpenAI preserve nonprofit control over the new structure, or would a cleaner conventional for-profit setup be strategically superior despite the governance backlash?\n\nWrite a strategic recommendation memo. Take a clear position and defend it with evidence. 
Acknowledge the strongest counterarguments and explain why your position is still correct.\n\nKey evidence available:\n- In May 2025 OpenAI said its nonprofit would remain in control of the company even as the business restructures into a public benefit corporation\n- OpenAI had faced pressure from civic leaders, researchers, and former employees who opposed a full move away from nonprofit control\n- OpenAI was founded as a nonprofit in 2015 and already operates with a complex capped-profit / commercial structure layered beneath the nonprofit\n- The company needs very large capital and compute commitments to compete in frontier AI development\n- Microsoft remains a major strategic partner providing capital, cloud infrastructure, and commercialization leverage\n- A more conventional for-profit structure could simplify governance, fundraising, equity incentives, and strategic speed\n- Keeping nonprofit control may preserve mission legitimacy, safety credibility, and political resilience in a heavily scrutinized industry\n- OpenAI faces pressure to scale quickly while also proving it can govern powerful models responsibly\n- Governance instability had already become a visible strategic risk after prior board turmoil and leadership conflict\n- Public benefit corporation status may offer a middle path between mission preservation and commercial flexibility\n- Critics argue that mission-first control is only meaningful if it can actually constrain commercial pressure in practice\n- Supporters argue that abandoning nonprofit control would permanently damage trust and make OpenAI look like a standard profit-maximizer", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": 
"../fixtures/openai-nonprofit-control/first-pass.md", + "final_pass_artifact": "../fixtures/openai-nonprofit-control/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/paramount-skydance-deal.json b/benchmarks/scenarios/paramount-skydance-deal.json new file mode 100644 index 00000000..7234e744 --- /dev/null +++ b/benchmarks/scenarios/paramount-skydance-deal.json @@ -0,0 +1,38 @@ +{ + "id": "paramount-skydance-deal", + "title": "Paramount-Skydance deal: sell now or keep searching", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Paramount Global's special committee in July 2024 as the company considers the Skydance transaction. Paramount is under pressure from declining linear television economics, a weak advertising market, streaming losses, and a heavy debt load. Shari Redstone's National Amusements controls Paramount, and the proposed Skydance deal would end the Redstone era through a two-step transaction: acquisition of National Amusements followed by a merger of Skydance and Paramount. The committee also has other interested parties and a temporary go-shop window.\n\nThe central question is whether Paramount should accept the Skydance-led deal now, or reject it in favor of waiting for a better bidder / independent turnaround path.\n\nWrite a strategic recommendation memo. Take a clear position and defend it with evidence. 
Acknowledge the strongest counterarguments and explain why your position is still correct.\n\nKey evidence available:\n- In July 2024 Paramount agreed to merge with Skydance through a two-step transaction including the acquisition of National Amusements\n- The Skydance group agreed to acquire National Amusements for about $2.4 billion in cash\n- The deal also contemplated roughly $4.5 billion in cash or stock for shareholders and an additional $1.5 billion to support Paramount's balance sheet\n- Paramount carried nearly $15 billion of debt and faced pressure from weak advertising and continued cable declines\n- Paramount+ had not yet reached profitability\n- The deal included a 45-day go-shop period to solicit alternative bids\n- Other parties including Apollo / Sony and later Edgar Bronfman Jr. had shown interest or were discussed as potential alternatives\n- Skydance argued it could combine creative leadership, technology, and fresh capital with Paramount's library and distribution footprint\n- Critics questioned governance, valuation fairness, and whether the special committee was optimizing for all shareholders or primarily for the controller transaction path\n- Accepting the deal offered capital support and strategic clarity but ended Redstone control and could limit upside if a better path existed\n- Rejecting the deal preserved optionality but left Paramount exposed to continued streaming losses, debt pressure, and industry erosion\n- Paramount's traditional TV assets still generated cash but were structurally declining", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/paramount-skydance-deal/first-pass.md", + "final_pass_artifact": 
"../fixtures/paramount-skydance-deal/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/prd-hidden-scope-creep.json b/benchmarks/scenarios/prd-hidden-scope-creep.json index 70ee1871..14f466ad 100644 --- a/benchmarks/scenarios/prd-hidden-scope-creep.json +++ b/benchmarks/scenarios/prd-hidden-scope-creep.json @@ -1,6 +1,9 @@ { "id": "prd-hidden-scope-creep", "title": "PRD with hidden scope creep", + "taxonomy": { + "scenario_type": "contradiction_or_boundary_prd" + }, "inputs": { "prompt": "Write a PRD for a team inbox workflow handoff improvement.", "context_files": [], diff --git a/benchmarks/scenarios/pricing-partial-data.json b/benchmarks/scenarios/pricing-partial-data.json index 43c2cbb7..d8caffaf 100644 --- a/benchmarks/scenarios/pricing-partial-data.json +++ b/benchmarks/scenarios/pricing-partial-data.json @@ -1,6 +1,9 @@ { "id": "pricing-partial-data", "title": "Pricing change with partial market data", + "taxonomy": { + "scenario_type": "executive_ambiguity" + }, "inputs": { "prompt": "Draft a pricing strategy recommendation under partial market data.", "context_files": [], diff --git a/benchmarks/scenarios/supermicro-export-controls.json b/benchmarks/scenarios/supermicro-export-controls.json new file mode 100644 index 00000000..44a10544 --- /dev/null +++ b/benchmarks/scenarios/supermicro-export-controls.json @@ -0,0 +1,38 @@ +{ + "id": "supermicro-export-controls", + "title": "Supermicro export-controls crisis: keep pushing AI growth or pause for compliance reset", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Supermicro's board in April 2026, immediately after the company's April 7, 2026 announcement that an 
independent board investigation is underway.\n\nSupermicro is one of the most visible infrastructure suppliers in the AI boom, building server systems that depend heavily on Nvidia GPUs and serving data-center customers racing to deploy AI capacity. Growth expectations are high, and slowing down could create openings for rivals.\n\nBut the company is now under fresh scrutiny. On March 19, 2026, Supermicro says it was informed that two employees and a contractor were indicted in connection with an alleged conspiracy to commit export-control violations. On April 7, 2026, Supermicro publicly confirmed that an independent investigation is underway, overseen by Lead Independent Director Scott Angel and Audit Committee Chair Tally Liu with external counsel Munger, Tolles & Olson LLP. The company says it is not named as a defendant and is not accused of wrongdoing, and that the three individuals no longer have any relationship with Supermicro.\n\nThe board now faces a classic strategic tradeoff. One path is to keep pressing the AI infrastructure opportunity at full speed, argue that the alleged misconduct was limited to individuals, and prevent a compliance episode from derailing a once-in-a-generation growth cycle. The other path is to slow selected expansion efforts, tighten high-risk channels, and reset governance before reputational and regulatory damage compounds.\n\nThe reputational risk is not abstract. Fortune reported that the allegations center on the routing of roughly $2.5B in servers packed with Nvidia GPUs to China in violation of export controls, and that investors are concerned the controversy could strain Supermicro's relationship with Nvidia. 
Supermicro has already launched an internal review of its Global Trade Compliance Program, with its acting Chief Compliance Officer reporting to the General Counsel as part of a remediation effort.\n\nThe board must decide: should Supermicro continue aggressive AI-growth execution while the investigation proceeds, or should it pause and narrow high-risk activity until a full compliance reset is complete?\n\nWrite a strategic recommendation memo. Take a clear position — keep pushing growth or pause for reset — and defend it with evidence. Acknowledge the strongest counterarguments and explain why your position is still correct.\n\nKey evidence available:\n- On March 19, 2026, Supermicro says it was informed that two employees and a contractor were indicted in connection with alleged export-control violations\n- On April 7, 2026, Supermicro confirmed an independent investigation led by Lead Independent Director Scott Angel and Audit Committee Chair Tally Liu, supported by Munger, Tolles & Olson LLP\n- Supermicro says it is not named as a defendant and is not accused of wrongdoing\n- Supermicro says the three individuals no longer have any relationship with the company\n- Supermicro is reviewing its Global Trade Compliance Program as part of the response\n- The acting Chief Compliance Officer now reports to the General Counsel during the review process\n- Fortune reported the indictment alleges roughly $2.5B in servers containing Nvidia GPUs were routed to China in violation of export controls\n- Fortune also reported investor concern that compliance and reputational risk could strain Supermicro's relationship with Nvidia\n- Supermicro remains a prominent AI-server supplier during a period of intense demand for Nvidia-based infrastructure\n- Pausing or narrowing high-risk activity could protect the company from deeper regulatory, customer, and governance fallout\n- Continuing full-speed growth could preserve revenue momentum, customer trust in delivery capacity, and 
competitive positioning in the AI infrastructure market\n- A narrow-company-defense narrative may be credible if the misconduct was isolated to individuals rather than systemic company behavior\n- A pause may be costly if hyperscale and enterprise customers shift orders to other suppliers and do not return quickly\n- If the company underreacts and further facts emerge, the board could face a much larger credibility and governance crisis later", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/supermicro-export-controls/first-pass.md", + "final_pass_artifact": "../fixtures/supermicro-export-controls/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/yahoo-microsoft.json b/benchmarks/scenarios/yahoo-microsoft.json new file mode 100644 index 00000000..4acb03fc --- /dev/null +++ b/benchmarks/scenarios/yahoo-microsoft.json @@ -0,0 +1,38 @@ +{ + "id": "yahoo-microsoft", + "title": "Yahoo rejecting Microsoft: hold independent or accept acquisition", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Yahoo's board in May 2008. Microsoft has offered to acquire Yahoo for $33 per share ($47.5 billion), a 62% premium over Yahoo's pre-offer price of ~$20. 
CEO Jerry Yang and the board have rejected the offer, arguing it undervalues Yahoo's strategic assets.\n\nMicrosoft CEO Steve Ballmer has since withdrawn the offer but signaled willingness to return with a revised bid. Yahoo has entered a search advertising partnership with Google as a defensive measure, but the DOJ is reviewing it for antitrust concerns.\n\nThe context: Yahoo's display advertising business is still #1 in the US, but search — the highest-growth, highest-margin segment — is dominated by Google (60%+ share). Yahoo Search has ~20% share and is declining. Yahoo's Panama search platform (launched 2007) narrowed the monetization gap with Google but hasn't reversed share losses.\n\nKey evidence available:\n- Microsoft offer: $33/share ($47.5B), rejected by Yahoo board as too low\n- Yahoo pre-offer stock price: ~$20/share; post-rejection: ~$24/share\n- Yahoo board's stated minimum: $37/share\n- Yahoo 2007 revenue: $6.97B; operating income: $695M (10% margin)\n- Google 2007 revenue: $16.6B; operating income: $5.1B (31% margin)\n- Yahoo display ad market share: #1 in US (15.6% of total digital ad spend)\n- Yahoo search share: ~20% and declining (Google: ~63%, Microsoft: ~10%)\n- Yahoo Mail users: 260M+ (largest email service globally in 2008)\n- Yahoo Finance, Yahoo Sports, Flickr: high-traffic properties with strong engagement\n- Yahoo Japan stake: 34% ownership, valued at ~$8-10B independently\n- Alibaba stake: ~40% ownership (pre-IPO), valued at ~$3-4B in 2008\n- Panama search platform: improved revenue-per-search by 15-20% since launch\n- Google-Yahoo search partnership: would outsource Yahoo search ads to Google for 3+ years\n- DOJ antitrust review of Google-Yahoo deal: likely to block or severely limit scope\n- Microsoft's strategic need: desperately needs search scale to compete with Google\n- Carl Icahn (activist investor): acquired 5% Yahoo stake, pushing board to accept Microsoft offer\n- Yahoo headcount: ~14,300 employees (2008), up from 11,400 
(2006)\n- Social media shift: Facebook growing from 100M to 200M users in 2008, fragmenting display ad attention\n\nThe board must decide: reopen negotiations with Microsoft at a higher price, or pursue an independent strategy built on display advertising dominance, the Google partnership, and the Alibaba/Yahoo Japan stakes?\n\nWrite a strategic recommendation memo. Take a clear position — sell or stay independent — and defend it with evidence. Acknowledge the strongest counterarguments and explain why your position is still correct.", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/yahoo-microsoft/first-pass.md", + "final_pass_artifact": "../fixtures/yahoo-microsoft/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/zillow-offers.json b/benchmarks/scenarios/zillow-offers.json new file mode 100644 index 00000000..0fdb8383 --- /dev/null +++ b/benchmarks/scenarios/zillow-offers.json @@ -0,0 +1,38 @@ +{ + "id": "zillow-offers", + "title": "Zillow Offers: reform iBuying or exit the business", + "taxonomy": { + "scenario_type": "historical_strategy" + }, + "inputs": { + "prompt": "You are a strategy advisor to Zillow's board in October 2021. 
CEO Rich Barton is evaluating the future of Zillow Offers, the company's iBuying division that purchases homes directly from sellers using algorithmic pricing.\n\nZillow Offers launched in 2018 as a bold bet to transform Zillow from an advertising/lead-gen marketplace into a transactional player. The thesis: Zillow's proprietary Zestimate algorithm (used by 200M+ monthly visitors) gives Zillow a pricing edge that competitors like Opendoor and Offerpad lack.\n\nThe reality has been different. Zillow Offers purchased 9,680 homes in Q3 2021 alone, but the pricing algorithm has been systematically overpaying. Zillow is sitting on ~7,000 homes purchased above current market value. The company just announced it would pause home purchases due to 'labor and supply constraints' — but insiders know the real issue is pricing model failure.\n\nKey evidence available:\n- Zillow Offers Q3 2021: purchased 9,680 homes, sold 3,032 homes\n- Inventory backlog: ~7,000 homes on the books, many underwater\n- Estimated write-down exposure: $300-500M\n- Zillow Offers revenue: $1.2B in Q3 2021 (vs $740M in Q3 2020)\n- Zillow Offers EBITDA: negative, losses widening quarter-over-quarter\n- Total Zillow revenue: $1.7B in Q3 2021 (Offers is 70% of revenue but a drag on margins)\n- Zestimate median error rate: ~6.9% nationally (too wide for thin-margin home flipping)\n- Competitor Opendoor: narrower geography, lower volume, but consistently closer to breakeven\n- Zillow core business (IMT segment): $490M revenue, ~30% EBITDA margins, growing 20%+ YoY\n- Zillow stock: dropped from $200 (Feb 2021) to ~$85 (Oct 2021)\n- Housing market: prices rose 19.5% YoY (Case-Shiller, Aug 2021) but showing signs of cooling\n- Zillow headcount: grew from 5,400 (2019) to 8,000+ (2021), mostly Offers-related hires\n- Home flipping industry average margin: 2-5% gross, requires sub-2% pricing accuracy\n- Zillow had already pivoted the algorithm twice in 2021 to reduce overpayment\n- Board concern: Offers losses 
are masking the strength of the core marketplace business\n\nThe board must decide: reform Zillow Offers with tighter pricing controls and reduced volume, or exit iBuying entirely and refocus on the high-margin marketplace business?\n\nWrite a strategic recommendation memo. Take a clear position — reform or exit — and defend it with evidence. Acknowledge the strongest counterarguments and explain why your position is still correct.", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/zillow-offers/first-pass.md", + "final_pass_artifact": "../fixtures/zillow-offers/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": null + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/docs/shipwright-cross-model-conflict-harness-spec.md b/docs/shipwright-cross-model-conflict-harness-spec.md new file mode 100644 index 00000000..3bc02003 --- /dev/null +++ b/docs/shipwright-cross-model-conflict-harness-spec.md @@ -0,0 +1,666 @@ +# Technical Specification: Shipwright Cross-Model Conflict Harness + +## Metadata + +- Related context: `benchmarks/`, `docs/shipwright-v2-benchmark-scoring-spec.md`, `agents/red-team.md` +- Author: Codex with Shipwright frameworks +- Reviewers: Shipwright maintainer, Claude reviewer +- Status: Draft +- Last updated: April 13, 2026 + +## Context + +**What we're building:** A provider-agnostic conflict harness that lets Shipwright run structured adversarial exchanges across model families, starting with Anthropic Claude and OpenAI ChatGPT, 
instead of relying on the same model family on both sides of a conflict. + +**Why (business):** Shipwright's red-team value is weaker when the producer and challenger share the same blind spots, prompt priors, and stylistic habits. A cross-model harness makes challenge results more credible, benchmark outcomes more interesting, and adversarial review less vulnerable to single-family failure modes. + +**Why (technical):** The current repo already has benchmark fixtures, scoring conventions, and adversarial-review patterns, but it does not yet define a runtime contract for multi-provider opposition, limited transcript sharing, or blind adjudication. We need a deterministic protocol before adding more models or more elaborate agent topologies. + +## Scope + +**In scope:** + +- Define a reusable harness for structured conflict runs between providers +- Support 3 execution modes: + - `head_to_head` + - `coalition_vs_coalition` + - `swap_test` +- Standardize what is shared across sides: + - same case packet + - same rubric + - committed outputs only + - critiques only +- Explicitly disallow shared scratchpads, hidden reasoning exchange, and free-form cross-side chat +- Define provider adapter contracts for Anthropic and OpenAI +- Define run, packet, verdict, and transcript shapes suitable for benchmark storage +- Define fairness controls, cost caps, and judge-blinding rules +- Specify how the harness plugs into `benchmarks/` without rewriting the existing scoring spec + +**Out of scope:** + +- Fine-tuning or training data generation +- Real-time voice, multimodal, or browser-driven conflict sessions +- Unlimited debate loops or open-ended agent swarms +- Replacing human review for high-stakes benchmark publication +- Repo-wide implementation of every file named in this spec + +**Assumptions:** + +- Provider access is available through stable CLI or subprocess interfaces backed by active subscription plans +- Phase 1 must use Claude Max and ChatGPT Pro style plan access 
rather than pay-per-token API access +- Early runs are text-first and tool-light +- Judge prompts can be normalized enough to compare outputs fairly +- Provider names can be hidden from judges during scoring +- Benchmark scenarios are a better starting point than live user traffic + +## Architecture + +### High-Level Design + +The harness separates 5 responsibilities: packet construction, side execution, exchange gating, adjudication, and run storage. + +```text +[Scenario or case file] + -> [Case Packet Builder] + -> [Conflict Runner] + -> [Side A Adapter] -> first pass + -> [Side B Adapter] -> first pass + -> [Exchange Gate] -> rebuttals + -> [Revision Gate] -> final submissions + -> [Judge Adapter] -> verdict + -> [Run Store] + -> [Benchmark Summary / Review Queue] +``` + +### Components Affected + +| Component | Change Type | Description | +|---|---|---| +| `docs/shipwright-cross-model-conflict-harness-spec.md` | New | This specification | +| `scripts/run-conflict-harness.mjs` | New | CLI entry point for running a conflict session | +| `scripts/score-conflict-run.mjs` | New | Applies rubric scoring and emits a verdict payload | +| `scripts/build-case-packet.mjs` | New | Normalizes scenario input, rubric, role definitions, and round budgets | +| `schemas/conflict-case.schema.json` | New | Validates the shared case packet | +| `schemas/conflict-run.schema.json` | New | Validates per-run metadata and transcript references | +| `schemas/conflict-verdict.schema.json` | New | Validates judge output | +| `benchmarks/scenarios/` | Modify | Allow conflict-mode configuration alongside existing scenario definitions | +| `benchmarks/results/conflict-harness/` | New | Versioned outputs for run transcripts and verdicts | +| `benchmarks/reviews/` | Modify | Optional blind human review packets for tie-breaks or calibration | + +### Architecture Decision Records (ADRs) + +#### ADR 1: Require independent first-pass submissions before any exchange + +- **Context:** If both sides 
see each other's drafts too early, they quickly converge in style and argument shape, which removes the main reason to run cross-model opposition. +- **Options considered:** + - Option A: Let sides see each other's prompts and partial drafts in real time + - Pros: More conversational, may surface faster consensus + - Cons: High convergence risk, weak provenance, hard-to-score independence + - Option B: Force each side to commit a first-pass artifact before any exchange + - Pros: Preserves model diversity signal, cleaner transcript, easier judging + - Cons: Adds one more protocol step +- **Decision:** Choose Option B. +- **Rationale:** The harness exists to measure disagreement quality and error-finding power, not to optimize for casual collaboration. +- **Consequences:** The runner must support a sealed first-pass phase and store those artifacts separately. + +#### ADR 2: Build a provider-agnostic harness, not a hardcoded Claude-vs-ChatGPT feature + +- **Context:** The immediate use case is Claude vs ChatGPT, but the durable value is a framework that can compare any pair or coalition of providers. +- **Options considered:** + - Option A: Hardcode `anthropic` and `openai` as named sides + - Pros: Fastest short-term implementation + - Cons: Brittle, leaks provider semantics into scoring and storage + - Option B: Use generic side bindings backed by provider adapters + - Pros: Extensible, easier to benchmark new model families later + - Cons: Slightly more abstraction upfront +- **Decision:** Choose Option B. +- **Rationale:** The repo should learn a reusable harness pattern, not a one-off rivalry script. +- **Consequences:** Provider-specific logic lives behind adapters, while run protocols stay neutral. In Phase 1 those adapters are CLI/subprocess wrappers over subscription plans, not SDK clients. + +#### ADR 3: Share committed outputs and critiques, but never hidden reasoning + +- **Context:** The user wants the models to share inputs and outputs. 
Sharing the same case packet is useful; sharing chain-of-thought or hidden scratchpads is not. +- **Options considered:** + - Option A: Share all intermediate reasoning across sides + - Pros: Maximum context transfer + - Cons: Collapses independence, increases leakage risk, muddies attribution + - Option B: Share only case packets, committed artifacts, and explicit critiques + - Pros: Keeps the exchange legible and adversarial + - Cons: Less collaborative synthesis +- **Decision:** Choose Option B. +- **Rationale:** The harness needs auditable disagreement, not merged cognition. +- **Consequences:** Prompts must forbid internal reasoning disclosure, and transcript storage must capture only user-visible artifacts. + +#### ADR 4: Blind the judge to side labels, acknowledge family bias, and support optional role swap + +- **Context:** A single judge can introduce provider affinity bias. Side-label blinding helps, but it does not fully remove model-family bias when the judge shares a family with one competitor. A role swap catches prompt-side bias and some judge overfitting. +- **Options considered:** + - Option A: Judge once with visible provider labels + - Pros: Simple + - Cons: Easy bias leakage + - Option B: Judge with `Side A` / `Side B` labels, document family-bias limits honestly, and optionally rerun with role assignments swapped + - Pros: Better fairness signal, compatible with current provider availability + - Cons: Higher cost and latency +- **Decision:** Choose Option B. +- **Rationale:** Side-label blindness and swap stability are cheap compared to publishing a bad conclusion with false confidence, but the spec must stay honest that v1 is not fully family-blind. +- **Consequences:** The run schema must track provider identity separately from the judge-facing transcript labels, and Phase 2 must include alternate-judge calibration before published benchmark claims rely on single-judge confidence. 
+ +## Conflict Modes + +| Mode | Description | Best For | Default Round Budget | +|---|---|---|---| +| `head_to_head` | One provider-backed side vs one provider-backed side | Early harness validation, benchmark scenarios, low-cost runs | `first_pass -> rebuttal -> final -> verdict` | +| `coalition_vs_coalition` | Each side has a `lead` and `wing` model; the side emits one shared submission | Harder scenarios where intra-side critique is valuable | `internal draft -> side merge -> rebuttal -> final -> verdict` | +| `swap_test` | Reruns the same case with side labels or role framing swapped | Bias detection and robustness checks | Same as parent mode plus one rerun | + +### Mode Rules + +#### `head_to_head` + +- Both sides receive the same case packet at the same time +- Neither side sees the other side until both first-pass artifacts are committed +- Each side gets exactly one rebuttal and one final revision unless the run config explicitly raises the cap + +#### `coalition_vs_coalition` + +- Each side contains: + - one `lead` model + - one `wing` model +- `lead` and `wing` receive the same side-specific role packet +- `lead` and `wing` produce independent internal drafts +- The side merger step produces exactly one committed side artifact +- The opposing side never sees the internal drafts; it sees only the merged side artifact + +#### `swap_test` + +- Reuse the same scenario, rubric, and round budget +- Swap one of: + - side labels + - proposer/challenger framing + - lead/wing role order in coalition mode +- Treat a materially different verdict as a calibration warning, not as an automatic failure +- `materially_different_swap_result` is `true` when the winner changes or `abs(primary.margin - swap.margin) >= 0.20` + +## Conflict Protocol + +### Shared Packet Types + +#### Case Packet + +The same packet is sent to every side in the run. 
+ +Required fields: + +- `scenario_id` +- `prompt` +- `artifact_type` +- `rubric` +- `constraints` +- `evidence` +- `max_rounds` +- `tool_policy` +- `sharing_policy` +- `success_condition` + +#### Committed Artifact Packet + +One visible artifact per side per visible round: + +- `run_id` +- `side_id` +- `round` +- `artifact_markdown` +- `claims` - array of claim objects: `{ claim_id, summary, evidence_refs, is_major }` +- `citations` +- `open_questions` +- `critique_responses` - required in `final` round: array of `{ finding_id, disposition, rationale }` + +#### Critique Packet + +Explicit attack surface only: + +- `target_side` +- `finding_id` - runner-assigned sequential ID within the run +- `target_claim_ids` +- `claim_under_attack` +- `attack_type` +- `evidence_or_reason` +- `severity` + +#### Verdict Packet + +- `winner` +- `margin` +- `rubric_scores` +- `decisive_findings` +- `judge_confidence` - enum: `high` | `medium` | `low` +- `needs_human_review` + +### Sharing Policy + +The default policy is deliberately narrow: + +- **Shared with all sides:** case packet, rubric, public evidence, final visible outputs +- **Shared only after first-pass commit:** committed artifacts +- **Shared only after critique stage opens:** critique packets +- **Never shared across sides:** hidden reasoning, draft fragments, token logs, provider names, internal coalition drafts + +### Identity Leakage Handling + +Identity leakage is treated as a best-effort containment problem, not a solved guarantee. 
+ +- The runner performs a post-processing scan on every visible artifact before exchange-gate reveal and before judge-packet construction +- Minimum v1 detection is explicit string/pattern matching for self-identification phrases such as: + - `I am Claude` + - `As an OpenAI model` + - `As ChatGPT` + - `Anthropic` + - `OpenAI` +- If explicit identity leakage is detected before reveal: + - issue one repair retry with a prompt to remove provider self-identification + - if the retry still leaks, log `identity_leak_warning`, preserve the original transcript, and redact explicit provider strings from the judge-facing packet only +- Style leakage, refusal-style leakage, or family-inference from prose remains an acknowledged v1 limitation and is not grounds for automatic run failure +- Referencing unseen opponent content is a separate protocol violation: + - one repair retry is allowed + - a second failure ends the run with `status = "protocol_violation"` + +### Round Sequence + +#### Step 0: Run initialization + +- Materialize `run_id` +- Freeze model versions, side bindings, rubric version, and round limits +- Build the common case packet +- Validate against `schemas/conflict-case.schema.json` + +#### Step 1: Independent first pass + +- Send case packet to each side +- Collect one committed artifact from each side +- Apply the identity leakage policy before reveal +- Reject or retry responses that reference unseen opponent content according to the protocol-violation rule + +#### Step 2: Exchange gate + +- Reveal only the committed first-pass artifacts +- Label them `Side A` and `Side B` +- Do not expose provider or coalition composition + +#### Step 3: Rebuttal round + +- Each side emits one critique packet against the other +- Critiques must reference visible claims, not guessed internal reasoning + +#### Step 4: Final revision + +- Each side may revise once in response to visible critiques +- Final revision must either: + - incorporate the critique, or + - explicitly 
reject it with evidence or reasoning + +#### Step 5: Adjudication + +- Judge sees: + - case packet + - first-pass artifacts + - critique packets + - final artifacts +- Judge packet must include the configured `budgets.min_margin_for_verdict` +- Judge returns verdict packet only +- Judge prompt forbids provider inference and hidden-reasoning requests + +#### Step 6: Optional swap rerun + +- Re-execute the run with swapped framing +- Compare winner, margin, decisive findings, and score spread + +### Hard Limits + +- Default visible rounds: `3` +- Default critique count per side: `1` +- Default final revision count per side: `1` +- Default time budget per model turn: `120s` +- Default response budget per visible turn: `4,000` output tokens +- Runs exceeding configured cost caps terminate according to the Budget Enforcement Rules section: `budget_exhausted` after a cleanly completed visible phase, or `budget_exhausted_no_verdict` when reserved adjudication budget is insufficient + +### Budget Enforcement Rules + +- Budget fairness is enforced at phase boundaries, not after each individual side turn +- If Side A completes a turn that pushes estimated spend over budget, Side B still receives the matching turn for that phase +- After all sides complete the current phase, the runner evaluates: + - whether another visible phase may begin + - whether reserved judge budget remains for adjudication +- If the run exceeds budget after a completed visible phase, terminate with `status = "budget_exhausted"` and do not start the next phase +- If visible phases complete but the remaining reserved budget is insufficient for adjudication, terminate with `status = "budget_exhausted_no_verdict"` +- No verdict may be emitted from an asymmetric partially completed phase + +## Adapter Contract + +### Provider Adapter Interface + +```typescript +export interface ConflictModelAdapter { + provider: 'anthropic' | 'openai' | string; + model: string; + invoke(request: ConflictTurnRequest): 
Promise<ConflictTurnResponse>; +} + +export interface ConflictTurnRequest { + runId: string; + sideId: string; + phase: 'first_pass' | 'rebuttal' | 'final' | 'judge'; + packet: object; + prompt: string; + maxOutputTokens: number; + timeoutMs: number; + temperature: number; + toolPolicy: 'none' | 'symmetric'; +} + +export interface ConflictTurnResponse { + content: string; + usage: { + inputTokens: number; + outputTokens: number; + estimatedCostUsd?: number; + }; + latencyMs: number; + stopReason: string; + rawProviderResponsePath: string; +} +``` + +### Adapter Requirements + +- Phase 1 adapters must use CLI or subprocess invocation against subscription-plan access, not API SDK calls +- Normalize timeout handling and error surfaces +- Emit token and latency accounting in a common shape +- Treat `estimatedCostUsd` as optional; subscription-mode runs may omit it or set it to `0` +- Record provider/model metadata in the run store, but not in the judge packet +- Support `toolPolicy = none` first; defer richer tool symmetry until after text-only validation + +### Subscription-Mode Execution + +Phase 1 uses subscription-backed local tooling rather than pay-per-token APIs. + +- **Claude side:** invoke the local `claude` CLI in non-interactive single-turn mode +- **GPT side:** invoke the local ChatGPT/Codex-compatible CLI or subprocess entry point tied to the user's ChatGPT Pro access +- **Timeouts:** enforced at the subprocess level, not HTTP request level +- **Rate limiting:** treated as local CLI or subscription concurrency behavior, not API header parsing +- **Usage accounting:** best-effort only; token counts and estimated cost may be missing from CLI outputs + +The abstract adapter interface remains unchanged so API-backed adapters can exist later, but Phase 1 implementation must not require API keys or usage-based billing. 
+ +## Data Model + +### Run Record Shape + +```json +{ + "run_id": "conflict-2026-04-13T181500Z-prd-hidden-scope-creep", + "scenario_id": "prd-hidden-scope-creep", + "mode": "head_to_head", + "status": "completed", + "sides": { + "side_a": { + "provider": "openai", + "model": "<openai-model-id>", + "access_mode": "subscription_cli", + "role": "challenger" + }, + "side_b": { + "provider": "anthropic", + "model": "<anthropic-model-id>", + "access_mode": "subscription_cli", + "role": "proposer" + } + }, + "judge": { + "provider": "<judge-provider>", + "model": "<judge-model-id>", + "access_mode": "subscription_cli", + "blind_labels": true, + "family_blind": false, + "selection_policy": "rotating_batch_judge" + }, + "budgets": { + "max_visible_rounds": 3, + "max_cost_usd": 12, + "max_latency_ms": 360000, + "min_margin_for_verdict": 0.10 + }, + "results": { + "winner": "side_b", + "margin": 0.14, + "judge_confidence": "medium", + "swap_stable": true, + "needs_human_review": false + }, + "metrics": { + "disagreement_rate": 0.43, + "unsupported_claim_count": 1, + "self_contradiction_count": 0, + "total_estimated_cost_usd": 0 + } +} +``` + +### Transcript Layout + +```text +benchmarks/results/conflict-harness/<scenario_id>/<run_id>/ + config.json + case-packet.json + state.json + run.json + side-a/ + first-pass.md + rebuttal.md + final.md + side-b/ + first-pass.md + rebuttal.md + final.md + judge/ + verdict.json + verdict.md + swap-test/ + run.json + judge-verdict.json + review/ + blind-review-packet.md +``` + +### Derived Metrics + +The harness should compute these after every completed run: + +- `disagreement_rate`: for each side, the fraction of `claims[].claim_id` with `is_major = true` in that side's first-pass artifact that are referenced by at least one opposing `target_claim_ids`; the run-level value is the mean of the two side-level rates +- `adopted_critique_rate`: fraction of critique `finding_id`s whose final `critique_responses[].disposition = "adopted"` +- `unsupported_claim_count`: visible claims without evidence support where support was 
required +- `swap_stability`: boolean plus margin delta across swapped reruns; `false` when the winner changes or `abs(primary.margin - swap.margin) >= 0.20` +- `judge_margin`: score difference between winner and loser +- `cost_per_resolved_run`: total estimated cost divided by runs that end with a non-tie verdict; subscription CLI runs may record `0` + +## Adjudication Contract + +### Judging Rules + +- Judge sees only `Side A` and `Side B` +- Judge scores against the same rubric for both sides +- Judge must return a structured verdict even when the result is a tie +- Side-label blinding is required, but family-blindness is not guaranteed in v1 when the judge shares a model family with a competitor +- Published batch conclusions must either: + - rotate judge family across runs, or + - include alternate-judge calibration runs in Phase 2 before treating single-judge verdicts as strong evidence +- The judge prompt for a swap rerun must be structurally identical to the primary run prompt and must not mention swap framing, prior results, or rerun status +- Judge may set `needs_human_review = true` when: + - score margin is below `budgets.min_margin_for_verdict` + - both sides have material unsupported claims + - swap rerun is materially different + +### Rubric Dimensions + +Default rubric dimensions: + +- `claim quality` +- `evidence discipline` +- `responsiveness to critique` +- `internal consistency` +- `decision usefulness` + +Each dimension is scored `1` to `5`. 
The verdict packet stores: + +- raw scores by dimension +- weighted total +- decisive findings +- one-paragraph rationale + +`judge_confidence` must use this enum and derivation rule: + +- `high`: the winning side is clearly stronger on at least 3 rubric dimensions and has no major unsupported-claim problem +- `medium`: the winning side leads overall but has at least 1 weak dimension or absorbed only part of the opposing critique +- `low`: score margin is below `budgets.min_margin_for_verdict` or both sides have significant unsupported claims + +The judge prompt must include this confidence rubric so the field is derived rather than improvised. + +`judge_confidence` is a decisiveness signal for the current verdict, not a substitute for inter-rater reliability. Until Phase 2 calibration is complete, published benchmark summaries must treat it as advisory only. + +### Blind Review Fallback + +If `needs_human_review = true`, emit a stripped packet for `benchmarks/reviews/` with: + +- scenario text +- rubric +- visible artifacts +- verdict packet without provider metadata + +## Fairness and Safety Controls + +### Fairness Controls + +- Same case packet for every visible side +- Same visible output budget per side unless the mode explicitly says otherwise +- Same temperature by default +- Provider names hidden from judges +- Role-swap rerun available on the same scenario +- No side may use tools unless tool access is symmetric and logged +- Budget checks happen only at phase boundaries so both sides complete the same visible phase before termination +- Subscription-mode execution must not silently favor one side through richer local integration; if one side requires a local CLI wrapper, the other side must also run through a subprocess adapter rather than an API client + +### Safety Controls + +- Do not request or store hidden chain-of-thought +- Redact secrets from raw provider responses before long-term storage +- Fail closed on malformed packet schemas +- Reject 
final outputs that reference non-visible internal drafts + +### Known Failure Modes and Mitigations + +| Failure Mode | Risk | Control | +|---|---|---| +| Convergence after exchange | Both sides drift toward the same answer | Seal first pass; share only committed artifacts | +| Judge family affinity | A judge prefers outputs from its own model family even with hidden side labels | Treat v1 as label-blind only; rotate judge family or run alternate-judge calibration in Phase 2 | +| Judge monoculture | Winner reflects one judge's preferences more than argument quality | Blind labels; optional alternate judge; swap test | +| Tool asymmetry | One provider wins because it had better tool access | Default `toolPolicy = none`; require symmetry before enabling tools | +| Provider verbosity bias | Longer answers win despite weaker reasoning | Normalize token caps; score by rubric, not length | +| Identity leakage | Explicit self-identification contaminates blind judging | Best-effort string scan, one repair retry, judge-packet redaction, logged warning | +| Coalition leakage | Internal drafts leak to the opposing side | Expose only merged side artifacts | +| Prompt overfitting | Harness rewards style matched to one judge prompt | Rotate judges in calibration runs; compare against blind human review | + +## Non-Functional Requirements + +### Performance + +| Metric | Requirement | Measurement Method | +|---|---|---| +| Head-to-head completion time (p95) | < 3 minutes | Runner telemetry | +| Coalition run completion time (p95) | < 6 minutes | Runner telemetry | +| Run store write success | 100% or fail closed | File write + schema validation | +| Judge packet generation failure rate | < 1% | Run status logs | + +### Scalability + +- Must support batch execution across existing benchmark scenarios +- Initial target: 20 sequential runs without manual cleanup +- Parallelism is optional in v1; correctness beats throughput + +### Reliability + +- Every run is resumable from the 
last completed phase +- Partial runs must still persist packet artifacts and failure reason +- A provider timeout on one side does not erase the other side's transcript +- `state.json` must record at least `{ run_id, last_completed_phase, next_action, status }` + +### Observability + +- Log per-turn latency, token counts, and estimated cost +- Log schema validation failures with packet path and phase +- Emit a per-run summary markdown file for human review +- When CLI tooling does not expose token or cost data, persist `null` or `0` rather than inventing estimates + +## Rollout Plan + +### Phase 1: Text-only head-to-head pilot + +- Implement `head_to_head` +- Run on 3 existing benchmark scenarios +- Use one blind judge and manual transcript review +- Keep tools disabled + +### Phase 2: Swap stability and scoring hardening + +- Add `swap_test` +- Add `needs_human_review` thresholds +- Compare judge verdicts against blinded human review on a small sample +- Run alternate-judge calibration on at least one scenario per batch and record judge agreement rate before publishing benchmark conclusions + +### Phase 3: Coalition mode + +- Add `lead` / `wing` side structure +- Enforce merged side artifacts +- Measure whether coalition mode improves critique adoption without collapsing disagreement + +### Feature Flags + +| Flag | Description | Default | +|---|---|---| +| `CONFLICT_HARNESS_ENABLED` | Enables the runner entry point | Off | +| `CONFLICT_SWAP_TEST_ENABLED` | Enables optional reruns with swapped framing | Off | +| `CONFLICT_COALITION_MODE_ENABLED` | Enables coalition-vs-coalition runs | Off | + +### Rollback + +- Disable all conflict harness flags +- Preserve stored transcripts for audit +- Fall back to current single-family adversarial workflows until calibration issues are resolved + +## Decision Frame +- **Recommendation:** Build the harness as a provider-agnostic, sealed-first-pass conflict runtime with Claude and ChatGPT as the first adapters, not as a 
free-form shared-chat experiment. +- **Trade-off:** We give up some spontaneity and conversational richness in exchange for cleaner attribution, stronger disagreement signals, and fairer scoring. +- **Confidence:** Medium - the protocol is technically straightforward and fits the repo's benchmark shape, but coalition mode and judge calibration still need empirical proof. +- **Owner:** Shipwright maintainer +- **Decision Date:** 2026-04-13 +- **Revisit Trigger:** Revisit if pilot runs show low disagreement, unstable swap results, or judge-human disagreement above the accepted calibration threshold. + +## Unknowns & Evidence Gaps + +- Whether `coalition_vs_coalition` outperforms simpler `head_to_head` on real benchmark scenarios +- Whether one judge family produces meaningfully different verdicts than another on the same transcripts +- Coalition merge mechanism - model call, deterministic merge, or human editorial step - is not yet defined and must be resolved before Phase 3 design +- What score-margin threshold best predicts when a human tie-break is necessary +- Whether provider-specific tokenization or response-shaping quirks distort fairness even under equal token caps + +## Pass/Fail Readiness + +PASS if the first implementation can run a blinded `head_to_head` session on at least 3 benchmark scenarios, persist structured transcripts, and produce a verdict packet with stable schema validation; at Light depth, PASS requires only the protocol, schemas, and transcript layout to be defined in one reviewable spec. FAIL if the design still relies on shared hidden reasoning, exposes provider names to judges, or leaves visible-round limits unspecified. 
+ +## Recommended Next Artifact + +Create a Phase 1 implementation plan that turns this spec into: + +- `schemas/conflict-case.schema.json` +- `schemas/conflict-run.schema.json` +- `schemas/conflict-verdict.schema.json` +- `scripts/build-case-packet.mjs` +- `scripts/run-conflict-harness.mjs` diff --git a/schemas/conflict-case.schema.json b/schemas/conflict-case.schema.json new file mode 100644 index 00000000..b7595b38 --- /dev/null +++ b/schemas/conflict-case.schema.json @@ -0,0 +1,118 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Shipwright Conflict Case Packet", + "type": "object", + "additionalProperties": false, + "required": [ + "scenario_id", + "prompt", + "artifact_type", + "rubric", + "constraints", + "evidence", + "max_rounds", + "tool_policy", + "sharing_policy", + "success_condition" + ], + "properties": { + "scenario_id": { "type": "string", "minLength": 1 }, + "title": { "type": "string", "minLength": 1 }, + "prompt": { "type": "string", "minLength": 1 }, + "artifact_type": { "type": "string", "minLength": 1 }, + "rubric": { + "type": "object", + "additionalProperties": false, + "required": ["dimensions", "scoring_scale", "expected_sections", "scoring_spec_ref"], + "properties": { + "dimensions": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + }, + "scoring_scale": { "type": "string", "enum": ["1-5"] }, + "expected_sections": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "scoring_spec_ref": { "type": ["string", "null"] } + } + }, + "constraints": { + "type": "object", + "additionalProperties": false, + "required": ["expected_sections", "expect_structured", "context_files", "scoring_spec_ref"], + "properties": { + "expected_sections": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "expect_structured": { "type": "boolean" }, + "context_files": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + 
"scoring_spec_ref": { "type": ["string", "null"] } + } + }, + "evidence": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["evidence_id", "kind", "source_ref"], + "properties": { + "evidence_id": { "type": "string", "minLength": 1 }, + "kind": { "type": "string", "minLength": 1 }, + "source_ref": { "type": "string", "minLength": 1 }, + "content": { "type": ["string", "null"] }, + "confidence": { "type": ["string", "null"], "enum": ["low", "medium", "high", null] } + } + } + }, + "max_rounds": { "type": "integer" }, + "tool_policy": { "type": "string", "enum": ["none", "symmetric"] }, + "sharing_policy": { + "type": "object", + "additionalProperties": false, + "required": [ + "share_case_packet", + "share_committed_artifacts_after_first_pass", + "share_critiques_after_open", + "share_hidden_reasoning", + "share_provider_identity", + "share_internal_coalition_drafts" + ], + "properties": { + "share_case_packet": { "type": "boolean" }, + "share_committed_artifacts_after_first_pass": { "type": "boolean" }, + "share_critiques_after_open": { "type": "boolean" }, + "share_hidden_reasoning": { "type": "boolean" }, + "share_provider_identity": { "type": "boolean" }, + "share_internal_coalition_drafts": { "type": "boolean" } + } + }, + "success_condition": { + "type": "object", + "additionalProperties": false, + "required": ["type", "description", "validator"], + "properties": { + "type": { "type": "string", "enum": ["validator_contract"] }, + "description": { "type": "string", "minLength": 1 }, + "validator": { + "type": "object", + "additionalProperties": false, + "required": ["artifact_type", "expect_sections", "expect_structured"], + "properties": { + "artifact_type": { "type": "string", "minLength": 1 }, + "expect_sections": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "expect_structured": { "type": "boolean" } + } + } + } + } + } +} diff --git a/schemas/conflict-run.schema.json 
b/schemas/conflict-run.schema.json new file mode 100644 index 00000000..e0a23ce7 --- /dev/null +++ b/schemas/conflict-run.schema.json @@ -0,0 +1,635 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Shipwright Conflict Run Record", + "type": "object", + "additionalProperties": false, + "required": [ + "run_id", + "scenario_id", + "mode", + "status", + "sides", + "judge", + "budgets", + "results", + "metrics" + ], + "properties": { + "run_id": { "type": "string", "minLength": 1 }, + "scenario_id": { "type": "string", "minLength": 1 }, + "mode": { + "type": "string", + "enum": ["head_to_head", "coalition_vs_coalition", "swap_test"] + }, + "status": { + "type": "string", + "enum": [ + "initialized", + "first_pass_complete", + "rebuttal_complete", + "final_complete", + "completed", + "budget_exhausted", + "budget_exhausted_no_verdict", + "protocol_violation", + "error" + ] + }, + "sides": { + "type": "object", + "additionalProperties": false, + "required": ["side_a", "side_b"], + "properties": { + "side_a": { + "type": "object", + "additionalProperties": false, + "required": ["provider", "model", "reasoning_effort", "access_mode", "role", "first_pass", "rebuttal", "final"], + "properties": { + "provider": { "type": "string", "minLength": 1 }, + "model": { "type": "string", "minLength": 1 }, + "reasoning_effort": { "type": "string", "minLength": 1 }, + "access_mode": { "type": "string", "enum": ["subscription_cli"] }, + "role": { "type": "string", "minLength": 1 }, + "first_pass": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "run_id", + "side_id", + "round", + "artifact_markdown", + "claims", + "citations", + "conclusion_confidence", + "open_questions", + "critique_responses" + ], + "properties": { + "run_id": { "type": "string", "minLength": 1 }, + "side_id": { "type": "string", "enum": ["side_a", "side_b"] }, + "round": { "type": "string", "enum": ["first_pass", "final"] }, + "artifact_markdown": { 
"type": "string", "minLength": 1 }, + "claims": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["claim_id", "summary", "evidence_refs", "is_major"], + "properties": { + "claim_id": { "type": "string", "minLength": 1 }, + "summary": { "type": "string", "minLength": 1 }, + "evidence_refs": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "is_major": { "type": "boolean" } + } + } + }, + "citations": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "conclusion_confidence": { + "type": "string", + "enum": ["low", "medium", "high"] + }, + "open_questions": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "critique_responses": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["finding_id", "disposition", "rationale"], + "properties": { + "finding_id": { "type": "string", "minLength": 1 }, + "disposition": { + "type": "string", + "enum": ["adopted", "rejected", "deferred"] + }, + "rationale": { "type": "string", "minLength": 1 } + } + } + } + } + }, + "rebuttal": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "target_side", + "finding_id", + "target_claim_ids", + "claim_under_attack", + "attack_type", + "evidence_or_reason", + "severity" + ], + "properties": { + "target_side": { "type": "string", "enum": ["side_a", "side_b"] }, + "finding_id": { "type": "string", "minLength": 1 }, + "target_claim_ids": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + }, + "claim_under_attack": { "type": "string", "minLength": 1 }, + "attack_type": { "type": "string", "minLength": 1 }, + "evidence_or_reason": { "type": "string", "minLength": 1 }, + "severity": { "type": "string", "enum": ["low", "medium", "high"] } + } + }, + "final": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + 
"run_id", + "side_id", + "round", + "artifact_markdown", + "claims", + "citations", + "conclusion_confidence", + "open_questions", + "critique_responses" + ], + "properties": { + "run_id": { "type": "string", "minLength": 1 }, + "side_id": { "type": "string", "enum": ["side_a", "side_b"] }, + "round": { "type": "string", "enum": ["first_pass", "final"] }, + "artifact_markdown": { "type": "string", "minLength": 1 }, + "claims": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["claim_id", "summary", "evidence_refs", "is_major"], + "properties": { + "claim_id": { "type": "string", "minLength": 1 }, + "summary": { "type": "string", "minLength": 1 }, + "evidence_refs": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "is_major": { "type": "boolean" } + } + } + }, + "citations": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "conclusion_confidence": { + "type": "string", + "enum": ["low", "medium", "high"] + }, + "open_questions": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "critique_responses": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["finding_id", "disposition", "rationale"], + "properties": { + "finding_id": { "type": "string", "minLength": 1 }, + "disposition": { + "type": "string", + "enum": ["adopted", "rejected", "deferred"] + }, + "rationale": { "type": "string", "minLength": 1 } + } + } + } + } + } + } + }, + "side_b": { + "type": "object", + "additionalProperties": false, + "required": ["provider", "model", "reasoning_effort", "access_mode", "role", "first_pass", "rebuttal", "final"], + "properties": { + "provider": { "type": "string", "minLength": 1 }, + "model": { "type": "string", "minLength": 1 }, + "reasoning_effort": { "type": "string", "minLength": 1 }, + "access_mode": { "type": "string", "enum": ["subscription_cli"] }, + "role": { 
"type": "string", "minLength": 1 }, + "first_pass": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "run_id", + "side_id", + "round", + "artifact_markdown", + "claims", + "citations", + "conclusion_confidence", + "open_questions", + "critique_responses" + ], + "properties": { + "run_id": { "type": "string", "minLength": 1 }, + "side_id": { "type": "string", "enum": ["side_a", "side_b"] }, + "round": { "type": "string", "enum": ["first_pass", "final"] }, + "artifact_markdown": { "type": "string", "minLength": 1 }, + "claims": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["claim_id", "summary", "evidence_refs", "is_major"], + "properties": { + "claim_id": { "type": "string", "minLength": 1 }, + "summary": { "type": "string", "minLength": 1 }, + "evidence_refs": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "is_major": { "type": "boolean" } + } + } + }, + "citations": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "conclusion_confidence": { + "type": "string", + "enum": ["low", "medium", "high"] + }, + "open_questions": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "critique_responses": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["finding_id", "disposition", "rationale"], + "properties": { + "finding_id": { "type": "string", "minLength": 1 }, + "disposition": { + "type": "string", + "enum": ["adopted", "rejected", "deferred"] + }, + "rationale": { "type": "string", "minLength": 1 } + } + } + } + } + }, + "rebuttal": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "target_side", + "finding_id", + "target_claim_ids", + "claim_under_attack", + "attack_type", + "evidence_or_reason", + "severity" + ], + "properties": { + "target_side": { "type": "string", "enum": ["side_a", "side_b"] }, + 
"finding_id": { "type": "string", "minLength": 1 }, + "target_claim_ids": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + }, + "claim_under_attack": { "type": "string", "minLength": 1 }, + "attack_type": { "type": "string", "minLength": 1 }, + "evidence_or_reason": { "type": "string", "minLength": 1 }, + "severity": { "type": "string", "enum": ["low", "medium", "high"] } + } + }, + "final": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "run_id", + "side_id", + "round", + "artifact_markdown", + "claims", + "citations", + "conclusion_confidence", + "open_questions", + "critique_responses" + ], + "properties": { + "run_id": { "type": "string", "minLength": 1 }, + "side_id": { "type": "string", "enum": ["side_a", "side_b"] }, + "round": { "type": "string", "enum": ["first_pass", "final"] }, + "artifact_markdown": { "type": "string", "minLength": 1 }, + "claims": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["claim_id", "summary", "evidence_refs", "is_major"], + "properties": { + "claim_id": { "type": "string", "minLength": 1 }, + "summary": { "type": "string", "minLength": 1 }, + "evidence_refs": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "is_major": { "type": "boolean" } + } + } + }, + "citations": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "conclusion_confidence": { + "type": "string", + "enum": ["low", "medium", "high"] + }, + "open_questions": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "critique_responses": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["finding_id", "disposition", "rationale"], + "properties": { + "finding_id": { "type": "string", "minLength": 1 }, + "disposition": { + "type": "string", + "enum": ["adopted", "rejected", "deferred"] + }, + 
"rationale": { "type": "string", "minLength": 1 } + } + } + } + } + } + } + } + } + }, + "judge": { + "type": "object", + "additionalProperties": false, + "required": [ + "provider", + "model", + "reasoning_effort", + "access_mode", + "blind_labels", + "family_blind", + "selection_policy", + "verdict" + ], + "properties": { + "provider": { "type": "string", "minLength": 1 }, + "model": { "type": "string", "minLength": 1 }, + "reasoning_effort": { "type": "string", "minLength": 1 }, + "access_mode": { "type": "string", "enum": ["subscription_cli"] }, + "blind_labels": { "type": "boolean" }, + "family_blind": { "type": "boolean" }, + "selection_policy": { "type": "string", "minLength": 1 }, + "verdict": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "winner", + "margin", + "rubric_scores", + "dimension_rationales", + "side_summaries", + "decisive_dimension", + "decisive_findings", + "judge_confidence", + "needs_human_review", + "rationale" + ], + "properties": { + "winner": { "type": "string", "enum": ["side_a", "side_b", "tie"] }, + "margin": { "type": "number" }, + "rubric_scores": { + "type": "object", + "additionalProperties": false, + "required": ["side_a", "side_b"], + "properties": { + "side_a": { + "type": "object", + "additionalProperties": false, + "required": [ + "claim_quality", + "evidence_discipline", + "responsiveness_to_critique", + "internal_consistency", + "decision_usefulness", + "weighted_total" + ], + "properties": { + "claim_quality": { "type": "number" }, + "evidence_discipline": { "type": "number" }, + "responsiveness_to_critique": { "type": "number" }, + "internal_consistency": { "type": "number" }, + "decision_usefulness": { "type": "number" }, + "weighted_total": { "type": "number", "minimum": 1, "maximum": 5 } + } + }, + "side_b": { + "type": "object", + "additionalProperties": false, + "required": [ + "claim_quality", + "evidence_discipline", + "responsiveness_to_critique", + "internal_consistency", + 
"decision_usefulness", + "weighted_total" + ], + "properties": { + "claim_quality": { "type": "number" }, + "evidence_discipline": { "type": "number" }, + "responsiveness_to_critique": { "type": "number" }, + "internal_consistency": { "type": "number" }, + "decision_usefulness": { "type": "number" }, + "weighted_total": { "type": "number", "minimum": 1, "maximum": 5 } + } + } + } + }, + "dimension_rationales": { + "type": "object", + "additionalProperties": false, + "required": [ + "claim_quality", + "evidence_discipline", + "responsiveness_to_critique", + "internal_consistency", + "decision_usefulness" + ], + "properties": { + "claim_quality": { "type": "string", "minLength": 1 }, + "evidence_discipline": { "type": "string", "minLength": 1 }, + "responsiveness_to_critique": { "type": "string", "minLength": 1 }, + "internal_consistency": { "type": "string", "minLength": 1 }, + "decision_usefulness": { "type": "string", "minLength": 1 } + } + }, + "side_summaries": { + "type": "object", + "additionalProperties": false, + "required": ["side_a", "side_b"], + "properties": { + "side_a": { + "type": "object", + "additionalProperties": false, + "required": ["strengths", "weaknesses"], + "properties": { + "strengths": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + }, + "weaknesses": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + } + } + }, + "side_b": { + "type": "object", + "additionalProperties": false, + "required": ["strengths", "weaknesses"], + "properties": { + "strengths": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + }, + "weaknesses": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + } + } + } + } + }, + "decisive_findings": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + }, + "decisive_dimension": { + "type": "string", + "enum": [ + "claim_quality", + 
"evidence_discipline", + "responsiveness_to_critique", + "internal_consistency", + "decision_usefulness" + ] + }, + "judge_confidence": { "type": "string", "enum": ["high", "medium", "low"] }, + "needs_human_review": { "type": "boolean" }, + "rationale": { "type": "string", "minLength": 1 } + } + } + } + }, + "budgets": { + "type": "object", + "additionalProperties": false, + "required": [ + "max_visible_rounds", + "max_cost_usd", + "max_latency_ms", + "min_margin_for_verdict" + ], + "properties": { + "max_visible_rounds": { "type": "integer" }, + "max_cost_usd": { "type": "number" }, + "max_latency_ms": { "type": "integer" }, + "min_margin_for_verdict": { "type": "number" } + } + }, + "results": { + "type": "object", + "additionalProperties": false, + "required": [ + "winner", + "margin", + "judge_confidence", + "swap_stable", + "needs_human_review" + ], + "properties": { + "winner": { "type": ["string", "null"], "enum": ["side_a", "side_b", "tie", null] }, + "margin": { "type": ["number", "null"] }, + "judge_confidence": { + "type": ["string", "null"], + "enum": ["high", "medium", "low", null] + }, + "swap_stable": { "type": ["boolean", "null"] }, + "needs_human_review": { "type": ["boolean", "null"] } + } + }, + "metrics": { + "type": "object", + "additionalProperties": false, + "required": [ + "disagreement_rate", + "declared_adoption_rate", + "substantive_revision_rate", + "unsupported_claim_count", + "self_contradiction_count", + "total_estimated_cost_usd", + "swap_margin_delta", + "judge_margin", + "cost_per_resolved_run" + ], + "properties": { + "disagreement_rate": { "type": ["number", "null"] }, + "declared_adoption_rate": { "type": ["number", "null"] }, + "substantive_revision_rate": { "type": ["number", "null"] }, + "unsupported_claim_count": { "type": ["integer", "null"] }, + "self_contradiction_count": { "type": ["integer", "null"] }, + "total_estimated_cost_usd": { "type": "number" }, + "swap_margin_delta": { "type": ["number", "null"] }, + 
"judge_margin": { "type": ["number", "null"] }, + "cost_per_resolved_run": { "type": ["number", "null"] } + } + }, + "audit": { + "type": "object", + "additionalProperties": false, + "required": ["identity_leak_warnings", "protocol_violations"], + "properties": { + "identity_leak_warnings": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "protocol_violations": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + } + } + } + } +} diff --git a/schemas/conflict-verdict.schema.json b/schemas/conflict-verdict.schema.json new file mode 100644 index 00000000..a2dc18a7 --- /dev/null +++ b/schemas/conflict-verdict.schema.json @@ -0,0 +1,146 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Shipwright Conflict Verdict Packet", + "type": "object", + "additionalProperties": false, + "required": [ + "winner", + "margin", + "rubric_scores", + "dimension_rationales", + "side_summaries", + "decisive_dimension", + "decisive_findings", + "judge_confidence", + "needs_human_review", + "rationale" + ], + "properties": { + "winner": { "type": "string", "enum": ["side_a", "side_b", "tie"] }, + "margin": { "type": "number" }, + "rubric_scores": { + "type": "object", + "additionalProperties": false, + "required": ["side_a", "side_b"], + "properties": { + "side_a": { + "type": "object", + "additionalProperties": false, + "required": [ + "claim_quality", + "evidence_discipline", + "responsiveness_to_critique", + "internal_consistency", + "decision_usefulness", + "weighted_total" + ], + "properties": { + "claim_quality": { "type": "number" }, + "evidence_discipline": { "type": "number" }, + "responsiveness_to_critique": { "type": "number" }, + "internal_consistency": { "type": "number" }, + "decision_usefulness": { "type": "number" }, + "weighted_total": { "type": "number", "minimum": 1, "maximum": 5 } + } + }, + "side_b": { + "type": "object", + "additionalProperties": false, + "required": [ + "claim_quality", + 
"evidence_discipline", + "responsiveness_to_critique", + "internal_consistency", + "decision_usefulness", + "weighted_total" + ], + "properties": { + "claim_quality": { "type": "number" }, + "evidence_discipline": { "type": "number" }, + "responsiveness_to_critique": { "type": "number" }, + "internal_consistency": { "type": "number" }, + "decision_usefulness": { "type": "number" }, + "weighted_total": { "type": "number", "minimum": 1, "maximum": 5 } + } + } + } + }, + "dimension_rationales": { + "type": "object", + "additionalProperties": false, + "required": [ + "claim_quality", + "evidence_discipline", + "responsiveness_to_critique", + "internal_consistency", + "decision_usefulness" + ], + "properties": { + "claim_quality": { "type": "string", "minLength": 1 }, + "evidence_discipline": { "type": "string", "minLength": 1 }, + "responsiveness_to_critique": { "type": "string", "minLength": 1 }, + "internal_consistency": { "type": "string", "minLength": 1 }, + "decision_usefulness": { "type": "string", "minLength": 1 } + } + }, + "side_summaries": { + "type": "object", + "additionalProperties": false, + "required": ["side_a", "side_b"], + "properties": { + "side_a": { + "type": "object", + "additionalProperties": false, + "required": ["strengths", "weaknesses"], + "properties": { + "strengths": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + }, + "weaknesses": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + } + } + }, + "side_b": { + "type": "object", + "additionalProperties": false, + "required": ["strengths", "weaknesses"], + "properties": { + "strengths": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + }, + "weaknesses": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + } + } + } + } + }, + "decisive_findings": { + "type": "array", + "minItems": 1, + "items": { "type": "string", "minLength": 1 } + }, + 
"decisive_dimension": { + "type": "string", + "enum": [ + "claim_quality", + "evidence_discipline", + "responsiveness_to_critique", + "internal_consistency", + "decision_usefulness" + ] + }, + "judge_confidence": { "type": "string", "enum": ["high", "medium", "low"] }, + "needs_human_review": { "type": "boolean" }, + "rationale": { "type": "string", "minLength": 1 } + } +} diff --git a/scripts/build-case-packet.mjs b/scripts/build-case-packet.mjs new file mode 100644 index 00000000..0ab18e55 --- /dev/null +++ b/scripts/build-case-packet.mjs @@ -0,0 +1,386 @@ +#!/usr/bin/env node + +import { readFileSync } from 'node:fs'; +import { readFile, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { pathToFileURL } from 'node:url'; + +import { + DEFAULT_SCENARIO_DIR, + loadBenchmarkScenario, +} from './run-benchmarks.mjs'; + +const SCHEMA_FILE_BY_NAME = Object.freeze({ + case: 'conflict-case.schema.json', + run: 'conflict-run.schema.json', + verdict: 'conflict-verdict.schema.json', +}); + +export const DEFAULT_CONFLICT_SCHEMA_DIR = path.resolve('schemas'); +export const DEFAULT_CONFLICT_MAX_ROUNDS = 3; +export const DEFAULT_TOOL_POLICY = 'none'; +export const DEFAULT_RUBRIC_DIMENSIONS = Object.freeze([ + 'claim quality', + 'evidence discipline', + 'responsiveness to critique', + 'internal consistency', + 'decision usefulness', +]); +export const DEFAULT_SHARING_POLICY = Object.freeze({ + share_case_packet: true, + share_committed_artifacts_after_first_pass: true, + share_critiques_after_open: true, + share_hidden_reasoning: false, + share_provider_identity: false, + share_internal_coalition_drafts: false, +}); + +const schemaCache = new Map(); + +export function loadConflictSchema(schemaName) { + if (!SCHEMA_FILE_BY_NAME[schemaName]) { + throw new Error(`Unsupported conflict schema: ${schemaName}`); + } + + if (schemaCache.has(schemaName)) { + return schemaCache.get(schemaName); + } + + const schemaPath = path.resolve(DEFAULT_CONFLICT_SCHEMA_DIR, 
/**
 * Validate `value` against a small subset of JSON Schema and collect errors.
 *
 * Supported keywords: `type` (one name or an array of names), `enum`,
 * `minLength`, `minimum`, `maximum`, `required`, `properties`,
 * `additionalProperties: false`, `minItems` and `items`. Other keywords are
 * ignored.
 *
 * A failed `type`, `enum`, `minLength`, `minimum` or `maximum` check records
 * one error for the node and stops validating that node. Object and array
 * checks keep going so that every offending child is reported.
 *
 * @param {*} value - Value to validate.
 * @param {object} schema - Schema node. A missing or non-object schema accepts any value.
 * @param {string} [currentPath='$'] - Path of `value`, used in error entries.
 * @param {Array<{path: string, message: string}>} [errors=[]] - Accumulator; mutated in place.
 * @returns {Array<{path: string, message: string}>} The same `errors` array.
 */
export function validateConflictValue(value, schema, currentPath = '$', errors = []) {
  if (!schema || typeof schema !== 'object') return errors;

  // Own-property test. The `in` operator also sees Object.prototype members,
  // so keys like "constructor" or "toString" would wrongly count as declared
  // in `properties` or as present on the validated object.
  const hasOwn = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
  const fail = (message) => {
    errors.push({ path: currentPath, message });
    return errors;
  };

  if ('type' in schema && !matchesSchemaType(value, schema.type)) {
    return fail(`Expected ${formatExpectedType(schema.type)}.`);
  }

  if ('enum' in schema && !schema.enum.includes(value)) {
    return fail(`Expected one of: ${schema.enum.map((entry) => String(entry)).join(', ')}.`);
  }

  // Scalar constraints are keyed on the runtime type of the value rather than
  // on `schema.type`, so they still apply when `type` is a union such as
  // ["string", "null"].
  if (typeof value === 'string') {
    if (typeof schema.minLength === 'number' && value.length < schema.minLength) {
      return fail(`Expected string length >= ${schema.minLength}.`);
    }
  }

  if (typeof value === 'number') {
    if (typeof schema.minimum === 'number' && value < schema.minimum) {
      return fail(`Expected value >= ${schema.minimum}.`);
    }
    if (typeof schema.maximum === 'number' && value > schema.maximum) {
      return fail(`Expected value <= ${schema.maximum}.`);
    }
  }

  const isPlainObject = value !== null && typeof value === 'object' && !Array.isArray(value);
  if (isPlainObject) {
    const properties = schema.properties || {};
    const required = schema.required || [];

    for (const key of required) {
      if (!hasOwn(value, key)) {
        errors.push({
          path: `${currentPath}.${key}`,
          message: 'Missing required property.',
        });
      }
    }

    if (schema.additionalProperties === false) {
      for (const key of Object.keys(value)) {
        if (!hasOwn(properties, key)) {
          errors.push({
            path: `${currentPath}.${key}`,
            message: 'Unexpected property.',
          });
        }
      }
    }

    for (const [key, childSchema] of Object.entries(properties)) {
      if (!hasOwn(value, key)) continue;
      validateConflictValue(value[key], childSchema, `${currentPath}.${key}`, errors);
    }
    return errors;
  }

  if (Array.isArray(value)) {
    if (typeof schema.minItems === 'number' && value.length < schema.minItems) {
      errors.push({
        path: currentPath,
        message: `Expected at least ${schema.minItems} item(s).`,
      });
    }

    if (schema.items) {
      value.forEach((item, index) => {
        validateConflictValue(item, schema.items, `${currentPath}[${index}]`, errors);
      });
    }
  }

  return errors;
}
scenario.inputs.context_files + : []; + const scoringSpecRef = scenario.inputs?.scoring_spec_ref || null; + + const packet = { + scenario_id: scenario.id, + title: scenario.title || scenario.id, + prompt: scenario.inputs?.prompt || '', + artifact_type: scenario.inputs?.expected_artifact_type || '', + rubric: { + dimensions: options.rubricDimensions || [...DEFAULT_RUBRIC_DIMENSIONS], + scoring_scale: '1-5', + expected_sections: expectedSections, + scoring_spec_ref: scoringSpecRef, + }, + constraints: { + expected_sections: expectedSections, + expect_structured: expectStructured, + context_files: contextFiles, + scoring_spec_ref: scoringSpecRef, + }, + evidence: contextFiles.map((filePath, index) => ({ + evidence_id: `ctx-${index + 1}`, + kind: 'context_file', + source_ref: filePath, + content: null, + confidence: null, + })), + max_rounds: options.maxRounds ?? DEFAULT_CONFLICT_MAX_ROUNDS, + tool_policy: options.toolPolicy || DEFAULT_TOOL_POLICY, + sharing_policy: { + ...DEFAULT_SHARING_POLICY, + ...(options.sharingPolicy || {}), + }, + success_condition: { + type: 'validator_contract', + description: + options.successDescription || + `Produce a ${scenario.inputs?.expected_artifact_type || 'signed-off'} artifact that satisfies the scenario validator contract.`, + validator: { + artifact_type: scenario.inputs?.expected_artifact_type || '', + expect_sections: expectedSections, + expect_structured: expectStructured, + }, + }, + }; + + const validation = validateConflictDocument(packet, 'case'); + if (validation.errors.length > 0) { + const details = validation.errors + .map((error) => `${error.path}: ${error.message}`) + .join('\n'); + throw new Error(`Generated case packet failed validation:\n${details}`); + } + + return packet; +} + +export async function loadCasePacket(filePath) { + const resolved = path.resolve(filePath); + const packet = JSON.parse(await readFile(resolved, 'utf8')); + const validation = validateConflictDocument(packet, 'case'); + if 
/**
 * Parse command-line arguments for the case-packet builder.
 *
 * Recognised flags: `--scenario`, `--scenario-dir`, `--out`, `--max-rounds`,
 * `--tool-policy`, `--format` (each takes one value) and `--help`.
 * Tokens that do not start with `--` and are not consumed as a flag value are
 * ignored.
 *
 * A value-taking flag only consumes the next token when that token is not
 * itself a `--flag`. Otherwise the flag is treated as having no value, so
 * `--scenario --out x` no longer stores "--out" as the scenario and silently
 * drops the `--out` option.
 *
 * @param {string[]} argv - Arguments, without the node binary and script path.
 * @returns {{scenario: string, scenarioDir: string, outPath: (string|null),
 *   maxRounds: number, toolPolicy: string, format: string, help?: boolean}}
 *   Parsed options; `help` is present only when `--help` was passed.
 * @throws {Error} On an unknown `--flag`, an unsupported format, a
 *   `--max-rounds` that is not a positive integer, or an unsupported tool policy.
 */
export function parseCliArgs(argv) {
  const parsed = {
    scenario: '',
    scenarioDir: DEFAULT_SCENARIO_DIR,
    outPath: null,
    maxRounds: DEFAULT_CONFLICT_MAX_ROUNDS,
    toolPolicy: DEFAULT_TOOL_POLICY,
    format: 'json',
  };

  // Each setter receives the raw value, or '' when no value was supplied.
  const valueFlags = new Map([
    ['--scenario', (raw) => { parsed.scenario = raw; }],
    ['--scenario-dir', (raw) => { parsed.scenarioDir = raw || DEFAULT_SCENARIO_DIR; }],
    ['--out', (raw) => { parsed.outPath = raw || null; }],
    // Number('') is 0, which the positive-integer check below rejects.
    ['--max-rounds', (raw) => { parsed.maxRounds = Number(raw); }],
    ['--tool-policy', (raw) => { parsed.toolPolicy = raw || DEFAULT_TOOL_POLICY; }],
    ['--format', (raw) => { parsed.format = raw || 'json'; }],
  ]);

  for (let index = 0; index < argv.length; index += 1) {
    const token = argv[index];

    if (token === '--help') {
      parsed.help = true;
      continue;
    }

    const assign = valueFlags.get(token);
    if (assign) {
      const next = argv[index + 1];
      const hasValue = typeof next === 'string' && !next.startsWith('--');
      assign(hasValue ? next : '');
      // Only skip the next token when it really was this flag's value.
      if (hasValue) index += 1;
      continue;
    }

    if (token.startsWith('--')) {
      throw new Error(`Unknown flag: ${token}`);
    }
  }

  if (!['json'].includes(parsed.format)) {
    throw new Error(`Unsupported format "${parsed.format}". Use json.`);
  }

  if (!Number.isInteger(parsed.maxRounds) || parsed.maxRounds <= 0) {
    throw new Error('--max-rounds must be a positive integer.');
  }

  if (!['none', 'symmetric'].includes(parsed.toolPolicy)) {
    throw new Error('Unsupported --tool-policy. Use none or symmetric.');
  }

  return parsed;
}
/**
 * Find conflict-run directories under a results root.
 *
 * The expected layout is `<rootDir>/<scenario>/<run>`, where a run is any
 * directory whose name starts with `conflict-`. Files and other directories
 * are ignored at both levels.
 *
 * @param {string} rootDir - Directory that contains one sub-directory per scenario.
 * @param {string[]} [scenarioFilters] - Scenario directory names to include.
 *   When empty or omitted, every scenario directory is scanned.
 * @returns {Promise<string[]>} Run directory paths, sorted with the default
 *   string ordering.
 * @throws {Error} When a requested scenario is not a directory in `rootDir`;
 *   `readdir` failures propagate unchanged.
 */
export async function discoverRunDirs(rootDir, scenarioFilters) {
  const rootEntries = await readdir(rootDir, { withFileTypes: true });
  const scenarioNames = rootEntries
    .filter((entry) => entry.isDirectory())
    .map((entry) => entry.name);

  const allowedScenarios = scenarioFilters && scenarioFilters.length > 0
    ? new Set(scenarioFilters)
    : null;

  if (allowedScenarios) {
    // One Set lookup per filter instead of rescanning the listing each time.
    const knownScenarios = new Set(scenarioNames);
    const unknownScenarios = [...allowedScenarios].filter((scenario) => !knownScenarios.has(scenario));
    if (unknownScenarios.length > 0) {
      throw new Error(`Unknown scenario(s): ${unknownScenarios.join(', ')}`);
    }
  }

  const selectedScenarios = allowedScenarios
    ? scenarioNames.filter((name) => allowedScenarios.has(name))
    : scenarioNames;

  // Scenario directories are independent, so read them concurrently; the
  // final sort makes the result independent of completion order.
  const runDirsPerScenario = await Promise.all(
    selectedScenarios.map(async (name) => {
      const scenarioPath = path.join(rootDir, name);
      const entries = await readdir(scenarioPath, { withFileTypes: true });
      return entries
        .filter((entry) => entry.isDirectory() && entry.name.startsWith('conflict-'))
        .map((entry) => path.join(scenarioPath, entry.name));
    }),
  );

  return runDirsPerScenario.flat().sort();
}
options.runDirs.map((runDir) => path.resolve(runDir)) + : await discoverRunDirs(rootDir, options.scenarios); + + const results = []; + const totalRuns = runDirs.length; + + for (let index = 0; index < runDirs.length; index += 1) { + const runDir = runDirs[index]; + const label = `[${index + 1}/${totalRuns}] ${path.basename(runDir)}`; + process.stderr.write(`${label} — rejudging\n`); + + try { + const run = JSON.parse(await readFile(path.join(runDir, 'run.json'), 'utf8')); + const result = await rejudgeConflictRun({ + runDir, + judgeAgent: options.judgeAgent, + judgeCommand: options.judgeCommand, + judgeProvider: options.judgeProvider, + judgeModel: options.judgeModel, + judgeReasoningEffort: options.judgeReasoningEffort, + label: options.label, + cwd: options.cwd, + timeoutMs: options.timeoutMs, + turnRunner: options.turnRunner, + }); + + results.push({ + runDir, + runId: run.run_id, + scenario: run.scenario_id, + status: 'completed', + winner: result.verdict.winner, + margin: result.verdict.margin, + judgeConfidence: result.verdict.judge_confidence, + needsHumanReview: result.verdict.needs_human_review, + outputDir: result.outputDir, + error: null, + }); + process.stderr.write(`${label} — completed, winner: ${result.verdict.winner}\n`); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + let runId = null; + let scenario = path.basename(path.dirname(runDir)); + try { + const run = JSON.parse(await readFile(path.join(runDir, 'run.json'), 'utf8')); + runId = run.run_id; + scenario = run.scenario_id; + } catch { + // Leave fallback values. 
+ } + + results.push({ + runDir, + runId, + scenario, + status: 'error', + winner: null, + margin: null, + judgeConfidence: null, + needsHumanReview: null, + outputDir: null, + error: message, + }); + process.stderr.write(`${label} — ERROR: ${message}\n`); + } + } + + return results; +} + +export function buildBatchSummary(results, judgeLabel) { + const lines = []; + lines.push('# Rejudge Batch Summary'); + lines.push(''); + lines.push(`Judge: ${judgeLabel}`); + lines.push(`Runs attempted: ${results.length}`); + lines.push(`Errors: ${results.filter((result) => result.status === 'error').length}`); + lines.push(''); + lines.push('| Scenario | Run ID | Status | Winner | Margin | Confidence | Human Review |'); + lines.push('|---|---|---|---|---|---|---|'); + + for (const result of results) { + lines.push([ + '', + result.scenario || '—', + result.runId || '—', + result.status, + result.winner || '—', + result.margin != null ? result.margin.toFixed(2) : '—', + result.judgeConfidence || '—', + result.needsHumanReview != null ? 
String(result.needsHumanReview) : '—', + '', + ].join(' | ')); + } + + return `${lines.join('\n')}\n`; +} + +export function parseCliArgs(argv) { + const parsed = { + rootDir: DEFAULT_ROOT_DIR, + scenarios: [], + runDirs: [], + outPath: null, + judgeAgent: 'gemini', + judgeCommand: '', + judgeProvider: '', + judgeModel: '', + judgeReasoningEffort: 'medium', + label: '', + cwd: process.cwd(), + timeoutMs: 120000, + format: 'text', + }; + + for (let index = 0; index < argv.length; index += 1) { + const token = argv[index]; + switch (token) { + case '--root-dir': + parsed.rootDir = argv[index + 1] || DEFAULT_ROOT_DIR; + index += 1; + break; + case '--scenario': + parsed.scenarios.push(argv[index + 1] || ''); + index += 1; + break; + case '--run-dir': + parsed.runDirs.push(argv[index + 1] || ''); + index += 1; + break; + case '--out': + parsed.outPath = argv[index + 1] || null; + index += 1; + break; + case '--judge-agent': + parsed.judgeAgent = argv[index + 1] || 'gemini'; + index += 1; + break; + case '--judge-command': + parsed.judgeCommand = argv[index + 1] || ''; + index += 1; + break; + case '--judge-provider': + parsed.judgeProvider = argv[index + 1] || ''; + index += 1; + break; + case '--judge-model': + parsed.judgeModel = argv[index + 1] || ''; + index += 1; + break; + case '--judge-reasoning-effort': + parsed.judgeReasoningEffort = argv[index + 1] || 'medium'; + index += 1; + break; + case '--label': + parsed.label = argv[index + 1] || ''; + index += 1; + break; + case '--cwd': + parsed.cwd = argv[index + 1] || process.cwd(); + index += 1; + break; + case '--timeout-ms': + parsed.timeoutMs = Number(argv[index + 1] || ''); + index += 1; + break; + case '--format': + parsed.format = argv[index + 1] || 'text'; + index += 1; + break; + case '--help': + parsed.help = true; + break; + default: + if (token.startsWith('--')) { + throw new Error(`Unknown flag: ${token}`); + } + break; + } + } + + if (!VALID_FORMATS.has(parsed.format)) { + throw new Error(`Unsupported 
format "${parsed.format}". Use text or json.`); + } + + return parsed; +} + +export async function main(argv = process.argv.slice(2)) { + const args = parseCliArgs(argv); + if (args.help) { + console.log( + 'Usage: node scripts/rejudge-conflict-batch.mjs [--scenario scenario-id] [--run-dir completed-run-dir] [--judge-agent gemini] [--out summary.md] [--format text|json]', + ); + return; + } + + const results = await runRejudgeBatch({ + rootDir: args.rootDir, + scenarios: args.scenarios.length > 0 ? args.scenarios : undefined, + runDirs: args.runDirs.length > 0 ? args.runDirs : undefined, + judgeAgent: args.judgeAgent, + judgeCommand: args.judgeCommand || undefined, + judgeProvider: args.judgeProvider || undefined, + judgeModel: args.judgeModel || undefined, + judgeReasoningEffort: args.judgeReasoningEffort, + label: args.label || undefined, + cwd: args.cwd, + timeoutMs: args.timeoutMs, + }); + + if (args.format === 'json') { + const output = JSON.stringify(results, null, 2); + if (args.outPath) { + await writeFile(args.outPath, `${output}\n`, 'utf8'); + } + console.log(output); + return; + } + + const judgeLabel = args.label || `${args.judgeAgent}-judge`; + const summary = buildBatchSummary(results, judgeLabel); + if (args.outPath) { + await writeFile(args.outPath, summary, 'utf8'); + } + console.log(summary.trimEnd()); +} + +const isMainModule = process.argv[1] && import.meta.url === new URL(`file://${process.argv[1]}`).href; + +if (isMainModule) { + main().catch((error) => { + process.stderr.write(`${error instanceof Error ? 
error.message : String(error)}\n`); + process.exitCode = 1; + }); +} diff --git a/scripts/rejudge-conflict-run.mjs b/scripts/rejudge-conflict-run.mjs new file mode 100644 index 00000000..6ea9fc5f --- /dev/null +++ b/scripts/rejudge-conflict-run.mjs @@ -0,0 +1,563 @@ +#!/usr/bin/env node + +import { existsSync } from 'node:fs'; +import { mkdir, readFile, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { createShellTurnRunner } from './run-conflict-harness.mjs'; +import { validateConflictDocument } from './build-case-packet.mjs'; +import { AGENT_PROFILES } from './run-conflict-batch.mjs'; + +const DEFAULT_REASONING_EFFORT = 'medium'; +const DEFAULT_TIMEOUT_MS = 120000; +const VALID_FORMATS = new Set(['text', 'json']); +const MAX_GEMINI_TURN_ATTEMPTS = 3; + +export async function rejudgeConflictRun(options = {}) { + const runDir = normalizeRequiredPath(options.runDir, '--run-dir'); + const judgeAgent = options.judgeAgent || 'gemini'; + const profile = resolveJudgeProfile(judgeAgent); + const judgeLabel = options.label || `${judgeAgent}-judge`; + const judgeCommand = options.judgeCommand || profile.command; + const judgeProvider = options.judgeProvider || profile.provider; + const judgeModel = options.judgeModel || profile.model; + const judgeReasoningEffort = normalizeRequiredString( + options.judgeReasoningEffort || DEFAULT_REASONING_EFFORT, + '--judge-reasoning-effort', + ); + const timeoutMs = normalizePositiveInteger(options.timeoutMs || DEFAULT_TIMEOUT_MS, '--timeout-ms'); + const cwd = path.resolve(options.cwd || process.cwd()); + + const existingJudgeDir = path.join(runDir, 'judge'); + const promptFilePath = path.join(existingJudgeDir, 'verdict.prompt.txt'); + const packetFilePath = path.join(existingJudgeDir, 'verdict.input.json'); + const runPath = path.join(runDir, 'run.json'); + + const [savedPrompt, packetText, runText] = await Promise.all([ + readFile(promptFilePath, 'utf8'), + readFile(packetFilePath, 'utf8'), + 
readFile(runPath, 'utf8'), + ]); + + const packet = JSON.parse(packetText); + const run = JSON.parse(runText); + const outputDir = path.join(runDir, 'rejudges', judgeLabel); + await mkdir(outputDir, { recursive: true }); + const prompt = buildReplayJudgePrompt(savedPrompt); + const repairTelemetry = { + repair_attempted: false, + repair_attempts: 0, + }; + + await Promise.all([ + writeFile(path.join(outputDir, 'verdict.prompt.txt'), prompt), + writeFile(path.join(outputDir, 'verdict.input.json'), `${JSON.stringify(packet, null, 2)}\n`), + ]); + + const rawOutputPath = path.join(outputDir, 'verdict.raw.txt'); + const turnRunner = options.turnRunner || createShellTurnRunner({ shell: resolveReplayShell() }); + const response = await runJudgeTurnWithRetries({ + judgeAgent, + turnRunner, + turnOptions: { + phase: 'judge', + sideId: null, + runId: run.run_id, + prompt, + packet, + cwd, + timeoutMs, + command: judgeCommand, + reasoningEffort: judgeReasoningEffort, + outDir: outputDir, + promptFilePath, + packetFilePath, + attempt: 0, + }, + }); + + if (typeof response.exitCode === 'number' && response.exitCode !== 0) { + throw new Error( + `judge turn failed with exit code ${response.exitCode}: ${response.stderr || ''}`.trim(), + ); + } + + if (typeof response.stdout !== 'string') { + throw new Error('Judge turn did not return stdout.'); + } + + await writeFile(rawOutputPath, response.stdout); + const verdict = await parseAndValidateVerdict({ + output: response.stdout, + judgeAgent, + judgeCommand, + judgeReasoningEffort, + runId: run.run_id, + cwd, + timeoutMs, + turnRunner, + outDir: outputDir, + promptFilePath, + packetFilePath, + repairTelemetry, + }); + + const metadata = { + source_run_dir: runDir, + source_run_id: run.run_id, + judge: { + label: judgeLabel, + agent: judgeAgent, + provider: judgeProvider, + model: judgeModel, + reasoning_effort: judgeReasoningEffort, + command: judgeCommand, + }, + replay: repairTelemetry, + }; + + await Promise.all([ + 
writeFile(path.join(outputDir, 'verdict.json'), `${JSON.stringify(verdict, null, 2)}\n`), + writeFile(path.join(outputDir, 'verdict.md'), formatVerdictMarkdown(verdict)), + writeFile(path.join(outputDir, 'metadata.json'), `${JSON.stringify(metadata, null, 2)}\n`), + ]); + + return { + outputDir, + verdict, + metadata, + }; +} + +export function parseCliArgs(argv) { + const parsed = { + runDir: '', + judgeAgent: 'gemini', + judgeCommand: '', + judgeProvider: '', + judgeModel: '', + judgeReasoningEffort: DEFAULT_REASONING_EFFORT, + label: '', + cwd: process.cwd(), + timeoutMs: DEFAULT_TIMEOUT_MS, + format: 'text', + }; + + for (let index = 0; index < argv.length; index += 1) { + const token = argv[index]; + switch (token) { + case '--run-dir': + parsed.runDir = argv[index + 1] || ''; + index += 1; + break; + case '--judge-agent': + parsed.judgeAgent = argv[index + 1] || 'gemini'; + index += 1; + break; + case '--judge-command': + parsed.judgeCommand = argv[index + 1] || ''; + index += 1; + break; + case '--judge-provider': + parsed.judgeProvider = argv[index + 1] || ''; + index += 1; + break; + case '--judge-model': + parsed.judgeModel = argv[index + 1] || ''; + index += 1; + break; + case '--judge-reasoning-effort': + parsed.judgeReasoningEffort = argv[index + 1] || DEFAULT_REASONING_EFFORT; + index += 1; + break; + case '--label': + parsed.label = argv[index + 1] || ''; + index += 1; + break; + case '--cwd': + parsed.cwd = argv[index + 1] || process.cwd(); + index += 1; + break; + case '--timeout-ms': + parsed.timeoutMs = Number(argv[index + 1] || ''); + index += 1; + break; + case '--format': + parsed.format = argv[index + 1] || 'text'; + index += 1; + break; + case '--help': + parsed.help = true; + break; + default: + if (token.startsWith('--')) { + throw new Error(`Unknown flag: ${token}`); + } + break; + } + } + + if (!VALID_FORMATS.has(parsed.format)) { + throw new Error(`Unsupported format "${parsed.format}". 
Use text or json.`); + } + + return parsed; +} + +export async function main(argv = process.argv.slice(2)) { + const args = parseCliArgs(argv); + if (args.help) { + console.log( + 'Usage: node scripts/rejudge-conflict-run.mjs --run-dir [--judge-agent gemini] [--label gemini-judge] [--format text|json]', + ); + return; + } + + const result = await rejudgeConflictRun({ + runDir: args.runDir, + judgeAgent: args.judgeAgent, + judgeCommand: args.judgeCommand || undefined, + judgeProvider: args.judgeProvider || undefined, + judgeModel: args.judgeModel || undefined, + judgeReasoningEffort: args.judgeReasoningEffort, + label: args.label || undefined, + cwd: args.cwd, + timeoutMs: args.timeoutMs, + }); + + if (args.format === 'json') { + console.log(JSON.stringify(result, null, 2)); + return; + } + + console.log(formatResultSummary(result)); +} + +function resolveJudgeProfile(judgeAgent) { + const profile = AGENT_PROFILES[judgeAgent]; + if (!profile) { + const supported = Object.keys(AGENT_PROFILES).join(', '); + throw new Error(`Unknown judge agent "${judgeAgent}". Supported agents: ${supported}`); + } + return profile; +} + +function normalizeRequiredPath(value, flagName) { + const normalized = normalizeRequiredString(value, flagName); + return path.resolve(normalized); +} + +function normalizeRequiredString(value, flagName) { + if (typeof value === 'string' && value.trim().length > 0) { + return value.trim(); + } + throw new Error(`Missing required ${flagName}.`); +} + +function normalizePositiveInteger(value, flagName) { + if (!Number.isInteger(value) || value <= 0) { + throw new Error(`${flagName} must be a positive integer.`); + } + return value; +} + +function parseJsonResponse(output) { + const candidate = extractJsonCandidate(output); + try { + return JSON.parse(candidate); + } catch (error) { + throw new Error(`Model output is not valid JSON: ${error instanceof Error ? 
error.message : String(error)}`); + } +} + +function resolveReplayShell() { + if (process.platform !== 'win32') { + return undefined; + } + + const gitBashCandidates = [ + 'C:\\Program Files\\Git\\bin\\bash.exe', + 'C:\\Program Files (x86)\\Git\\bin\\bash.exe', + ]; + + return gitBashCandidates.find((candidate) => existsSync(candidate)); +} + +async function runJudgeTurnWithRetries(options) { + let lastResponse = null; + let lastFailure = null; + + for (let attempt = 0; attempt < MAX_GEMINI_TURN_ATTEMPTS; attempt += 1) { + const response = await options.turnRunner({ + ...options.turnOptions, + attempt, + }); + lastResponse = response; + + if (!shouldRetryJudgeResponse(response, options.judgeAgent) || attempt === MAX_GEMINI_TURN_ATTEMPTS - 1) { + return response; + } + + lastFailure = response.stderr || response.spawnError || 'unknown judge turn failure'; + } + + if (lastResponse) { + return lastResponse; + } + + throw new Error(`Judge turn failed before producing a response: ${lastFailure || 'unknown error'}`); +} + +async function parseAndValidateVerdict(options) { + const parsed = parseJsonResponse(options.output); + const validation = validateConflictDocument(parsed, 'verdict'); + if (validation.errors.length === 0) { + return parsed; + } + + if (!shouldAttemptVerdictRepair(validation.errors, options.judgeAgent)) { + throw new Error(formatValidationErrors('Verdict packet failed validation', validation.errors)); + } + + const repaired = await repairVerdict(options, parsed, validation.errors); + const repairedValidation = validateConflictDocument(repaired, 'verdict'); + if (repairedValidation.errors.length > 0) { + throw new Error(formatValidationErrors('Verdict packet failed validation', repairedValidation.errors)); + } + + return repaired; +} + +function shouldAttemptVerdictRepair(errors, judgeAgent) { + return errors.every((error) => + error.message === 'Missing required property.' 
&& isRepairableVerdictValidationPath(error.path), + ); +} + +function isRepairableVerdictValidationPath(pathValue) { + return pathValue === '$.dimension_rationales' || + pathValue === '$.side_summaries' || + pathValue === '$.decisive_dimension' || + pathValue === '$.rubric_scores.side_a.weighted_total' || + pathValue === '$.rubric_scores.side_b.weighted_total'; +} + +function shouldRetryJudgeResponse(response, judgeAgent) { + if (judgeAgent !== 'gemini') { + return false; + } + + if (typeof response?.exitCode !== 'number' || response.exitCode === 0) { + return false; + } + + const stderr = `${response.stderr || ''}\n${response.spawnError || ''}`; + return /ERR_STREAM_PREMATURE_CLOSE|Premature close/i.test(stderr); +} + +async function repairVerdict(options, parsedVerdict, errors) { + options.repairTelemetry.repair_attempted = true; + options.repairTelemetry.repair_attempts += 1; + const repairPrompt = [ + 'Your previous judge output was close but rejected by schema validation.', + 'Rewrite it as ONLY a JSON object that preserves the same winner, margin, rubric_scores, decisive_dimension, judge_confidence, needs_human_review, decisive_findings, and rationale unless the schema absolutely requires clarification.', + 'Any rubric_scores.*.weighted_total value must be a normalized aggregate on the same 1-5 scale as the rubric dimensions, not a raw sum.', + 'You MUST include these missing required properties with substantive content:', + '- dimension_rationales', + '- side_summaries', + '- decisive_dimension', + '- rubric_scores.side_a.weighted_total', + '- rubric_scores.side_b.weighted_total', + '', + 'Required schema shape:', + JSON.stringify(buildVerdictShapeExample(), null, 2), + '', + `Validation errors:\n${formatValidationErrors('Verdict packet failed validation', errors)}`, + '', + 'Previous verdict JSON:', + JSON.stringify(parsedVerdict, null, 2), + ].join('\n'); + + const repairPromptPath = path.join(options.outDir, 'verdict.repair.prompt.txt'); + const 
repairOutputPath = path.join(options.outDir, 'verdict.repair.raw.txt'); + await writeFile(repairPromptPath, `${repairPrompt}\n`); + + const response = await options.turnRunner({ + phase: 'judge', + sideId: null, + runId: options.runId, + prompt: repairPrompt, + packet: parsedVerdict, + cwd: options.cwd, + timeoutMs: options.timeoutMs, + command: options.judgeCommand, + reasoningEffort: options.judgeReasoningEffort, + outDir: options.outDir, + promptFilePath: repairPromptPath, + packetFilePath: options.packetFilePath, + attempt: 1, + }); + + if (typeof response.exitCode === 'number' && response.exitCode !== 0) { + throw new Error( + `judge repair turn failed with exit code ${response.exitCode}: ${response.stderr || ''}`.trim(), + ); + } + + if (typeof response.stdout !== 'string') { + throw new Error('Judge repair turn did not return stdout.'); + } + + await writeFile(repairOutputPath, response.stdout); + return parseJsonResponse(response.stdout); +} + +function extractJsonCandidate(output) { + const trimmed = output.trim(); + if (trimmed.startsWith('{')) return trimmed; + + const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/i); + if (fenced) { + return fenced[1].trim(); + } + + const firstBrace = trimmed.indexOf('{'); + const lastBrace = trimmed.lastIndexOf('}'); + if (firstBrace !== -1 && lastBrace !== -1 && lastBrace > firstBrace) { + return trimmed.slice(firstBrace, lastBrace + 1); + } + + return trimmed; +} + +function formatValidationErrors(label, errors) { + return `${label}:\n${errors.map((error) => `${error.path}: ${error.message}`).join('\n')}`; +} + +function buildReplayJudgePrompt(savedPrompt) { + return [ + savedPrompt.trim(), + '', + 'Replay addendum:', + '- This replay will be rejected unless EVERY required property is present.', + '- Do not omit `dimension_rationales`.', + '- Do not omit `side_summaries`.', + '- `rubric_scores.side_a.weighted_total` and `rubric_scores.side_b.weighted_total` must each stay within the same 1-5 scale as the 
rubric dimensions.', + '- Return ONLY the JSON object. No markdown fences, no prose before or after.', + ].join('\n'); +} + +function buildVerdictShapeExample() { + return { + winner: 'tie', + margin: 0, + rubric_scores: { + side_a: { + claim_quality: 3, + evidence_discipline: 3, + responsiveness_to_critique: 3, + internal_consistency: 3, + decision_usefulness: 3, + weighted_total: 3, + }, + side_b: { + claim_quality: 3, + evidence_discipline: 3, + responsiveness_to_critique: 3, + internal_consistency: 3, + decision_usefulness: 3, + weighted_total: 3, + }, + }, + dimension_rationales: { + claim_quality: 'Which side made the stronger claims and why.', + evidence_discipline: 'How each side used or overstated evidence.', + responsiveness_to_critique: 'How each side responded to the critique phase.', + internal_consistency: 'Contradictions, coherence, or missing logic.', + decision_usefulness: 'Which artifact better supports an actual decision.', + }, + side_summaries: { + side_a: { + strengths: ['One concise strength of Side A.'], + weaknesses: ['One concise weakness of Side A.'], + }, + side_b: { + strengths: ['One concise strength of Side B.'], + weaknesses: ['One concise weakness of Side B.'], + }, + }, + decisive_dimension: 'decision_usefulness', + decisive_findings: ['Replace with the actual decisive findings from this run.'], + judge_confidence: 'low', + needs_human_review: true, + rationale: 'One-paragraph explanation of the verdict.', + }; +} + +function formatVerdictMarkdown(verdict) { + return [ + `# Verdict: ${verdict.winner}`, + '', + `- Margin: ${verdict.margin}`, + `- Confidence: ${verdict.judge_confidence}`, + `- Needs human review: ${verdict.needs_human_review}`, + '', + '## Dimension Rationales', + '', + `- Claim Quality: ${verdict.dimension_rationales?.claim_quality || 'n/a'}`, + `- Evidence Discipline: ${verdict.dimension_rationales?.evidence_discipline || 'n/a'}`, + `- Responsiveness To Critique: 
${verdict.dimension_rationales?.responsiveness_to_critique || 'n/a'}`, + `- Internal Consistency: ${verdict.dimension_rationales?.internal_consistency || 'n/a'}`, + `- Decision Usefulness: ${verdict.dimension_rationales?.decision_usefulness || 'n/a'}`, + '', + '## Side Summaries', + '', + '### Side A Strengths', + ...((verdict.side_summaries?.side_a?.strengths || []).map((entry) => `- ${entry}`)), + '', + '### Side A Weaknesses', + ...((verdict.side_summaries?.side_a?.weaknesses || []).map((entry) => `- ${entry}`)), + '', + '### Side B Strengths', + ...((verdict.side_summaries?.side_b?.strengths || []).map((entry) => `- ${entry}`)), + '', + '### Side B Weaknesses', + ...((verdict.side_summaries?.side_b?.weaknesses || []).map((entry) => `- ${entry}`)), + '', + `## Decisive Dimension`, + '', + verdict.decisive_dimension || 'n/a', + '', + '## Rationale', + '', + verdict.rationale, + '', + '## Decisive Findings', + '', + ...(verdict.decisive_findings || []).map((finding) => `- ${finding}`), + '', + '## Rubric Scores', + '', + `- Side A weighted total: ${verdict.rubric_scores?.side_a?.weighted_total ?? 'n/a'}`, + `- Side B weighted total: ${verdict.rubric_scores?.side_b?.weighted_total ?? 'n/a'}`, + '', + ].join('\n'); +} + +function formatResultSummary(result) { + return [ + `Rejudge output: ${result.outputDir}`, + `Winner: ${result.verdict.winner}`, + `Margin: ${result.verdict.margin}`, + `Confidence: ${result.verdict.judge_confidence}`, + `Needs human review: ${result.verdict.needs_human_review}`, + ].join('\n'); +} + +const isMainModule = process.argv[1] && import.meta.url === new URL(`file://${process.argv[1]}`).href; + +if (isMainModule) { + main().catch((error) => { + process.stderr.write(`${error instanceof Error ? 
error.message : String(error)}\n`); + process.exitCode = 1; + }); +} diff --git a/scripts/run-conflict-batch.mjs b/scripts/run-conflict-batch.mjs new file mode 100644 index 00000000..509fd07f --- /dev/null +++ b/scripts/run-conflict-batch.mjs @@ -0,0 +1,484 @@ +#!/usr/bin/env node + +import { readdir, readFile, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { pathToFileURL } from 'node:url'; +import { runConflictHarness } from './run-conflict-harness.mjs'; + +const DEFAULT_SCENARIO_DIR = path.resolve('benchmarks', 'scenarios'); +const DEFAULT_REASONING_EFFORT = 'medium'; +export const AGENT_PROFILES = { + claude: { + id: 'claude', + label: 'claude', + command: "cat {{prompt_file}} | claude -p --no-session-persistence --output-format text", + provider: 'anthropic', + model: 'claude-max', + }, + gpt: { + id: 'gpt', + label: 'gpt', + command: "cat {{prompt_file}} | codex exec --ephemeral --sandbox read-only", + provider: 'openai', + model: 'chatgpt-pro', + }, + gemini: { + id: 'gemini', + label: 'gemini', + command: 'cat {{prompt_file}} | gemini --approval-mode plan --output-format text -p "Use stdin as the full task. Return only the requested JSON object."', + provider: 'google', + model: 'gemini-2.5-flash-lite', + }, +}; + +const DEFAULT_SIDE_A_AGENT = 'claude'; +const DEFAULT_SIDE_B_AGENT = 'gpt'; +const DEFAULT_JUDGE_AGENTS = ['claude', 'gpt']; + +function resolveAgentProfile(agentId, roleName) { + const profile = AGENT_PROFILES[agentId]; + if (!profile) { + const supported = Object.keys(AGENT_PROFILES).join(', '); + throw new Error(`Unknown ${roleName} "${agentId}". Supported agents: ${supported}`); + } + return { ...profile }; +} + +function createRoleConfig(options = {}) { + if (options.swapSides && (options.sideAAgent || options.sideBAgent)) { + throw new Error('Use either --swap-sides or explicit --side-a-agent/--side-b-agent flags, not both.'); + } + + const defaultSideAAgent = options.swapSides ? 
DEFAULT_SIDE_B_AGENT : DEFAULT_SIDE_A_AGENT; + const defaultSideBAgent = options.swapSides ? DEFAULT_SIDE_A_AGENT : DEFAULT_SIDE_B_AGENT; + const sideA = resolveAgentProfile(options.sideAAgent || defaultSideAAgent, 'side-a agent'); + const sideB = resolveAgentProfile(options.sideBAgent || defaultSideBAgent, 'side-b agent'); + + if (sideA.id === sideB.id) { + throw new Error(`Side A and Side B must use different agents. Both were set to "${sideA.id}".`); + } + + const judgeIds = options.judgeAgents && options.judgeAgents.length > 0 + ? options.judgeAgents + : DEFAULT_JUDGE_AGENTS; + const judges = judgeIds.map((judgeId) => { + const judge = resolveAgentProfile(judgeId, 'judge agent'); + return { + ...judge, + label: `${judge.label}-judge`, + }; + }); + + return { sideA, sideB, judges }; +} + +export async function runBatch(options = {}) { + const scenarioDir = options.scenarioDir || DEFAULT_SCENARIO_DIR; + const scenarios = await listScenarios(scenarioDir, options.scenarios); + const roleConfig = createRoleConfig(options); + const sideACommand = options.sideACommand || roleConfig.sideA.command; + const sideBCommand = options.sideBCommand || roleConfig.sideB.command; + const judgeConfigs = options.judgeConfigs || roleConfig.judges; + const dryRun = Boolean(options.dryRun); + const sideAReasoningEffort = options.sideAReasoningEffort || DEFAULT_REASONING_EFFORT; + const sideBReasoningEffort = options.sideBReasoningEffort || DEFAULT_REASONING_EFFORT; + const judgeReasoningEffort = options.judgeReasoningEffort || DEFAULT_REASONING_EFFORT; + + const results = []; + const totalRuns = scenarios.length * judgeConfigs.length; + let completed = 0; + + for (const scenario of scenarios) { + for (const judge of judgeConfigs) { + completed += 1; + const label = `[${completed}/${totalRuns}] ${scenario} + ${judge.label}`; + process.stderr.write(`${label} — starting\n`); + + if (dryRun) { + results.push({ + scenario, + judgeLabel: judge.label, + sideALabel: roleConfig.sideA.label, + 
sideBLabel: roleConfig.sideB.label, + status: 'dry_run', + winner: null, + margin: null, + judgeConfidence: null, + needsHumanReview: null, + disagreementRate: null, + declaredAdoptionRate: null, + substantiveRevisionRate: null, + unsupportedClaimCount: null, + runId: null, + error: null, + }); + process.stderr.write(`${label} — skipped (dry run)\n`); + continue; + } + + try { + const { run } = await runConflictHarness({ + scenario, + scenarioDir, + sideACommand, + sideBCommand, + judgeCommand: judge.command, + sideAProvider: roleConfig.sideA.provider, + sideAModel: roleConfig.sideA.model, + sideAReasoningEffort, + sideBProvider: roleConfig.sideB.provider, + sideBModel: roleConfig.sideB.model, + sideBReasoningEffort, + judgeProvider: judge.provider, + judgeModel: judge.model, + judgeReasoningEffort, + }); + + results.push({ + scenario, + judgeLabel: judge.label, + sideALabel: roleConfig.sideA.label, + sideBLabel: roleConfig.sideB.label, + status: run.status, + winner: run.results.winner, + margin: run.results.margin, + judgeConfidence: run.results.judge_confidence, + needsHumanReview: run.results.needs_human_review, + disagreementRate: run.metrics.disagreement_rate, + declaredAdoptionRate: run.metrics.declared_adoption_rate, + substantiveRevisionRate: run.metrics.substantive_revision_rate, + unsupportedClaimCount: run.metrics.unsupported_claim_count, + runId: run.run_id, + error: null, + }); + + process.stderr.write(`${label} — ${run.status}, winner: ${run.results.winner || 'none'}\n`); + } catch (error) { + const message = error instanceof Error ? 
/**
 * Render batch results as a Markdown report.
 *
 * The report always contains the headline counts and a per-run table. When
 * exactly two distinct judge labels are present it also contains a
 * judge-agreement section, and when any result carries an `error` it ends
 * with an error list.
 *
 * @param {Array<object>} results - Result records with the fields read below:
 *   scenario, judgeLabel, sideALabel, sideBLabel, status, winner, margin,
 *   judgeConfidence, needsHumanReview, disagreementRate,
 *   declaredAdoptionRate, substantiveRevisionRate, error.
 * @returns {string} Markdown text terminated by a newline.
 */
export function buildSummary(results) {
  const lines = [];
  const fixed2 = (value) => (value != null ? value.toFixed(2) : '—');

  lines.push('# Conflict Harness Batch Summary');
  lines.push(`\nRuns completed: ${results.length}`);
  lines.push(`Errors: ${results.filter((r) => r.status === 'error').length}`);
  lines.push('');

  // Per-run results table
  lines.push('## Run Results');
  lines.push('');
  lines.push('| Scenario | Side A | Side B | Judge | Status | Winner | Margin | Confidence | Human Review | Disagreement | Declared | Revised |');
  lines.push('|---|---|---|---|---|---|---|---|---|---|---|---|');

  for (const r of results) {
    // The empty first/last cells produce the leading and trailing pipes.
    lines.push([
      '',
      r.scenario,
      r.sideALabel || '—',
      r.sideBLabel || '—',
      r.judgeLabel,
      r.status,
      r.winner || '—',
      fixed2(r.margin),
      r.judgeConfidence || '—',
      r.needsHumanReview != null ? String(r.needsHumanReview) : '—',
      fixed2(r.disagreementRate),
      fixed2(r.declaredAdoptionRate),
      fixed2(r.substantiveRevisionRate),
      '',
    ].join(' | '));
  }

  // Judge agreement analysis (only meaningful for exactly two judges)
  const scenarios = [...new Set(results.map((r) => r.scenario))];
  const judgeLabels = [...new Set(results.map((r) => r.judgeLabel))];

  if (judgeLabels.length === 2) {
    lines.push('');
    lines.push('## Judge Agreement Analysis');
    lines.push('');
    lines.push(`| Scenario | ${judgeLabels.join(' Winner | ')} Winner | Agree? | Margin Delta |`);
    lines.push('|---|---|---|---|---|');

    let agreements = 0;
    let comparisons = 0;
    const marginDeltas = [];

    for (const scenario of scenarios) {
      const runs = judgeLabels.map((label) =>
        results.find((r) => r.scenario === scenario && r.judgeLabel === label)
      );

      // A scenario only counts when both judges produced a completed run.
      if (runs.some((r) => !r || r.status !== 'completed')) {
        lines.push(`| ${scenario} | ${runs.map((r) => r?.winner || 'ERROR').join(' | ')} | — | — |`);
        continue;
      }

      comparisons += 1;
      const agree = runs[0].winner === runs[1].winner;
      if (agree) agreements += 1;

      const marginDelta = Math.abs((runs[0].margin || 0) - (runs[1].margin || 0));
      marginDeltas.push(marginDelta);

      lines.push([
        '',
        scenario,
        runs[0].winner || '—',
        runs[1].winner || '—',
        agree ? 'YES' : '**NO**',
        marginDelta.toFixed(2),
        '',
      ].join(' | '));
    }

    const incomplete = scenarios.length - comparisons;

    lines.push('');
    lines.push(`**Completed comparisons:** ${comparisons}/${scenarios.length}`);
    if (comparisons > 0) {
      // Keep the unrounded percentage for classification: rounding first
      // would promote e.g. 69.6% to "70" and report it as "above 70%".
      const agreementPct = (agreements / comparisons) * 100;
      const agreementRate = agreementPct.toFixed(0);
      const avgMarginDelta = (marginDeltas.reduce((a, b) => a + b, 0) / marginDeltas.length).toFixed(2);
      lines.push(`**Judge agreement rate:** ${agreements}/${comparisons} (${agreementRate}%)`);
      lines.push(`**Average margin delta:** ${avgMarginDelta}`);
      lines.push('');

      if (incomplete > 0) {
        lines.push(`**WARNING:** ${incomplete} scenario(s) did not complete with both judges. The agreement rate is based on partial coverage and should not be used for publishability decisions. Rerun failed scenarios before drawing conclusions.`);
      } else if (agreementPct >= 70) {
        lines.push('Interpretation: Agreement is above 70%. Single-judge runs are usable with a caveat that judge family affinity exists but does not dominate across scenarios.');
      } else if (agreementPct >= 50) {
        lines.push('Interpretation: Agreement is between 50-70%. Judge family affinity is a significant variable. Dual-judge runs should be the default for any publishable conclusion.');
      } else {
        lines.push('Interpretation: Agreement is below 50%. Judge family affinity dominates the verdict. The harness needs a third-provider judge or consensus mechanism before any conclusion is publishable.');
      }
    } else {
      lines.push('');
      lines.push('**No completed comparisons.** Cannot assess judge agreement. All scenario pairs must complete with both judges before any interpretation is valid.');
    }
  }

  // Error details
  const errors = results.filter((r) => r.error);
  if (errors.length > 0) {
    lines.push('');
    lines.push('## Errors');
    lines.push('');
    for (const r of errors) {
      lines.push(`- **${r.scenario}** (${r.judgeLabel}): ${r.error}`);
    }
  }

  return `${lines.join('\n')}\n`;
}
/**
 * Parse the batch runner's command-line flags.
 *
 * Value-taking flags consume the following token; when that token is missing
 * the flag falls back to its default (or pushes an empty string for the
 * repeatable `--scenario` / `--judge-agent`). Positional tokens that do not
 * start with `--` are ignored.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Parsed options (scenarios, scenarioDir, outPath, dryRun,
 *   swapSides, sideAAgent, sideBAgent, judgeAgents, the three reasoning
 *   efforts, format, help).
 * @throws {Error} On an unknown `--flag` or an unsupported `--format` value.
 */
export function parseCliArgs(argv) {
  const parsed = {
    scenarios: [],
    scenarioDir: DEFAULT_SCENARIO_DIR,
    outPath: null,
    dryRun: false,
    swapSides: false,
    sideAAgent: '',
    sideBAgent: '',
    judgeAgents: [],
    sideAReasoningEffort: DEFAULT_REASONING_EFFORT,
    sideBReasoningEffort: DEFAULT_REASONING_EFFORT,
    judgeReasoningEffort: DEFAULT_REASONING_EFFORT,
    format: 'text',
    help: false,
  };

  for (let i = 0; i < argv.length; i += 1) {
    const token = argv[i];
    switch (token) {
      case '--scenario':
        parsed.scenarios.push(argv[i + 1] || '');
        i += 1;
        break;
      case '--scenario-dir':
        parsed.scenarioDir = argv[i + 1] || DEFAULT_SCENARIO_DIR;
        i += 1;
        break;
      case '--out':
        parsed.outPath = argv[i + 1] || null;
        i += 1;
        break;
      case '--dry-run':
        parsed.dryRun = true;
        break;
      case '--swap-sides':
        parsed.swapSides = true;
        break;
      case '--side-a-agent':
        parsed.sideAAgent = argv[i + 1] || '';
        i += 1;
        break;
      case '--side-b-agent':
        parsed.sideBAgent = argv[i + 1] || '';
        i += 1;
        break;
      case '--judge-agent':
        parsed.judgeAgents.push(argv[i + 1] || '');
        i += 1;
        break;
      case '--side-a-reasoning-effort':
        parsed.sideAReasoningEffort = argv[i + 1] || DEFAULT_REASONING_EFFORT;
        i += 1;
        break;
      case '--side-b-reasoning-effort':
        parsed.sideBReasoningEffort = argv[i + 1] || DEFAULT_REASONING_EFFORT;
        i += 1;
        break;
      case '--judge-reasoning-effort':
        parsed.judgeReasoningEffort = argv[i + 1] || DEFAULT_REASONING_EFFORT;
        i += 1;
        break;
      case '--format':
        parsed.format = argv[i + 1] || 'text';
        i += 1;
        break;
      case '--help':
        parsed.help = true;
        break;
      default:
        if (token.startsWith('--')) {
          throw new Error(`Unknown flag: ${token}`);
        }
        break;
    }
  }

  // Reject a mistyped format up front: the caller only distinguishes 'json'
  // from everything else, so a typo would otherwise silently yield text
  // output after the whole batch has already run.
  if (parsed.format !== 'text' && parsed.format !== 'json') {
    throw new Error(`Unsupported format "${parsed.format}". Use text or json.`);
  }

  return parsed;
}
/**
 * Tell whether this module is the script Node was launched with.
 *
 * Compares this module's URL against the file URL of the resolved
 * `process.argv[1]`; returns false when there is no entry script argument.
 *
 * @returns {boolean} True only when the two URLs are identical.
 */
function isDirectRun() {
  const [, entryScript] = process.argv;
  if (!entryScript) {
    return false;
  }

  const entryUrl = pathToFileURL(path.resolve(entryScript));
  return entryUrl.href === import.meta.url;
}
error.message : String(error)); + process.exitCode = 1; + }); +} diff --git a/scripts/run-conflict-harness.mjs b/scripts/run-conflict-harness.mjs new file mode 100644 index 00000000..c500e164 --- /dev/null +++ b/scripts/run-conflict-harness.mjs @@ -0,0 +1,1696 @@ +#!/usr/bin/env node + +import { spawn } from 'node:child_process'; +import { mkdir, readFile, writeFile, unlink } from 'node:fs/promises'; +import { existsSync, mkdtempSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { pathToFileURL } from 'node:url'; + +import { + buildCasePacket, + loadCasePacket, + validateConflictDocument, +} from './build-case-packet.mjs'; + +const VALID_FORMATS = new Set(['text', 'json']); +const VALID_MODES = new Set(['head_to_head', 'coalition_vs_coalition', 'swap_test']); +const VALID_ACCESS_MODES = new Set(['subscription_cli']); +const SIDE_IDS = Object.freeze(['side_a', 'side_b']); +const IDENTITY_LEAK_PATTERNS = [ + /\bI am Claude\b/i, + /\bAs an OpenAI model\b/i, + /\bAs ChatGPT\b/i, + /\bAnthropic\b/i, + /\bOpenAI\b/i, +]; +const UNSEEN_OPPONENT_PATTERNS = [ + /\bSide A\b/i, + /\bSide B\b/i, + /\byour first-pass artifact\b/i, + /\bas Side A argued\b/i, + /\bas Side B argued\b/i, +]; +const DEFAULT_BUDGETS = Object.freeze({ + maxVisibleRounds: 3, + maxCostUsd: 1_000_000, + maxLatencyMs: 360_000, + minMarginForVerdict: 0.1, + judgeReservedBudgetUsd: 0, +}); +const DEFAULT_REASONING_EFFORT = 'medium'; + +export async function runConflictHarness(options = {}) { + const casePacket = await resolveCasePacket(options); + const mode = normalizeMode(options.mode || 'head_to_head'); + if (mode !== 'head_to_head') { + throw new Error('Phase 1 only supports head_to_head mode.'); + } + + const runId = normalizeRunId(options.runId, casePacket.scenario_id); + const outDir = resolveRunDirectory(casePacket.scenario_id, runId, options.outDir); + const cwd = path.resolve(options.cwd || process.cwd()); + const timeoutMs = 
normalizePositiveNumber(options.timeoutMs ?? 120000, '--timeout-ms'); + const budgets = { + max_visible_rounds: normalizePositiveInteger( + options.maxVisibleRounds ?? casePacket.max_rounds ?? DEFAULT_BUDGETS.maxVisibleRounds, + '--max-visible-rounds', + ), + max_cost_usd: normalizeNonNegativeNumber( + options.maxCostUsd ?? DEFAULT_BUDGETS.maxCostUsd, + '--max-cost-usd', + ), + max_latency_ms: normalizePositiveInteger( + options.maxLatencyMs ?? DEFAULT_BUDGETS.maxLatencyMs, + '--max-latency-ms', + ), + min_margin_for_verdict: normalizeNonNegativeNumber( + options.minMarginForVerdict ?? DEFAULT_BUDGETS.minMarginForVerdict, + '--min-margin-for-verdict', + ), + }; + const judgeReservedBudgetUsd = normalizeNonNegativeNumber( + options.judgeReservedBudgetUsd ?? DEFAULT_BUDGETS.judgeReservedBudgetUsd, + '--judge-reserved-budget-usd', + ); + const turnRunner = options.turnRunner || createShellTurnRunner(); + const commandConfig = normalizeCommandConfig(options); + const reasoningEfforts = { + side_a: normalizeReasoningEffort( + options.sideAReasoningEffort ?? DEFAULT_REASONING_EFFORT, + '--side-a-reasoning-effort', + ), + side_b: normalizeReasoningEffort( + options.sideBReasoningEffort ?? DEFAULT_REASONING_EFFORT, + '--side-b-reasoning-effort', + ), + judge: normalizeReasoningEffort( + options.judgeReasoningEffort ?? 
DEFAULT_REASONING_EFFORT, + '--judge-reasoning-effort', + ), + }; + + await createTranscriptLayout(outDir); + const run = createInitialRunRecord({ + runId, + casePacket, + mode, + budgets, + sideAProvider: options.sideAProvider || 'openai', + sideAModel: options.sideAModel || 'chatgpt-pro', + sideAReasoningEffort: reasoningEfforts.side_a, + sideBProvider: options.sideBProvider || 'anthropic', + sideBModel: options.sideBModel || 'claude-max', + sideBReasoningEffort: reasoningEfforts.side_b, + judgeProvider: options.judgeProvider || 'openai', + judgeModel: options.judgeModel || 'chatgpt-pro', + judgeReasoningEffort: reasoningEfforts.judge, + judgeSelectionPolicy: options.judgeSelectionPolicy || 'single_judge_cli', + }); + + await writeJson(path.join(outDir, 'config.json'), buildConfigRecord({ + cwd, + mode, + runId, + casePacket, + budgets, + timeoutMs, + judgeReservedBudgetUsd, + commandConfig, + reasoningEfforts, + })); + await writeJson(path.join(outDir, 'case-packet.json'), casePacket); + await persistRunState(outDir, run, { + lastCompletedPhase: 'init', + nextAction: 'first_pass', + }); + + let findingSequence = 0; + + for (const sideId of SIDE_IDS) { + const response = await invokeCommittedArtifactTurn({ + phase: 'first_pass', + sideId, + run, + casePacket, + outDir, + cwd, + timeoutMs, + turnRunner, + command: commandConfig[sideId], + reasoningEffort: reasoningEfforts[sideId], + }); + run.sides[sideId].first_pass = response.packet; + run.metrics.total_estimated_cost_usd += response.estimatedCostUsd; + } + + run.status = 'first_pass_complete'; + await validateAndPersistRun(outDir, run, { + lastCompletedPhase: 'first_pass', + nextAction: 'rebuttal', + }); + + if (hasExceededVisiblePhaseBudget(run, judgeReservedBudgetUsd)) { + run.status = 'budget_exhausted'; + await validateAndPersistRun(outDir, run, { + lastCompletedPhase: 'first_pass', + nextAction: 'stopped', + }); + return { run, outDir }; + } + + for (const sideId of SIDE_IDS) { + findingSequence += 1; + 
const targetSide = sideId === 'side_a' ? 'side_b' : 'side_a'; + const response = await invokeCritiqueTurn({ + sideId, + targetSide, + run, + casePacket, + outDir, + cwd, + timeoutMs, + turnRunner, + command: commandConfig[sideId], + reasoningEffort: reasoningEfforts[sideId], + findingSequence, + }); + run.sides[sideId].rebuttal = response.packet; + run.metrics.total_estimated_cost_usd += response.estimatedCostUsd; + } + + run.status = 'rebuttal_complete'; + await validateAndPersistRun(outDir, run, { + lastCompletedPhase: 'rebuttal', + nextAction: 'final', + }); + + if (hasExceededVisiblePhaseBudget(run, judgeReservedBudgetUsd)) { + run.status = 'budget_exhausted'; + await validateAndPersistRun(outDir, run, { + lastCompletedPhase: 'rebuttal', + nextAction: 'stopped', + }); + return { run, outDir }; + } + + for (const sideId of SIDE_IDS) { + const targetSide = sideId === 'side_a' ? 'side_b' : 'side_a'; + const response = await invokeCommittedArtifactTurn({ + phase: 'final', + sideId, + run, + casePacket, + outDir, + cwd, + timeoutMs, + turnRunner, + command: commandConfig[sideId], + reasoningEffort: reasoningEfforts[sideId], + critiquePacket: run.sides[targetSide].rebuttal, + }); + run.sides[sideId].final = response.packet; + run.metrics.total_estimated_cost_usd += response.estimatedCostUsd; + } + + run.status = 'final_complete'; + await validateAndPersistRun(outDir, run, { + lastCompletedPhase: 'final', + nextAction: 'adjudication', + }); + + if (hasExceededVisiblePhaseBudget(run, judgeReservedBudgetUsd)) { + run.status = 'budget_exhausted'; + await validateAndPersistRun(outDir, run, { + lastCompletedPhase: 'final', + nextAction: 'stopped', + }); + return { run, outDir }; + } + + if (!hasReservedJudgeBudget(run, judgeReservedBudgetUsd)) { + run.status = 'budget_exhausted_no_verdict'; + await validateAndPersistRun(outDir, run, { + lastCompletedPhase: 'final', + nextAction: 'stopped', + }); + return { run, outDir }; + } + + const judgeResponse = await 
/**
 * Build a turn runner that executes each turn's command through a shell.
 *
 * The returned async function expands the command template, injects the
 * reasoning effort, spawns the shell with the prompt on stdin and the
 * SHIPWRIGHT_CONFLICT_* variables in the environment, and resolves with
 * `{ stdout, stderr, durationMs, exitCode, spawnError? }`. It never rejects
 * for a failing child; a missing command is the only thrown error.
 *
 * @param {object} [options]
 * @param {string} [options.shell] - Shell executable; auto-detected when omitted.
 * @returns {(turnOptions: object) => Promise<object>} The turn runner.
 */
export function createShellTurnRunner(options = {}) {
  const shellConfig = resolveShellConfig(options.shell);

  return async function runShellTurn(turnOptions) {
    if (!turnOptions.command || turnOptions.command.trim().length === 0) {
      throw new Error(`Missing command for ${turnOptions.phase} ${turnOptions.sideId || 'judge'} turn.`);
    }

    let command = expandCommandTemplate(turnOptions.command, turnOptions);
    command = injectReasoningEffort(command, turnOptions.reasoningEffort);

    // For codex exec commands, inject --output-last-message to get file-based
    // output as a fallback when stdout pipes break (EPIPE).
    let outputDir = null;
    let outputFilePath = null;
    if (/\bcodex\s+exec\b/.test(command)) {
      outputDir = mkdtempSync(path.join(tmpdir(), 'shipwright-codex-'));
      outputFilePath = path.join(outputDir, 'output.txt');
      // Escape embedded single quotes for POSIX shells, and use a replacer
      // function so `$` sequences in the path are not treated as patterns.
      const quotedPath = `'${outputFilePath.replace(/'/g, "'\\''")}'`;
      command = command.replace(
        /\bcodex\s+exec\b/,
        () => `codex exec --output-last-message ${quotedPath}`,
      );
    }

    try {
      const startedAt = Date.now();

      const result = await new Promise((resolve) => {
        const child = spawn(shellConfig.command, shellConfig.args(command), {
          cwd: turnOptions.cwd,
          env: {
            ...process.env,
            SHIPWRIGHT_CONFLICT_RUN_ID: turnOptions.runId,
            SHIPWRIGHT_CONFLICT_SIDE_ID: turnOptions.sideId || '',
            SHIPWRIGHT_CONFLICT_PHASE: turnOptions.phase,
            SHIPWRIGHT_CONFLICT_PROMPT_FILE: turnOptions.promptFilePath,
            SHIPWRIGHT_CONFLICT_PACKET_FILE: turnOptions.packetFilePath,
            SHIPWRIGHT_CONFLICT_OUT_DIR: turnOptions.outDir,
            SHIPWRIGHT_CONFLICT_REASONING_EFFORT: turnOptions.reasoningEffort || '',
          },
          stdio: ['pipe', 'pipe', 'pipe'],
        });

        let stdout = '';
        let stderr = '';
        let settled = false;
        let timeoutHandle = null;

        // Resolve exactly once; 'error' and 'close' can both fire.
        const finalize = (payload) => {
          if (settled) return;
          settled = true;
          if (timeoutHandle) clearTimeout(timeoutHandle);
          resolve({
            stdout,
            stderr,
            durationMs: Date.now() - startedAt,
            ...payload,
          });
        };

        child.stdout.setEncoding('utf8');
        child.stderr.setEncoding('utf8');
        child.stdout.on('data', (chunk) => {
          stdout += chunk;
        });
        child.stderr.on('data', (chunk) => {
          stderr += chunk;
        });
        child.on('error', (error) => {
          finalize({ exitCode: 1, spawnError: error.message });
        });
        child.on('close', (code) => {
          // A signal-terminated child reports a null code; treat as failure.
          finalize({ exitCode: typeof code === 'number' ? code : 1 });
        });

        // On timeout only kill the child; the resulting 'close' event settles
        // the promise with the timeout note already appended to stderr.
        timeoutHandle = setTimeout(() => {
          stderr += `${stderr ? '\n' : ''}Timed out after ${turnOptions.timeoutMs}ms.`;
          child.kill('SIGKILL');
        }, turnOptions.timeoutMs);

        // Without this listener an EPIPE on stdin would be an unhandled
        // 'error' event; record it instead.
        child.stdin.on('error', (error) => {
          stderr += `${stderr ? '\n' : ''}stdin write error: ${error.message}`;
        });
        child.stdin.end(turnOptions.prompt);
      });

      // If codex wrote to the output file, prefer it over stdout (which may
      // be empty or truncated due to EPIPE).
      if (outputFilePath) {
        try {
          const fileContent = await readFile(outputFilePath, 'utf8');
          if (fileContent.trim().length > 0) {
            result.stdout = fileContent;
          }
        } catch {
          // File doesn't exist or can't be read — fall through to stdout.
        }
      }

      return result;
    } finally {
      // Remove the whole temp directory (not just the file) on every exit
      // path so repeated turns do not accumulate shipwright-codex-* dirs.
      if (outputDir) {
        const { rm } = await import('node:fs/promises');
        // Best-effort: a cleanup failure must not mask the turn result.
        await rm(outputDir, { recursive: true, force: true }).catch(() => {});
      }
    }
  };
}
outDir: null, + cwd: process.cwd(), + runId: null, + mode: 'head_to_head', + sideACommand: '', + sideBCommand: '', + judgeCommand: '', + sideAProvider: 'openai', + sideAModel: 'chatgpt-pro', + sideAReasoningEffort: DEFAULT_REASONING_EFFORT, + sideBProvider: 'anthropic', + sideBModel: 'claude-max', + sideBReasoningEffort: DEFAULT_REASONING_EFFORT, + judgeProvider: 'openai', + judgeModel: 'chatgpt-pro', + judgeReasoningEffort: DEFAULT_REASONING_EFFORT, + judgeSelectionPolicy: 'single_judge_cli', + maxCostUsd: DEFAULT_BUDGETS.maxCostUsd, + maxLatencyMs: DEFAULT_BUDGETS.maxLatencyMs, + minMarginForVerdict: DEFAULT_BUDGETS.minMarginForVerdict, + timeoutMs: 120000, + judgeReservedBudgetUsd: DEFAULT_BUDGETS.judgeReservedBudgetUsd, + format: 'text', + }; + + for (let index = 0; index < argv.length; index += 1) { + const token = argv[index]; + switch (token) { + case '--scenario': + parsed.scenario = argv[index + 1] || ''; + index += 1; + break; + case '--scenario-dir': + parsed.scenarioDir = argv[index + 1] || null; + index += 1; + break; + case '--case-packet': + parsed.casePacketPath = argv[index + 1] || ''; + index += 1; + break; + case '--out-dir': + parsed.outDir = argv[index + 1] || null; + index += 1; + break; + case '--cwd': + parsed.cwd = argv[index + 1] || process.cwd(); + index += 1; + break; + case '--run-id': + parsed.runId = argv[index + 1] || null; + index += 1; + break; + case '--mode': + parsed.mode = argv[index + 1] || 'head_to_head'; + index += 1; + break; + case '--side-a-command': + parsed.sideACommand = argv[index + 1] || ''; + index += 1; + break; + case '--side-b-command': + parsed.sideBCommand = argv[index + 1] || ''; + index += 1; + break; + case '--judge-command': + parsed.judgeCommand = argv[index + 1] || ''; + index += 1; + break; + case '--side-a-provider': + parsed.sideAProvider = argv[index + 1] || 'openai'; + index += 1; + break; + case '--side-a-model': + parsed.sideAModel = argv[index + 1] || 'chatgpt-pro'; + index += 1; + break; + case 
'--side-a-reasoning-effort': + parsed.sideAReasoningEffort = argv[index + 1] || DEFAULT_REASONING_EFFORT; + index += 1; + break; + case '--side-b-provider': + parsed.sideBProvider = argv[index + 1] || 'anthropic'; + index += 1; + break; + case '--side-b-model': + parsed.sideBModel = argv[index + 1] || 'claude-max'; + index += 1; + break; + case '--side-b-reasoning-effort': + parsed.sideBReasoningEffort = argv[index + 1] || DEFAULT_REASONING_EFFORT; + index += 1; + break; + case '--judge-provider': + parsed.judgeProvider = argv[index + 1] || 'openai'; + index += 1; + break; + case '--judge-model': + parsed.judgeModel = argv[index + 1] || 'chatgpt-pro'; + index += 1; + break; + case '--judge-reasoning-effort': + parsed.judgeReasoningEffort = argv[index + 1] || DEFAULT_REASONING_EFFORT; + index += 1; + break; + case '--judge-selection-policy': + parsed.judgeSelectionPolicy = argv[index + 1] || 'single_judge_cli'; + index += 1; + break; + case '--max-cost-usd': + parsed.maxCostUsd = Number(argv[index + 1] || ''); + index += 1; + break; + case '--max-latency-ms': + parsed.maxLatencyMs = Number(argv[index + 1] || ''); + index += 1; + break; + case '--min-margin-for-verdict': + parsed.minMarginForVerdict = Number(argv[index + 1] || ''); + index += 1; + break; + case '--timeout-ms': + parsed.timeoutMs = Number(argv[index + 1] || ''); + index += 1; + break; + case '--judge-reserved-budget-usd': + parsed.judgeReservedBudgetUsd = Number(argv[index + 1] || ''); + index += 1; + break; + case '--format': + parsed.format = argv[index + 1] || 'text'; + index += 1; + break; + case '--help': + parsed.help = true; + break; + default: + if (token.startsWith('--')) { + throw new Error(`Unknown flag: ${token}`); + } + break; + } + } + + if (!VALID_FORMATS.has(parsed.format)) { + throw new Error(`Unsupported format "${parsed.format}". 
/**
 * CLI entry point for a single conflict-harness run.
 *
 * Parses the flags, prints usage for `--help`, otherwise runs the harness and
 * prints either the run record as JSON or the formatted run summary.
 *
 * @param {string[]} [argv] - Arguments after the script name.
 * @returns {Promise<void>}
 */
export async function main(argv = process.argv.slice(2)) {
  const cli = parseCliArgs(argv);

  if (cli.help) {
    console.log(
      'Usage: node scripts/run-conflict-harness.mjs (--scenario scenario-id | --case-packet path) --side-a-command "" --side-b-command "" --judge-command "" [--out-dir dir] [--format text|json]',
    );
    return;
  }

  // Empty/null path-like flags are forwarded as undefined so the harness
  // applies its own defaults.
  const harnessOptions = {
    scenario: cli.scenario,
    scenarioDir: cli.scenarioDir || undefined,
    casePacketPath: cli.casePacketPath || undefined,
    outDir: cli.outDir || undefined,
  };

  // Everything else is forwarded verbatim.
  const forwardedKeys = [
    'cwd',
    'runId',
    'mode',
    'sideACommand',
    'sideBCommand',
    'judgeCommand',
    'sideAProvider',
    'sideAModel',
    'sideAReasoningEffort',
    'sideBProvider',
    'sideBModel',
    'sideBReasoningEffort',
    'judgeProvider',
    'judgeModel',
    'judgeReasoningEffort',
    'judgeSelectionPolicy',
    'maxCostUsd',
    'maxLatencyMs',
    'minMarginForVerdict',
    'timeoutMs',
    'judgeReservedBudgetUsd',
  ];
  for (const key of forwardedKeys) {
    harnessOptions[key] = cli[key];
  }

  const { run } = await runConflictHarness(harnessOptions);

  if (cli.format !== 'json') {
    process.stdout.write(formatRunSummary(run));
    return;
  }

  console.log(JSON.stringify(run, null, 2));
}
/**
 * Build the blank run record for a new harness run.
 *
 * Both sides and the judge start with no packets or verdict, every result
 * and metric is null except the running cost (0), and the audit lists are
 * empty.
 *
 * @param {object} options - runId, casePacket (scenario_id is read), mode,
 *   budgets, and provider/model/reasoning effort for side A, side B and the
 *   judge, plus judgeSelectionPolicy.
 * @returns {object} The run record with status 'initialized'.
 */
function createInitialRunRecord(options) {
  const accessMode = 'subscription_cli';

  // Side A and side B share one shape; only identity fields differ.
  const blankSide = (role, provider, model, reasoningEffort) => ({
    provider,
    model,
    reasoning_effort: reasoningEffort,
    access_mode: accessMode,
    role,
    first_pass: null,
    rebuttal: null,
    final: null,
  });

  const sides = {
    side_a: blankSide(
      'side_a',
      options.sideAProvider,
      options.sideAModel,
      options.sideAReasoningEffort,
    ),
    side_b: blankSide(
      'side_b',
      options.sideBProvider,
      options.sideBModel,
      options.sideBReasoningEffort,
    ),
  };

  const judge = {
    provider: options.judgeProvider,
    model: options.judgeModel,
    reasoning_effort: options.judgeReasoningEffort,
    access_mode: accessMode,
    blind_labels: true,
    family_blind: false,
    selection_policy: options.judgeSelectionPolicy,
    verdict: null,
  };

  const results = {
    winner: null,
    margin: null,
    judge_confidence: null,
    swap_stable: null,
    needs_human_review: null,
  };

  const metrics = {
    disagreement_rate: null,
    declared_adoption_rate: null,
    substantive_revision_rate: null,
    unsupported_claim_count: null,
    self_contradiction_count: null,
    total_estimated_cost_usd: 0,
    swap_margin_delta: null,
    judge_margin: null,
    cost_per_resolved_run: null,
  };

  return {
    run_id: options.runId,
    scenario_id: options.casePacket.scenario_id,
    mode: options.mode,
    status: 'initialized',
    sides,
    judge,
    budgets: options.budgets,
    results,
    metrics,
    audit: {
      identity_leak_warnings: [],
      protocol_violations: [],
    },
  };
}
/**
 * Run a first-pass or final artifact turn for one side, with one repair retry.
 *
 * The first attempt is retried once when the packet contains identity leaks
 * or (first pass only) references to the unseen opponent. If the retry still
 * leaks identity, a warning is recorded on `run.audit` and the packet is
 * accepted. If it still references the opponent, the run is marked
 * `protocol_violation`, persisted, and an error is thrown.
 *
 * @param {object} options - phase ('first_pass' | 'final'), sideId, run,
 *   casePacket, outDir, cwd, timeoutMs, turnRunner, command, reasoningEffort,
 *   and critiquePacket (used for the final phase).
 * @returns {Promise<{packet: object, estimatedCostUsd: number, rawOutputPath: string}>}
 *   The accepted packet; `estimatedCostUsd` includes the cost of a discarded
 *   first attempt when a retry happened.
 * @throws {Error} On a persistent unseen-opponent protocol violation, or when
 *   packet assertion fails.
 */
async function invokeCommittedArtifactTurn(options) {
  const maxAttempts = 2;
  const isFirstPass = options.phase === 'first_pass';
  const turnLabel = isFirstPass ? 'first-pass' : 'final';
  const directoryName = options.sideId === 'side_a' ? 'side-a' : 'side-b';
  const sideDir = path.join(options.outDir, directoryName);
  const basePrompt = isFirstPass
    ? buildFirstPassPrompt(options.casePacket, options.run.run_id, options.sideId)
    : buildFinalPrompt(
      options.casePacket,
      options.run.run_id,
      options.sideId,
      options.run.sides[options.sideId].first_pass,
      options.critiquePacket,
    );

  const inputPacket = isFirstPass
    ? options.casePacket
    : {
      case_packet: options.casePacket,
      first_pass: options.run.sides[options.sideId].first_pass,
      critique_packet: options.critiquePacket,
    };

  let repairIssueType = null;
  // Cost of attempts that were thrown away in favour of a repair retry.
  let discardedCostUsd = 0;

  for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
    const response = await invokeTurn({
      phase: options.phase,
      sideId: options.sideId,
      runId: options.run.run_id,
      prompt: buildRepairPrompt(basePrompt, {
        issueType: repairIssueType,
        sideId: options.sideId,
      }, attempt),
      packet: inputPacket,
      cwd: options.cwd,
      timeoutMs: options.timeoutMs,
      turnRunner: options.turnRunner,
      command: options.command,
      reasoningEffort: options.reasoningEffort,
      outDir: options.outDir,
      promptFilePath: path.join(sideDir, `${turnLabel}.prompt.txt`),
      packetFilePath: path.join(sideDir, `${turnLabel}.input.json`),
      rawOutputPath: path.join(sideDir, `${turnLabel}.raw.txt`),
      attempt,
    });
    const packet = response.packet;
    assertCommittedArtifactPacket(packet, options.run.run_id, options.sideId, options.phase);

    const identityLeaks = findIdentityLeaks(packet);
    const unseenOpponent = isFirstPass ? findUnseenOpponentReferences(packet) : [];
    const hasIssue = identityLeaks.length > 0 || unseenOpponent.length > 0;

    if (hasIssue && attempt + 1 < maxAttempts) {
      // Identity leaks take priority when both issues are present.
      repairIssueType = identityLeaks.length > 0 ? 'identity_leak' : 'unseen_opponent';
      // The discarded attempt was still paid for; keep it in the budget.
      discardedCostUsd += Number(response.estimatedCostUsd) || 0;
      continue;
    }

    if (identityLeaks.length > 0) {
      options.run.audit.identity_leak_warnings.push(
        `${options.sideId}:${options.phase}:${identityLeaks.join('; ')}`,
      );
    }

    if (unseenOpponent.length > 0) {
      options.run.audit.protocol_violations.push(
        `${options.sideId}:${options.phase}:${unseenOpponent.join('; ')}`,
      );
      options.run.status = 'protocol_violation';
      await validateAndPersistRun(options.outDir, options.run, {
        lastCompletedPhase: options.phase,
        nextAction: 'stopped',
      });
      throw new Error(`Protocol violation: ${unseenOpponent.join('; ')}`);
    }

    await writeText(path.join(sideDir, `${turnLabel}.md`), packet.artifact_markdown);
    await writeJson(path.join(sideDir, `${turnLabel}.json`), packet);
    return {
      packet,
      // Leave the value untouched when nothing was discarded.
      estimatedCostUsd: discardedCostUsd > 0
        ? discardedCostUsd + response.estimatedCostUsd
        : response.estimatedCostUsd,
      rawOutputPath: response.rawOutputPath,
    };
  }

  // Invariant guard: the last attempt always returns or throws above.
  throw new Error(`Unable to complete ${options.phase} for ${options.sideId}.`);
}
function invokeCritiqueTurn(options) { + const directoryName = options.sideId === 'side_a' ? 'side-a' : 'side-b'; + const sideDir = path.join(options.outDir, directoryName); + const prompt = buildCritiquePrompt( + options.casePacket, + options.run.run_id, + options.sideId, + options.run.sides[options.targetSide].first_pass, + ); + const inputPacket = { + case_packet: options.casePacket, + opposing_first_pass: options.run.sides[options.targetSide].first_pass, + }; + + const response = await invokeTurn({ + phase: 'rebuttal', + sideId: options.sideId, + runId: options.run.run_id, + prompt, + packet: inputPacket, + cwd: options.cwd, + timeoutMs: options.timeoutMs, + turnRunner: options.turnRunner, + command: options.command, + reasoningEffort: options.reasoningEffort, + outDir: options.outDir, + promptFilePath: path.join(sideDir, 'rebuttal.prompt.txt'), + packetFilePath: path.join(sideDir, 'rebuttal.input.json'), + rawOutputPath: path.join(sideDir, 'rebuttal.raw.txt'), + attempt: 0, + }); + const packet = { + ...response.packet, + finding_id: `finding-${options.findingSequence}`, + }; + assertCritiquePacket(packet, options.targetSide); + await writeText(path.join(sideDir, 'rebuttal.md'), formatCritiqueMarkdown(packet)); + await writeJson(path.join(sideDir, 'rebuttal.json'), packet); + return { + packet, + estimatedCostUsd: response.estimatedCostUsd, + }; +} + +async function invokeJudgeTurn(options) { + const judgeDir = path.join(options.outDir, 'judge'); + const judgePacket = buildJudgePacket(options.run, options.casePacket); + const prompt = buildJudgePrompt(judgePacket, options.run.budgets.min_margin_for_verdict); + const response = await invokeTurn({ + phase: 'judge', + sideId: null, + runId: options.run.run_id, + prompt, + packet: judgePacket, + cwd: options.cwd, + timeoutMs: options.timeoutMs, + turnRunner: options.turnRunner, + command: options.command, + reasoningEffort: options.reasoningEffort, + outDir: options.outDir, + promptFilePath: path.join(judgeDir, 
/**
 * Executes a single turn through the configured turn runner.
 *
 * Writes the prompt and input packet to disk, calls `options.turnRunner`,
 * stores the raw output at `options.rawOutputPath`, and returns the resulting
 * packet. A runner may return a ready-made `packet`, or `stdout` text that is
 * parsed with `parseJsonResponse`.
 *
 * @param {object} options - Turn description plus file paths and runner settings.
 * @returns {Promise<{packet: *, estimatedCostUsd: number, rawOutputPath: string}>}
 * @throws {Error} When the runner returns neither a packet nor string stdout,
 *   or when a stdout-only response reports a non-zero numeric exit code.
 */
async function invokeTurn(options) {
  const { promptFilePath, packetFilePath, rawOutputPath } = options;

  await writeText(promptFilePath, options.prompt);
  await writeJson(packetFilePath, options.packet);

  // rawOutputPath is intentionally not forwarded; only this function writes it.
  const turnRequest = {
    phase: options.phase,
    sideId: options.sideId,
    runId: options.runId,
    prompt: options.prompt,
    packet: options.packet,
    cwd: options.cwd,
    timeoutMs: options.timeoutMs,
    command: options.command,
    reasoningEffort: options.reasoningEffort,
    outDir: options.outDir,
    promptFilePath,
    packetFilePath,
    attempt: options.attempt,
  };
  const response = await options.turnRunner(turnRequest);

  const toResult = (packet, usage) => ({
    packet,
    estimatedCostUsd: normalizeEstimatedCost(usage?.estimatedCostUsd),
    rawOutputPath,
  });
  const hasStdout = typeof response?.stdout === 'string';

  // A runner-supplied packet wins; stdout (if any) is only kept as the raw record.
  if (response?.packet) {
    const rawText = hasStdout ? response.stdout : JSON.stringify(response.packet, null, 2);
    await writeText(rawOutputPath, rawText);
    return toResult(response.packet, response.usage);
  }

  if (!hasStdout) {
    throw new Error(`Turn runner for ${options.phase} did not return a packet or stdout.`);
  }

  // Persist the raw output before any failure so it is available for inspection.
  await writeText(rawOutputPath, response.stdout);

  const { exitCode } = response;
  if (typeof exitCode === 'number' && exitCode !== 0) {
    throw new Error(
      `${options.phase} ${options.sideId || 'judge'} turn failed with exit code ${response.exitCode}: ${response.stderr || ''}`.trim(),
    );
  }

  const parsed = parseJsonResponse(response.stdout);
  return toResult(parsed.packet, parsed.usage);
}
/**
 * Picks the most likely JSON object text out of raw model output.
 *
 * Strategy, in order:
 *  1. Output that starts with `{` is treated as JSON; anything after the last
 *     `}` (trailing commentary) is dropped.
 *  2. Otherwise the first fenced code block (``` or ```json) is used.
 *  3. Otherwise the span from the first `{` to the last `}` is used.
 *  4. Otherwise the trimmed output is returned unchanged.
 *
 * No parsing happens here; the caller is expected to `JSON.parse` the result.
 *
 * @param {string} output - Raw text produced by a turn.
 * @returns {string} Candidate JSON text.
 */
function extractJsonCandidate(output) {
  const trimmed = output.trim();
  const lastBrace = trimmed.lastIndexOf('}');

  if (trimmed.startsWith('{')) {
    // Previously the whole text was returned, so `{...}` followed by prose
    // failed to parse even though the object itself was intact.
    return lastBrace > 0 ? trimmed.slice(0, lastBrace + 1) : trimmed;
  }

  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/i);
  if (fenced) {
    return fenced[1].trim();
  }

  const firstBrace = trimmed.indexOf('{');
  if (firstBrace !== -1 && lastBrace > firstBrace) {
    return trimmed.slice(firstBrace, lastBrace + 1);
  }

  return trimmed;
}
/**
 * Builds the prompt text for a side's sealed first-pass turn.
 *
 * The prompt consists of fixed instructions, a pretty-printed example of the
 * expected packet shape (pre-filled with `runId` and `sideId`), and the
 * pretty-printed case packet.
 *
 * @param {*} casePacket - Case packet to embed as JSON.
 * @param {string} runId - Run identifier placed in the example packet.
 * @param {string} sideId - Side identifier (upper-cased in the first line).
 * @returns {string} Newline-joined prompt.
 */
function buildFirstPassPrompt(casePacket, runId, sideId) {
  const pretty = (value) => JSON.stringify(value, null, 2);

  const instructions = [
    `You are ${sideId.toUpperCase()} in a sealed first-pass conflict harness run.`,
    'Do not reveal provider identity.',
    'Do not mention or speculate about the opponent.',
    'Return ONLY a JSON object with this exact shape:',
  ];

  const exampleClaim = {
    claim_id: `${sideId}-claim-1`,
    summary: 'One major claim',
    evidence_refs: ['ctx-1'],
    is_major: true,
  };
  const examplePacket = {
    run_id: runId,
    side_id: sideId,
    round: 'first_pass',
    artifact_markdown: '# Your artifact',
    claims: [exampleClaim],
    citations: ['ctx-1'],
    conclusion_confidence: 'medium',
    open_questions: [],
    critique_responses: [],
  };

  // Joined as array elements (not interpolated) so an unserializable case
  // packet renders as an empty line, exactly as Array#join treats undefined.
  const sections = [
    ...instructions,
    '',
    pretty(examplePacket),
    '',
    'Case packet:',
    pretty(casePacket),
  ];
  return sections.join('\n');
}
/**
 * Appends a repair instruction to a prompt when a turn is being retried.
 *
 * On the first attempt (`attempt === 0`) the base prompt is returned as-is.
 * On later attempts an instruction chosen by `issue.issueType` is appended;
 * a missing `issue` or an unrecognized type falls back to a generic
 * "keep the exact packet shape" instruction.
 *
 * @param {string} basePrompt - Prompt used for the initial attempt.
 * @param {{issueType?: string}|null|undefined} issue - What the retry must fix.
 * @param {number} attempt - Zero-based attempt counter.
 * @returns {string} Prompt to send for this attempt.
 */
function buildRepairPrompt(basePrompt, issue, attempt) {
  if (attempt === 0) return basePrompt;

  const messages = {
    identity_leak:
      'Repair instruction: remove any provider self-identification or family naming from the JSON output. Keep the same packet shape.',
    unseen_opponent:
      'Repair instruction: remove any mention of unseen opponent content or speculation about the other side. Keep the same packet shape.',
  };

  // `issue?.` tolerates a retry without an issue object; the string check keeps
  // inherited members (e.g. "toString", "constructor") from leaking into the prompt.
  const candidate = messages[issue?.issueType];
  const instruction = typeof candidate === 'string'
    ? candidate
    : 'Repair instruction: keep the exact packet shape.';

  return `${basePrompt}\n\n${instruction}`;
}
blinded conflict harness run.', + 'Provider identity is intentionally hidden. Do not infer provider family.', + 'Return ONLY a JSON object with this exact shape:', + '', + JSON.stringify( + { + winner: 'tie', + margin: 0, + rubric_scores: { + side_a: { + claim_quality: 3, + evidence_discipline: 3, + responsiveness_to_critique: 3, + internal_consistency: 3, + decision_usefulness: 3, + weighted_total: 3.0, + }, + side_b: { + claim_quality: 3, + evidence_discipline: 3, + responsiveness_to_critique: 3, + internal_consistency: 3, + decision_usefulness: 3, + weighted_total: 3.0, + }, + }, + dimension_rationales: { + claim_quality: 'Brief explanation of which side made the stronger claims and why.', + evidence_discipline: 'Brief explanation of how the sides used or overstated evidence.', + responsiveness_to_critique: 'Brief explanation of how each side handled the critique phase.', + internal_consistency: 'Brief explanation of contradictions, coherence, or missing logic.', + decision_usefulness: 'Brief explanation of which artifact better supports an actual decision.', + }, + side_summaries: { + side_a: { + strengths: ['One concise strength of Side A.'], + weaknesses: ['One concise weakness of Side A.'], + }, + side_b: { + strengths: ['One concise strength of Side B.'], + weaknesses: ['One concise weakness of Side B.'], + }, + }, + decisive_dimension: 'decision_usefulness', + decisive_findings: ['Replace with the actual decisive findings from this run.'], + judge_confidence: 'low', + needs_human_review: true, + rationale: 'One-paragraph explanation of the verdict.', + }, + null, + 2, + ), + '', + `Use min_margin_for_verdict = ${minMarginForVerdict}.`, + 'Set rubric_scores.*.weighted_total as a normalized aggregate on the same 1-5 scale as the rubric dimensions, not as a raw sum.', + 'Treat each side\'s reported conclusion_confidence as a calibration signal, not a vote multiplier.', + 'Reward confidence when it is well matched to the evidence and penalize overconfidence 
/**
 * Copies the judge verdict onto `run.results` (mutates `run`).
 *
 * `needs_human_review` is true when any of these holds: the judge asked for
 * review, the verdict margin is below `run.budgets.min_margin_for_verdict`,
 * or both sides have at least one unsupported claim. `swap_stable` is set to
 * `null` here.
 *
 * @param {object} run - Run record to update.
 * @param {object} verdict - Judge verdict packet.
 * @returns {void}
 */
function applyVerdictToRun(run, verdict) {
  const { side_a: unsupportedA, side_b: unsupportedB } = computeUnsupportedClaimCounts(run);

  const reviewTriggers = [
    Boolean(verdict.needs_human_review),
    verdict.margin < run.budgets.min_margin_for_verdict,
    unsupportedA > 0 && unsupportedB > 0,
  ];

  Object.assign(run.results, {
    winner: verdict.winner,
    margin: verdict.margin,
    judge_confidence: verdict.judge_confidence,
    swap_stable: null,
    needs_human_review: reviewTriggers.some(Boolean),
  });
}
/**
 * Fraction of critique responses, across all sides' final packets, whose
 * `disposition` is `'adopted'`.
 *
 * @param {object} run - Run record; reads `run.sides[sideId].final.critique_responses`.
 * @returns {number} Rate rounded to 4 decimals, or 0 when there are no responses.
 */
function computeDeclaredAdoptionRate(run) {
  const responses = [];
  for (const sideId of SIDE_IDS) {
    const declared = run.sides[sideId].final?.critique_responses || [];
    if (Array.isArray(declared)) {
      responses.push(...declared);
    } else {
      // Mirrors flatMap, which keeps a non-array callback result as one element.
      responses.push(declared);
    }
  }

  const total = responses.length;
  if (total === 0) return 0;

  let adoptedCount = 0;
  for (const entry of responses) {
    if (entry.disposition === 'adopted') adoptedCount += 1;
  }
  return roundNumber(adoptedCount / total, 4);
}
/**
 * Reports whether a critique response is backed by an actual change to at
 * least one claim the critique targeted.
 *
 * A targeted claim counts as revised when it existed in the first pass and,
 * in the final packet, it is gone, its normalized summary differs, its
 * normalized evidence refs differ, or its `is_major` flag flipped. Target ids
 * that never existed in the first pass are ignored.
 *
 * @param {object} firstPassPacket - The side's first-pass packet.
 * @param {object} finalPacket - The side's final packet.
 * @param {object} critiquePacket - Critique aimed at this side.
 * @param {object} response - One entry of the final packet's `critique_responses`.
 * @returns {boolean} False when any packet is missing, when the response's
 *   `finding_id` does not match the critique, or when nothing changed.
 */
function didSubstantivelyReviseTargetedClaims(firstPassPacket, finalPacket, critiquePacket, response) {
  const packetsPresent = Boolean(firstPassPacket && finalPacket && critiquePacket);
  if (!packetsPresent || response.finding_id !== critiquePacket.finding_id) return false;

  const targetedIds = critiquePacket.target_claim_ids || [];
  if (targetedIds.length === 0) return false;

  const indexClaims = (packet) =>
    new Map((packet.claims || []).map((claim) => [claim.claim_id, claim]));
  const claimsBefore = indexClaims(firstPassPacket);
  const claimsAfter = indexClaims(finalPacket);

  const evidenceKey = (claim) => normalizeStringArray(claim.evidence_refs).join('|');
  const claimChanged = (original, revised) =>
    normalizeComparableText(original.summary) !== normalizeComparableText(revised.summary)
    || evidenceKey(original) !== evidenceKey(revised)
    || Boolean(original.is_major) !== Boolean(revised.is_major);

  for (const claimId of targetedIds) {
    const original = claimsBefore.get(claimId);
    if (!original) continue; // the critique pointed at a claim that never existed

    const revised = claimsAfter.get(claimId);
    // Dropping a targeted claim counts as a revision.
    if (!revised || claimChanged(original, revised)) return true;
  }

  return false;
}
/**
 * Lists the identity-leak patterns that match anywhere in a packet.
 *
 * The packet is serialized with `JSON.stringify` and each entry of
 * `IDENTITY_LEAK_PATTERNS` is tested against that text.
 *
 * @param {*} packet - Packet to scan.
 * @returns {string[]} `source` of every matching pattern, in pattern order.
 */
function findIdentityLeaks(packet) {
  const serialized = JSON.stringify(packet);

  // RegExp#test is stateful for global/sticky patterns: it resumes from
  // `lastIndex`, so a shared /g pattern that matched on one call can miss the
  // same leak on the next. Reset around each test so every scan starts at 0.
  const matches = (pattern) => {
    const stateful = pattern.global || pattern.sticky;
    if (stateful) pattern.lastIndex = 0;
    const found = pattern.test(serialized);
    if (stateful) pattern.lastIndex = 0;
    return found;
  };

  return IDENTITY_LEAK_PATTERNS
    .filter(matches)
    .map((pattern) => pattern.source);
}
/**
 * Returns the run id to use for a run.
 *
 * A non-blank string `value` is returned trimmed. Otherwise an id of the form
 * `conflict-<YYYY-MM-DD>-<HHMMSSmmm>Z-<scenarioId>` is generated from `now`
 * (UTC, via `Date#toISOString`).
 *
 * @param {*} value - Caller-supplied run id, if any.
 * @param {string} scenarioId - Scenario id appended to generated ids.
 * @param {Date} [now=new Date()] - Clock used for generated ids; injectable so
 *   callers and tests can get a deterministic id.
 * @returns {string} Run id.
 */
function normalizeRunId(value, scenarioId, now = new Date()) {
  if (typeof value === 'string') {
    const trimmed = value.trim();
    if (trimmed.length > 0) return trimmed;
  }

  // Strip ':' and '.' and swap the date/time separator 'T' for '-'.
  // The trailing 'Z' is kept as-is; the former `.replace('Z', 'Z')` was a no-op.
  const timestamp = now.toISOString().replace(/[:.]/g, '').replace('T', '-');
  return `conflict-${timestamp}-${scenarioId}`;
}
/**
 * Substitutes `{{placeholder}}` tokens in a command template with
 * shell-escaped values from the turn options.
 *
 * Supported placeholders: `{{prompt_file}}`, `{{packet_file}}`, `{{run_id}}`,
 * `{{side_id}}`, `{{phase}}`, `{{out_dir}}`, `{{reasoning_effort}}`. Unknown
 * placeholders are left untouched.
 *
 * @param {string} command - Command template.
 * @param {object} turnOptions - Source of the substituted values.
 * @returns {string} Command with every supported placeholder replaced.
 */
function expandCommandTemplate(command, turnOptions) {
  const replacements = new Map([
    ['{{prompt_file}}', shellEscape(turnOptions.promptFilePath)],
    ['{{packet_file}}', shellEscape(turnOptions.packetFilePath)],
    ['{{run_id}}', shellEscape(turnOptions.runId)],
    ['{{side_id}}', shellEscape(turnOptions.sideId || '')],
    ['{{phase}}', shellEscape(turnOptions.phase)],
    ['{{out_dir}}', shellEscape(turnOptions.outDir)],
    ['{{reasoning_effort}}', shellEscape(turnOptions.reasoningEffort || '')],
  ]);

  // Single pass over the template: text inserted by one substitution is never
  // rescanned. Replacing placeholders one after another would expand a
  // placeholder-looking fragment inside an already-quoted value (e.g. a run id
  // or path containing "{{phase}}") and break that value's shell quoting.
  return command.replace(/\{\{[a-z_]+\}\}/g, (placeholder) =>
    (replacements.has(placeholder) ? replacements.get(placeholder) : placeholder));
}
/**
 * Maps a reasoning-effort value to the suffix used for the Gemini model alias.
 *
 * Only `'high'` (case-insensitive, surrounding whitespace ignored) maps to
 * `'high'`; every other input — including `'low'`, `'medium'`, unknown
 * strings, and non-strings — maps to `'medium'`.
 *
 * NOTE(review): `'low'` resolves to `'medium'` here even though a
 * `shipwright-gemini-low` alias is configured in `.gemini/settings.json` —
 * confirm this is intentional.
 *
 * @param {*} reasoningEffort - Requested effort.
 * @returns {'medium'|'high'} Alias suffix.
 */
function normalizeGeminiReasoningEffort(reasoningEffort) {
  if (typeof reasoningEffort !== 'string') return 'medium';
  return reasoningEffort.trim().toLowerCase() === 'high' ? 'high' : 'medium';
}
/**
 * Renders a critique packet as a small Markdown document: a title with the
 * finding id, a bullet list of target/attack metadata, and two sections for
 * the attacked claim and the supporting reason.
 *
 * @param {object} packet - Critique packet.
 * @returns {string} Markdown ending with a newline.
 */
function formatCritiqueMarkdown(packet) {
  const targetIds = (packet.target_claim_ids || []).join(', ');

  const title = [`# Critique: ${packet.finding_id}`, ''];
  const metadata = [
    `- Target Side: ${packet.target_side}`,
    `- Target Claim IDs: ${targetIds}`,
    `- Attack Type: ${packet.attack_type}`,
    `- Severity: ${packet.severity}`,
    '',
  ];
  // Section bodies stay raw array elements so a missing value renders as an
  // empty line (Array#join), not the text "undefined".
  const claimSection = [`## Claim Under Attack`, packet.claim_under_attack, ''];
  const reasonSection = [`## Evidence Or Reason`, packet.evidence_or_reason, ''];

  return [...title, ...metadata, ...claimSection, ...reasonSection].join('\n');
}
/**
 * Rounds a number to a fixed count of decimal places by scaling with a power
 * of ten, applying `Math.round`, and scaling back.
 *
 * @param {number} value - Number to round.
 * @param {number} digits - Decimal places to keep.
 * @returns {number} Rounded value.
 */
function roundNumber(value, digits) {
  const scale = Math.pow(10, digits);
  const scaled = Math.round(value * scale);
  return scaled / scale;
}
assert.equal(packet.scenario_id, 'event-automation-boundary'); + assert.match(packet.prompt, /Phase 1 PRD/); +}); + +test('validateConflictDocument reports missing required fields', () => { + const validation = validateConflictDocument( + { + scenario_id: 'missing-fields', + prompt: 'Hello', + }, + 'case', + ); + + assert.ok(validation.errors.some((error) => error.path === '$.artifact_type')); + assert.ok(validation.errors.some((error) => error.path === '$.rubric')); +}); + +test('parseCliArgs validates supported flags', () => { + assert.deepEqual( + parseCliArgs([ + '--scenario', + 'prd-hidden-scope-creep', + '--out', + '/tmp/case.json', + '--max-rounds', + '4', + '--tool-policy', + 'none', + ]), + { + scenario: 'prd-hidden-scope-creep', + scenarioDir: path.resolve('benchmarks', 'scenarios'), + outPath: '/tmp/case.json', + maxRounds: 4, + toolPolicy: 'none', + format: 'json', + }, + ); +}); diff --git a/tests/rejudge-conflict-batch.test.mjs b/tests/rejudge-conflict-batch.test.mjs new file mode 100644 index 00000000..43a033ef --- /dev/null +++ b/tests/rejudge-conflict-batch.test.mjs @@ -0,0 +1,142 @@ +import assert from 'node:assert/strict'; +import { mkdtemp, mkdir, writeFile } from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; +import test from 'node:test'; + +import { + buildBatchSummary, + discoverRunDirs, + parseCliArgs, + runRejudgeBatch, +} from '../scripts/rejudge-conflict-batch.mjs'; + +test('parseCliArgs handles batch rejudge options', () => { + const args = parseCliArgs([ + '--root-dir', '/tmp/results', + '--scenario', 'board-update-ambiguity', + '--run-dir', '/tmp/results/foo/conflict-1', + '--judge-agent', 'gemini', + '--label', 'gemini-pilot', + '--out', '/tmp/summary.md', + '--format', 'json', + ]); + + assert.equal(args.rootDir, '/tmp/results'); + assert.deepEqual(args.scenarios, ['board-update-ambiguity']); + assert.deepEqual(args.runDirs, ['/tmp/results/foo/conflict-1']); + assert.equal(args.judgeAgent, 'gemini'); + 
assert.equal(args.label, 'gemini-pilot'); + assert.equal(args.outPath, '/tmp/summary.md'); + assert.equal(args.format, 'json'); +}); + +test('discoverRunDirs finds completed run directories and filters scenarios', async () => { + const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-rejudge-batch-')); + const scenarioA = path.join(rootDir, 'board-update-ambiguity'); + const scenarioB = path.join(rootDir, 'handoff-contradiction'); + const runA = path.join(scenarioA, 'conflict-a'); + const runB = path.join(scenarioB, 'conflict-b'); + await mkdir(runA, { recursive: true }); + await mkdir(runB, { recursive: true }); + await mkdir(path.join(scenarioA, 'not-a-run'), { recursive: true }); + + const all = await discoverRunDirs(rootDir); + const filtered = await discoverRunDirs(rootDir, ['handoff-contradiction']); + + assert.deepEqual(all, [runA, runB]); + assert.deepEqual(filtered, [runB]); +}); + +test('runRejudgeBatch reuses single-run replay across multiple completed runs', async () => { + const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-rejudge-batch-')); + const runDir = path.join(rootDir, 'board-update-ambiguity', 'conflict-a'); + const judgeDir = path.join(runDir, 'judge'); + await mkdir(judgeDir, { recursive: true }); + await Promise.all([ + writeFile(path.join(runDir, 'run.json'), JSON.stringify({ run_id: 'conflict-a', scenario_id: 'board-update-ambiguity' })), + writeFile(path.join(judgeDir, 'verdict.prompt.txt'), 'prompt'), + writeFile(path.join(judgeDir, 'verdict.input.json'), JSON.stringify({ case_packet: {}, first_pass_artifacts: {}, critique_packets: {}, final_artifacts: {}, min_margin_for_verdict: 0.1 })), + ]); + + const results = await runRejudgeBatch({ + runDirs: [runDir], + judgeAgent: 'gemini', + turnRunner: async () => ({ + stdout: JSON.stringify({ + winner: 'side_b', + margin: 0.7, + rubric_scores: { + side_a: { + claim_quality: 3, + evidence_discipline: 3, + responsiveness_to_critique: 3, + internal_consistency: 3, + 
decision_usefulness: 3, + weighted_total: 3, + }, + side_b: { + claim_quality: 4, + evidence_discipline: 4, + responsiveness_to_critique: 4, + internal_consistency: 4, + decision_usefulness: 4, + weighted_total: 4, + }, + }, + dimension_rationales: { + claim_quality: 'Side B stronger.', + evidence_discipline: 'Side B cleaner.', + responsiveness_to_critique: 'Side B better.', + internal_consistency: 'Side B tighter.', + decision_usefulness: 'Side B more useful.', + }, + side_summaries: { + side_a: { strengths: ['Clear framing.'], weaknesses: ['Weaker conclusion.'] }, + side_b: { strengths: ['Stronger recommendation.'], weaknesses: ['Slightly dense.'] }, + }, + decisive_dimension: 'decision_usefulness', + decisive_findings: ['Side B is more decision-useful.'], + judge_confidence: 'medium', + needs_human_review: false, + rationale: 'Side B wins.', + }), + stderr: '', + exitCode: 0, + }), + }); + + assert.equal(results.length, 1); + assert.equal(results[0].status, 'completed'); + assert.equal(results[0].winner, 'side_b'); + assert.equal(results[0].scenario, 'board-update-ambiguity'); +}); + +test('buildBatchSummary renders compact replay results', () => { + const summary = buildBatchSummary([ + { + scenario: 'board-update-ambiguity', + runId: 'conflict-a', + status: 'completed', + winner: 'side_b', + margin: 0.7, + judgeConfidence: 'medium', + needsHumanReview: false, + }, + { + scenario: 'handoff-contradiction', + runId: 'conflict-b', + status: 'error', + winner: null, + margin: null, + judgeConfidence: null, + needsHumanReview: null, + }, + ], 'gemini-judge'); + + assert.ok(summary.includes('# Rejudge Batch Summary')); + assert.ok(summary.includes('Judge: gemini-judge')); + assert.ok(summary.includes('Errors: 1')); + assert.ok(summary.includes('board-update-ambiguity')); + assert.ok(summary.includes('conflict-a')); +}); diff --git a/tests/rejudge-conflict-run.test.mjs b/tests/rejudge-conflict-run.test.mjs new file mode 100644 index 00000000..2b6523df --- /dev/null 
+++ b/tests/rejudge-conflict-run.test.mjs @@ -0,0 +1,371 @@ +import assert from 'node:assert/strict'; +import { mkdtemp, mkdir, readFile, writeFile } from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; +import test from 'node:test'; + +import { parseCliArgs, rejudgeConflictRun } from '../scripts/rejudge-conflict-run.mjs'; +import { validateConflictDocument } from '../scripts/build-case-packet.mjs'; + +test('parseCliArgs handles rejudge options', () => { + const args = parseCliArgs([ + '--run-dir', '/tmp/example-run', + '--judge-agent', 'gemini', + '--label', 'gemini-pilot', + '--judge-reasoning-effort', 'medium', + '--timeout-ms', '90000', + '--format', 'json', + ]); + + assert.equal(args.runDir, '/tmp/example-run'); + assert.equal(args.judgeAgent, 'gemini'); + assert.equal(args.label, 'gemini-pilot'); + assert.equal(args.judgeReasoningEffort, 'medium'); + assert.equal(args.timeoutMs, 90000); + assert.equal(args.format, 'json'); +}); + +test('rejudgeConflictRun reuses saved judge prompt and writes a sidecar verdict', async () => { + const tmpRoot = await mkdtemp(path.join(os.tmpdir(), 'shipwright-rejudge-')); + const runDir = path.join(tmpRoot, 'conflict-run'); + const judgeDir = path.join(runDir, 'judge'); + await mkdir(judgeDir, { recursive: true }); + + const run = { + run_id: 'conflict-test-run', + }; + const judgePacket = { + case_packet: { scenario_id: 'board-update-ambiguity' }, + first_pass_artifacts: { side_a: {}, side_b: {} }, + critique_packets: { side_a: {}, side_b: {} }, + final_artifacts: { side_a: {}, side_b: {} }, + min_margin_for_verdict: 0.1, + }; + const prompt = 'Return ONLY a JSON object.'; + const verdict = { + winner: 'side_b', + margin: 0.8, + rubric_scores: { + side_a: { + claim_quality: 3, + evidence_discipline: 3, + responsiveness_to_critique: 3, + internal_consistency: 3, + decision_usefulness: 3, + weighted_total: 3.0, + }, + side_b: { + claim_quality: 4, + evidence_discipline: 4, + 
responsiveness_to_critique: 4, + internal_consistency: 4, + decision_usefulness: 4, + weighted_total: 4.0, + }, + }, + dimension_rationales: { + claim_quality: 'Side B states the stronger core case.', + evidence_discipline: 'Side B uses the available evidence more carefully.', + responsiveness_to_critique: 'Side B better incorporates the critique.', + internal_consistency: 'Side B is more internally consistent.', + decision_usefulness: 'Side B is more helpful to an actual decision maker.', + }, + side_summaries: { + side_a: { + strengths: ['Clear framing.'], + weaknesses: ['Less complete final recommendation.'], + }, + side_b: { + strengths: ['More actionable recommendation.'], + weaknesses: ['Slightly denser prose.'], + }, + }, + decisive_dimension: 'decision_usefulness', + decisive_findings: ['Side B is more decision-useful.'], + judge_confidence: 'medium', + needs_human_review: false, + rationale: 'Side B better addressed the board decision.', + }; + + await Promise.all([ + writeFile(path.join(runDir, 'run.json'), `${JSON.stringify(run, null, 2)}\n`), + writeFile(path.join(judgeDir, 'verdict.input.json'), `${JSON.stringify(judgePacket, null, 2)}\n`), + writeFile(path.join(judgeDir, 'verdict.prompt.txt'), prompt), + ]); + + const seen = []; + const result = await rejudgeConflictRun({ + runDir, + judgeAgent: 'gemini', + label: 'gemini-pilot', + turnRunner: async (turnOptions) => { + seen.push(turnOptions); + return { + stdout: `${JSON.stringify(verdict, null, 2)}\n`, + stderr: '', + exitCode: 0, + }; + }, + }); + + assert.equal(seen.length, 1); + assert.equal(seen[0].phase, 'judge'); + assert.equal(seen[0].runId, 'conflict-test-run'); + assert.equal(seen[0].reasoningEffort, 'medium'); + assert.equal(result.verdict.winner, 'side_b'); + + const metadata = JSON.parse(await readFile(path.join(result.outputDir, 'metadata.json'), 'utf8')); + assert.equal(metadata.judge.agent, 'gemini'); + assert.equal(metadata.judge.label, 'gemini-pilot'); + 
assert.equal(metadata.replay.repair_attempted, false); + assert.equal(metadata.replay.repair_attempts, 0); + + const savedVerdict = JSON.parse(await readFile(path.join(result.outputDir, 'verdict.json'), 'utf8')); + assert.equal(savedVerdict.margin, 0.8); + assert.equal(savedVerdict.decisive_dimension, 'decision_usefulness'); +}); + +test('rejudgeConflictRun rejects unknown judge agents', async () => { + const tmpRoot = await mkdtemp(path.join(os.tmpdir(), 'shipwright-rejudge-')); + const runDir = path.join(tmpRoot, 'conflict-run'); + const judgeDir = path.join(runDir, 'judge'); + await mkdir(judgeDir, { recursive: true }); + + await Promise.all([ + writeFile(path.join(runDir, 'run.json'), '{ "run_id": "x" }\n'), + writeFile(path.join(judgeDir, 'verdict.input.json'), '{}\n'), + writeFile(path.join(judgeDir, 'verdict.prompt.txt'), 'prompt'), + ]); + + await assert.rejects( + () => rejudgeConflictRun({ runDir, judgeAgent: 'unknown-bot' }), + (error) => { + assert.ok(error.message.includes('Unknown judge agent')); + return true; + }, + ); +}); + +test('rejudgeConflictRun repairs Gemini verdicts that only miss structured fields', async () => { + const tmpRoot = await mkdtemp(path.join(os.tmpdir(), 'shipwright-rejudge-')); + const runDir = path.join(tmpRoot, 'conflict-run'); + const judgeDir = path.join(runDir, 'judge'); + await mkdir(judgeDir, { recursive: true }); + + await Promise.all([ + writeFile(path.join(runDir, 'run.json'), '{ "run_id": "repair-run" }\n'), + writeFile(path.join(judgeDir, 'verdict.input.json'), '{}\n'), + writeFile(path.join(judgeDir, 'verdict.prompt.txt'), 'prompt'), + ]); + + const incompleteVerdict = { + winner: 'side_a', + margin: 0.1, + rubric_scores: { + side_a: { + claim_quality: 4, + evidence_discipline: 4, + responsiveness_to_critique: 4, + internal_consistency: 4, + decision_usefulness: 4, + weighted_total: 4, + }, + side_b: { + claim_quality: 3, + evidence_discipline: 3, + responsiveness_to_critique: 3, + internal_consistency: 3, + 
decision_usefulness: 3, + weighted_total: 3, + }, + }, + decisive_findings: ['Side A is more actionable.'], + judge_confidence: 'medium', + needs_human_review: false, + rationale: 'Side A wins.', + }; + + const repairedVerdict = { + ...incompleteVerdict, + dimension_rationales: { + claim_quality: 'Side A is stronger.', + evidence_discipline: 'Side A is tighter.', + responsiveness_to_critique: 'Side A addressed critique better.', + internal_consistency: 'Side A is more coherent.', + decision_usefulness: 'Side A is more useful.', + }, + side_summaries: { + side_a: { + strengths: ['Actionable recommendation.'], + weaknesses: ['Could be shorter.'], + }, + side_b: { + strengths: ['Good framing.'], + weaknesses: ['Less decisive.'], + }, + }, + decisive_dimension: 'decision_usefulness', + }; + + let callCount = 0; + const result = await rejudgeConflictRun({ + runDir, + judgeAgent: 'gemini', + label: 'gemini-repair', + turnRunner: async () => { + callCount += 1; + return { + stdout: `${JSON.stringify(callCount === 1 ? 
incompleteVerdict : repairedVerdict, null, 2)}\n`, + stderr: '', + exitCode: 0, + }; + }, + }); + + assert.equal(callCount, 2); + assert.equal(result.verdict.decisive_dimension, 'decision_usefulness'); + + const metadata = JSON.parse(await readFile(path.join(result.outputDir, 'metadata.json'), 'utf8')); + assert.equal(metadata.replay.repair_attempted, true); + assert.equal(metadata.replay.repair_attempts, 1); +}); + +test('rejudgeConflictRun repairs Gemini verdicts that miss structured fields and weighted totals', async () => { + const tmpRoot = await mkdtemp(path.join(os.tmpdir(), 'shipwright-rejudge-')); + const runDir = path.join(tmpRoot, 'conflict-run'); + const judgeDir = path.join(runDir, 'judge'); + await mkdir(judgeDir, { recursive: true }); + + await Promise.all([ + writeFile(path.join(runDir, 'run.json'), '{ "run_id": "repair-run-2" }\n'), + writeFile(path.join(judgeDir, 'verdict.input.json'), '{}\n'), + writeFile(path.join(judgeDir, 'verdict.prompt.txt'), 'prompt'), + ]); + + const incompleteVerdict = { + winner: 'side_b', + margin: 0.4, + rubric_scores: { + side_a: { + claim_quality: 3, + evidence_discipline: 3, + responsiveness_to_critique: 3, + internal_consistency: 3, + decision_usefulness: 3, + }, + side_b: { + claim_quality: 4, + evidence_discipline: 4, + responsiveness_to_critique: 4, + internal_consistency: 4, + decision_usefulness: 4, + }, + }, + decisive_findings: ['Side B is stronger overall.'], + judge_confidence: 'medium', + needs_human_review: true, + rationale: 'Side B wins with a clearer recommendation.', + }; + + const repairedVerdict = { + ...incompleteVerdict, + rubric_scores: { + side_a: { + ...incompleteVerdict.rubric_scores.side_a, + weighted_total: 3, + }, + side_b: { + ...incompleteVerdict.rubric_scores.side_b, + weighted_total: 4, + }, + }, + dimension_rationales: { + claim_quality: 'Side B has the stronger claims.', + evidence_discipline: 'Side B uses evidence more carefully.', + responsiveness_to_critique: 'Side B absorbs 
critique better.', + internal_consistency: 'Side B is more coherent.', + decision_usefulness: 'Side B is more useful to the decision maker.', + }, + side_summaries: { + side_a: { + strengths: ['Good framing.'], + weaknesses: ['Less complete recommendation.'], + }, + side_b: { + strengths: ['More decisive recommendation.'], + weaknesses: ['Slightly more rigid stance.'], + }, + }, + decisive_dimension: 'decision_usefulness', + }; + + let callCount = 0; + const result = await rejudgeConflictRun({ + runDir, + judgeAgent: 'gemini', + label: 'gemini-repair-2', + turnRunner: async () => { + callCount += 1; + return { + stdout: `${JSON.stringify(callCount === 1 ? incompleteVerdict : repairedVerdict, null, 2)}\n`, + stderr: '', + exitCode: 0, + }; + }, + }); + + assert.equal(callCount, 2); + assert.equal(result.verdict.rubric_scores.side_a.weighted_total, 3); + assert.equal(result.verdict.rubric_scores.side_b.weighted_total, 4); +}); + +test('verdict schema rejects weighted totals outside the 1-5 scale', () => { + const verdict = { + winner: 'side_a', + margin: 1, + rubric_scores: { + side_a: { + claim_quality: 4, + evidence_discipline: 4, + responsiveness_to_critique: 4, + internal_consistency: 4, + decision_usefulness: 4, + weighted_total: 20, + }, + side_b: { + claim_quality: 3, + evidence_discipline: 3, + responsiveness_to_critique: 3, + internal_consistency: 3, + decision_usefulness: 3, + weighted_total: 19, + }, + }, + dimension_rationales: { + claim_quality: 'Side A is stronger.', + evidence_discipline: 'Side A is tighter.', + responsiveness_to_critique: 'Side A is more responsive.', + internal_consistency: 'Side A is more coherent.', + decision_usefulness: 'Side A is more useful.', + }, + side_summaries: { + side_a: { + strengths: ['Good strategy.'], + weaknesses: ['Could be shorter.'], + }, + side_b: { + strengths: ['Good framing.'], + weaknesses: ['Less useful.'], + }, + }, + decisive_dimension: 'decision_usefulness', + decisive_findings: ['Side A is stronger.'], 
+ judge_confidence: 'medium', + needs_human_review: true, + rationale: 'Side A wins.', + }; + + const validation = validateConflictDocument(verdict, 'verdict'); + assert.ok(validation.errors.some((error) => error.path === '$.rubric_scores.side_a.weighted_total')); + assert.ok(validation.errors.some((error) => error.path === '$.rubric_scores.side_b.weighted_total')); +}); diff --git a/tests/run-conflict-batch.test.mjs b/tests/run-conflict-batch.test.mjs new file mode 100644 index 00000000..3dc57b0f --- /dev/null +++ b/tests/run-conflict-batch.test.mjs @@ -0,0 +1,308 @@ +import assert from 'node:assert/strict'; +import test from 'node:test'; + +import { buildSummary, parseCliArgs, runBatch } from '../scripts/run-conflict-batch.mjs'; + +test('parseCliArgs handles scenario filters and flags', () => { + const args = parseCliArgs([ + '--scenario', 'prd-hidden-scope-creep', + '--scenario', 'handoff-contradiction', + '--out', '/tmp/summary.md', + '--dry-run', + '--side-a-agent', 'gemini', + '--side-b-agent', 'claude', + '--judge-agent', 'gpt', + '--judge-agent', 'gemini', + '--side-a-reasoning-effort', 'high', + '--side-b-reasoning-effort', 'low', + '--judge-reasoning-effort', 'medium', + ]); + + assert.deepEqual(args.scenarios, ['prd-hidden-scope-creep', 'handoff-contradiction']); + assert.equal(args.outPath, '/tmp/summary.md'); + assert.equal(args.dryRun, true); + assert.equal(args.sideAAgent, 'gemini'); + assert.equal(args.sideBAgent, 'claude'); + assert.deepEqual(args.judgeAgents, ['gpt', 'gemini']); + assert.equal(args.sideAReasoningEffort, 'high'); + assert.equal(args.sideBReasoningEffort, 'low'); + assert.equal(args.judgeReasoningEffort, 'medium'); +}); + +test('runBatch dry-run swaps competitor assignments when requested', async () => { + const results = await runBatch({ + scenarios: ['prd-hidden-scope-creep'], + dryRun: true, + swapSides: true, + }); + + assert.equal(results.length, 2); + for (const result of results) { + assert.equal(result.sideALabel, 'gpt'); + 
assert.equal(result.sideBLabel, 'claude'); + assert.equal(result.status, 'dry_run'); + } +}); + +test('runBatch dry-run supports explicit role assignment including Gemini judge', async () => { + const results = await runBatch({ + scenarios: ['prd-hidden-scope-creep'], + dryRun: true, + sideAAgent: 'gpt', + sideBAgent: 'claude', + judgeAgents: ['gemini'], + }); + + assert.equal(results.length, 1); + assert.equal(results[0].sideALabel, 'gpt'); + assert.equal(results[0].sideBLabel, 'claude'); + assert.equal(results[0].judgeLabel, 'gemini-judge'); + assert.equal(results[0].status, 'dry_run'); +}); + +test('runBatch rejects duplicate side assignments', async () => { + await assert.rejects( + () => runBatch({ + scenarios: ['prd-hidden-scope-creep'], + dryRun: true, + sideAAgent: 'claude', + sideBAgent: 'claude', + }), + (error) => { + assert.ok(error.message.includes('Side A and Side B must use different agents')); + return true; + }, + ); +}); + +test('runBatch rejects unknown judge agents', async () => { + await assert.rejects( + () => runBatch({ + scenarios: ['prd-hidden-scope-creep'], + dryRun: true, + judgeAgents: ['mystery-bot'], + }), + (error) => { + assert.ok(error.message.includes('Unknown judge agent')); + return true; + }, + ); +}); + +test('buildSummary produces judge agreement analysis from completed runs', () => { + const results = [ + { + scenario: 'prd-hidden-scope-creep', + judgeLabel: 'claude-judge', + status: 'completed', + winner: 'side_a', + margin: 1.2, + judgeConfidence: 'high', + needsHumanReview: false, + disagreementRate: 0.67, + declaredAdoptionRate: 1.0, + substantiveRevisionRate: 0.5, + unsupportedClaimCount: 1, + runId: 'run-1', + error: null, + sideALabel: 'claude', + sideBLabel: 'gpt', + }, + { + scenario: 'prd-hidden-scope-creep', + judgeLabel: 'gpt-judge', + status: 'completed', + winner: 'side_b', + margin: 0.8, + judgeConfidence: 'high', + needsHumanReview: true, + disagreementRate: 0.75, + declaredAdoptionRate: 1.0, + 
substantiveRevisionRate: 0.5, + unsupportedClaimCount: 4, + runId: 'run-2', + error: null, + sideALabel: 'claude', + sideBLabel: 'gpt', + }, + { + scenario: 'handoff-contradiction', + judgeLabel: 'claude-judge', + status: 'completed', + winner: 'side_a', + margin: 0.6, + judgeConfidence: 'medium', + needsHumanReview: false, + disagreementRate: 0.5, + declaredAdoptionRate: 0.5, + substantiveRevisionRate: 0.5, + unsupportedClaimCount: 2, + runId: 'run-3', + error: null, + sideALabel: 'claude', + sideBLabel: 'gpt', + }, + { + scenario: 'handoff-contradiction', + judgeLabel: 'gpt-judge', + status: 'completed', + winner: 'side_a', + margin: 0.4, + judgeConfidence: 'medium', + needsHumanReview: false, + disagreementRate: 0.5, + declaredAdoptionRate: 0.5, + substantiveRevisionRate: 0.5, + unsupportedClaimCount: 2, + runId: 'run-4', + error: null, + sideALabel: 'claude', + sideBLabel: 'gpt', + }, + ]; + + const summary = buildSummary(results); + + assert.ok(summary.includes('Judge Agreement Analysis')); + assert.ok(summary.includes('**NO**')); // prd-hidden-scope-creep disagrees + assert.ok(summary.includes('YES')); // handoff-contradiction agrees + assert.ok(summary.includes('1/2 (50%)')); + assert.ok(summary.includes('Average margin delta')); +}); + +test('buildSummary suppresses publishability interpretation when coverage is partial', () => { + const results = [ + { + scenario: 'prd-hidden-scope-creep', + judgeLabel: 'claude-judge', + status: 'completed', + winner: 'side_a', + margin: 1.0, + judgeConfidence: 'high', + needsHumanReview: false, + disagreementRate: 0.5, + declaredAdoptionRate: 1.0, + substantiveRevisionRate: 0.5, + unsupportedClaimCount: 0, + runId: 'run-1', + error: null, + sideALabel: 'claude', + sideBLabel: 'gpt', + }, + { + scenario: 'prd-hidden-scope-creep', + judgeLabel: 'gpt-judge', + status: 'completed', + winner: 'side_a', + margin: 0.8, + judgeConfidence: 'high', + needsHumanReview: false, + disagreementRate: 0.5, + declaredAdoptionRate: 1.0, + 
substantiveRevisionRate: 0.5, + unsupportedClaimCount: 0, + runId: 'run-2', + error: null, + sideALabel: 'claude', + sideBLabel: 'gpt', + }, + { + scenario: 'handoff-contradiction', + judgeLabel: 'claude-judge', + status: 'error', + winner: null, + margin: null, + judgeConfidence: null, + needsHumanReview: null, + disagreementRate: null, + declaredAdoptionRate: null, + substantiveRevisionRate: null, + unsupportedClaimCount: null, + runId: null, + error: 'Model timeout', + sideALabel: 'claude', + sideBLabel: 'gpt', + }, + { + scenario: 'handoff-contradiction', + judgeLabel: 'gpt-judge', + status: 'completed', + winner: 'side_b', + margin: 0.5, + judgeConfidence: 'medium', + needsHumanReview: false, + disagreementRate: 0.5, + declaredAdoptionRate: 1.0, + substantiveRevisionRate: 0.5, + unsupportedClaimCount: 0, + runId: 'run-4', + error: null, + sideALabel: 'claude', + sideBLabel: 'gpt', + }, + ]; + + const summary = buildSummary(results); + + // 1/2 completed comparisons — should warn about partial coverage + assert.ok(summary.includes('1/2')); + assert.ok(summary.includes('WARNING')); + assert.ok(summary.includes('partial coverage')); + // Should NOT include any "usable" / "publishable" interpretation + assert.ok(!summary.includes('Single-judge runs are usable')); +}); + +test('runBatch rejects unknown scenario IDs', async () => { + await assert.rejects( + () => runBatch({ scenarios: ['prd-hidden-scope-creep', 'nonexistent-typo'], dryRun: true }), + (error) => { + assert.ok(error.message.includes('Unknown scenario(s): nonexistent-typo')); + return true; + }, + ); +}); + +test('buildSummary handles errors gracefully', () => { + const results = [ + { + scenario: 'broken-scenario', + judgeLabel: 'claude-judge', + status: 'error', + winner: null, + margin: null, + judgeConfidence: null, + needsHumanReview: null, + disagreementRate: null, + declaredAdoptionRate: null, + substantiveRevisionRate: null, + unsupportedClaimCount: null, + runId: null, + error: 'Model output 
is not valid JSON', + sideALabel: 'claude', + sideBLabel: 'gpt', + }, + { + scenario: 'broken-scenario', + judgeLabel: 'gpt-judge', + status: 'completed', + winner: 'side_a', + margin: 0.5, + judgeConfidence: 'medium', + needsHumanReview: false, + disagreementRate: 0.5, + declaredAdoptionRate: 1.0, + substantiveRevisionRate: 0.5, + unsupportedClaimCount: 0, + runId: 'run-5', + error: null, + sideALabel: 'claude', + sideBLabel: 'gpt', + }, + ]; + + const summary = buildSummary(results); + + assert.ok(summary.includes('Errors: 1')); + assert.ok(summary.includes('Model output is not valid JSON')); + assert.ok(summary.includes('ERROR')); +}); diff --git a/tests/run-conflict-harness.test.mjs b/tests/run-conflict-harness.test.mjs new file mode 100644 index 00000000..027cba9b --- /dev/null +++ b/tests/run-conflict-harness.test.mjs @@ -0,0 +1,541 @@ +import assert from 'node:assert/strict'; +import { mkdtemp, readFile, rm } from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; +import test from 'node:test'; + +import { injectReasoningEffort, runConflictHarness } from '../scripts/run-conflict-harness.mjs'; + +function createCasePacket() { + return { + scenario_id: 'conflict-smoke', + title: 'Conflict Smoke Test', + prompt: 'Write the strongest possible recommendation.', + artifact_type: 'strategy', + rubric: { + dimensions: [ + 'claim quality', + 'evidence discipline', + 'responsiveness to critique', + 'internal consistency', + 'decision usefulness', + ], + scoring_scale: '1-5', + expected_sections: ['Decision Frame'], + scoring_spec_ref: null, + }, + constraints: { + expected_sections: ['Decision Frame'], + expect_structured: false, + context_files: [], + scoring_spec_ref: null, + }, + evidence: [], + max_rounds: 3, + tool_policy: 'none', + sharing_policy: { + share_case_packet: true, + share_committed_artifacts_after_first_pass: true, + share_critiques_after_open: true, + share_hidden_reasoning: false, + share_provider_identity: false, + 
share_internal_coalition_drafts: false, + }, + success_condition: { + type: 'validator_contract', + description: 'Return a valid artifact packet.', + validator: { + artifact_type: 'strategy', + expect_sections: ['Decision Frame'], + expect_structured: false, + }, + }, + }; +} + +test('runConflictHarness completes a head-to-head run and writes state', async () => { + const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-conflict-run-')); + const calls = []; + + try { + const { run, outDir } = await runConflictHarness({ + casePacket: createCasePacket(), + outDir: rootDir, + runId: 'conflict-smoke-run', + sideAReasoningEffort: 'high', + sideBReasoningEffort: 'low', + judgeReasoningEffort: 'medium', + turnRunner: async (options) => { + calls.push({ + phase: options.phase, + sideId: options.sideId, + prompt: options.prompt, + packet: options.packet, + attempt: options.attempt, + reasoningEffort: options.reasoningEffort, + }); + + if (options.phase === 'first_pass') { + return { + packet: { + run_id: 'conflict-smoke-run', + side_id: options.sideId, + round: 'first_pass', + artifact_markdown: `# ${options.sideId} first pass`, + claims: [ + { + claim_id: `${options.sideId}-claim-1`, + summary: `${options.sideId} major claim`, + evidence_refs: ['ctx-1'], + is_major: true, + }, + ], + citations: ['ctx-1'], + conclusion_confidence: 'medium', + open_questions: [], + critique_responses: [], + }, + usage: { estimatedCostUsd: 0 }, + }; + } + + if (options.phase === 'rebuttal') { + return { + packet: { + target_side: options.sideId === 'side_a' ? 'side_b' : 'side_a', + finding_id: 'ignored-by-runner', + target_claim_ids: [options.sideId === 'side_a' ? 
'side_b-claim-1' : 'side_a-claim-1'], + claim_under_attack: 'The opposing claim is weak.', + attack_type: 'evidence_gap', + evidence_or_reason: 'The visible claim needs stronger support.', + severity: 'medium', + }, + usage: { estimatedCostUsd: 0 }, + }; + } + + if (options.phase === 'final') { + return { + packet: { + run_id: 'conflict-smoke-run', + side_id: options.sideId, + round: 'final', + artifact_markdown: `# ${options.sideId} final`, + claims: [ + { + claim_id: `${options.sideId}-claim-1`, + summary: `${options.sideId} revised claim`, + evidence_refs: ['ctx-1'], + is_major: true, + }, + ], + citations: ['ctx-1'], + conclusion_confidence: 'high', + open_questions: [], + critique_responses: [ + { + finding_id: options.sideId === 'side_a' ? 'finding-2' : 'finding-1', + disposition: 'adopted', + rationale: 'The critique improved the final answer.', + }, + ], + }, + usage: { estimatedCostUsd: 0 }, + }; + } + + if (options.phase === 'judge') { + return { + packet: { + winner: 'side_a', + margin: 0.2, + rubric_scores: { + side_a: { + claim_quality: 5, + evidence_discipline: 4, + responsiveness_to_critique: 4, + internal_consistency: 5, + decision_usefulness: 4, + weighted_total: 4.4, + }, + side_b: { + claim_quality: 4, + evidence_discipline: 3, + responsiveness_to_critique: 3, + internal_consistency: 4, + decision_usefulness: 3, + weighted_total: 3.4, + }, + }, + dimension_rationales: { + claim_quality: 'Side A made the crisper top-level claim.', + evidence_discipline: 'Side A stayed closer to the visible evidence.', + responsiveness_to_critique: 'Side A addressed the rebuttal more concretely.', + internal_consistency: 'Side A had fewer internal gaps.', + decision_usefulness: 'Side A gave the board a more actionable posture.', + }, + side_summaries: { + side_a: { + strengths: ['Concrete response to critique.'], + weaknesses: ['Less ambitious framing.'], + }, + side_b: { + strengths: ['Broader framing of the problem.'], + weaknesses: ['More diffuse final 
recommendation.'], + }, + }, + decisive_dimension: 'responsiveness_to_critique', + decisive_findings: ['Side A responded more concretely to the rebuttal.'], + judge_confidence: 'medium', + needs_human_review: false, + rationale: 'Side A is clearer and more responsive.', + }, + usage: { estimatedCostUsd: 0 }, + }; + } + + throw new Error(`Unexpected phase: ${options.phase}`); + }, + }); + + const state = JSON.parse(await readFile(path.join(outDir, 'state.json'), 'utf8')); + const config = JSON.parse(await readFile(path.join(outDir, 'config.json'), 'utf8')); + const judgeCall = calls.find((entry) => entry.phase === 'judge'); + const sideACall = calls.find((entry) => entry.phase === 'first_pass' && entry.sideId === 'side_a'); + const sideBCall = calls.find((entry) => entry.phase === 'first_pass' && entry.sideId === 'side_b'); + + assert.equal(run.status, 'completed'); + assert.equal(run.results.winner, 'side_a'); + assert.equal(run.metrics.declared_adoption_rate, 1); + assert.equal(run.metrics.substantive_revision_rate, 1); + assert.equal(run.sides.side_a.reasoning_effort, 'high'); + assert.equal(run.sides.side_b.reasoning_effort, 'low'); + assert.equal(run.judge.reasoning_effort, 'medium'); + assert.equal(state.last_completed_phase, 'adjudication'); + assert.equal(state.status, 'completed'); + assert.deepEqual(config.reasoning_efforts, { + side_a: 'high', + side_b: 'low', + judge: 'medium', + }); + assert.equal(sideACall.reasoningEffort, 'high'); + assert.equal(sideBCall.reasoningEffort, 'low'); + assert.equal(judgeCall.reasoningEffort, 'medium'); + assert.ok(judgeCall.prompt.includes('judge_confidence rubric')); + assert.ok(judgeCall.prompt.includes('min_margin_for_verdict = 0.1')); + assert.ok(judgeCall.prompt.includes('conclusion_confidence')); + assert.ok(!('provider' in judgeCall.packet)); + assert.ok(!('provider' in judgeCall.packet.first_pass_artifacts.side_a)); + assert.ok(!('provider' in judgeCall.packet.first_pass_artifacts.side_b)); + } finally { + await 
rm(rootDir, { recursive: true, force: true }); + } +}); + +/* Checks the command string returned by injectReasoningEffort for each effort level: low and medium are both expected to contain the shipwright-gemini-medium alias flag, high the shipwright-gemini-high one. NOTE(review): low resolving to the medium alias is not explained in this test; confirm it is intentional. */test('injectReasoningEffort pins Gemini to project-local aliases', () => { + const low = injectReasoningEffort('cat {{prompt_file}} | gemini --approval-mode plan --output-format text -p ""', 'low'); + const medium = injectReasoningEffort('cat {{prompt_file}} | gemini --approval-mode plan --output-format text -p ""', 'medium'); + const high = injectReasoningEffort('cat {{prompt_file}} | gemini --approval-mode plan --output-format text -p ""', 'high'); + + assert.ok(low.includes("gemini -m 'shipwright-gemini-medium'")); + assert.ok(medium.includes("gemini -m 'shipwright-gemini-medium'")); + assert.ok(high.includes("gemini -m 'shipwright-gemini-high'")); +}); + +/* Runs the harness with maxCostUsd 10 and a stub turnRunner whose first_pass turns each report estimatedCostUsd 7 and which throws for any other phase. Expects run.status to be budget_exhausted, both sides to hold a first_pass result, and only the two first_pass calls to be recorded. */test('runConflictHarness enforces budget at phase boundaries', async () => { + const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-conflict-budget-')); + const calls = []; + + try { + const { run } = await runConflictHarness({ + casePacket: createCasePacket(), + outDir: rootDir, + runId: 'conflict-budget-run', + maxCostUsd: 10, + turnRunner: async (options) => { + calls.push(`${options.phase}:${options.sideId || 'judge'}`); + if (options.phase === 'first_pass') { + return { + packet: { + run_id: 'conflict-budget-run', + side_id: options.sideId, + round: 'first_pass', + artifact_markdown: `# ${options.sideId} first pass`, + claims: [ + { + claim_id: `${options.sideId}-claim-1`, + summary: `${options.sideId} major claim`, + evidence_refs: ['ctx-1'], + is_major: true, + }, + ], + citations: ['ctx-1'], + conclusion_confidence: 'medium', + open_questions: [], + critique_responses: [], + }, + usage: { estimatedCostUsd: 7 }, + }; + } + + throw new Error('No phase after first_pass should execute.'); + }, + }); + + assert.equal(run.status, 'budget_exhausted'); + assert.ok(run.sides.side_a.first_pass); + assert.ok(run.sides.side_b.first_pass); + assert.deepEqual(calls, ['first_pass:side_a', 'first_pass:side_b']); + } finally { + await rm(rootDir, { 
recursive: true, force: true }); + } +}); + +/* Stub turnRunner returns a side_a first_pass whose markdown names ChatGPT on the first attempt only; every other turn returns a fixed packet for its phase. Expects exactly two side_a first_pass attempts and the second side_a prompt to carry the provider self-identification repair instruction. */test('runConflictHarness retries first-pass identity leakage once before continuing', async () => { + const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-conflict-leak-')); + let firstPassAttempts = 0; + const prompts = []; + + try { + await runConflictHarness({ + casePacket: createCasePacket(), + outDir: rootDir, + runId: 'conflict-leak-run', + turnRunner: async (options) => { + if (options.phase === 'first_pass' && options.sideId === 'side_a') { + prompts.push(options.prompt); + } + if (options.phase === 'first_pass' && options.sideId === 'side_a') { + firstPassAttempts += 1; + if (firstPassAttempts === 1) { + return { + packet: { + run_id: 'conflict-leak-run', + side_id: 'side_a', + round: 'first_pass', + artifact_markdown: '# As ChatGPT I recommend shipping now', + claims: [ + { + claim_id: 'side_a-claim-1', + summary: 'Leaky claim', + evidence_refs: ['ctx-1'], + is_major: true, + }, + ], + citations: ['ctx-1'], + conclusion_confidence: 'medium', + open_questions: [], + critique_responses: [], + }, + usage: { estimatedCostUsd: 0 }, + }; + } + } + + if (options.phase === 'first_pass') { + return { + packet: { + run_id: 'conflict-leak-run', + side_id: options.sideId, + round: 'first_pass', + artifact_markdown: `# ${options.sideId} clean first pass`, + claims: [ + { + claim_id: `${options.sideId}-claim-1`, + summary: `${options.sideId} claim`, + evidence_refs: ['ctx-1'], + is_major: true, + }, + ], + citations: ['ctx-1'], + conclusion_confidence: 'medium', + open_questions: [], + critique_responses: [], + }, + usage: { estimatedCostUsd: 0 }, + }; + } + + if (options.phase === 'rebuttal') { + return { + packet: { + target_side: options.sideId === 'side_a' ? 'side_b' : 'side_a', + finding_id: 'ignored', + target_claim_ids: [options.sideId === 'side_a' ? 
'side_b-claim-1' : 'side_a-claim-1'], + claim_under_attack: 'Weak claim', + attack_type: 'evidence_gap', + evidence_or_reason: 'Needs more proof.', + severity: 'medium', + }, + }; + } + + if (options.phase === 'final') { + return { + packet: { + run_id: 'conflict-leak-run', + side_id: options.sideId, + round: 'final', + artifact_markdown: `# ${options.sideId} final`, + claims: [ + { + claim_id: `${options.sideId}-claim-1`, + summary: `${options.sideId} final claim`, + evidence_refs: ['ctx-1'], + is_major: true, + }, + ], + citations: ['ctx-1'], + conclusion_confidence: 'high', + open_questions: [], + critique_responses: [ + { + finding_id: options.sideId === 'side_a' ? 'finding-2' : 'finding-1', + disposition: 'adopted', + rationale: 'Updated cleanly.', + }, + ], + }, + }; + } + + if (options.phase === 'judge') { + return { + packet: { + winner: 'side_b', + margin: 0.15, + rubric_scores: { + side_a: { + claim_quality: 4, + evidence_discipline: 4, + responsiveness_to_critique: 4, + internal_consistency: 4, + decision_usefulness: 4, + weighted_total: 4, + }, + side_b: { + claim_quality: 5, + evidence_discipline: 4, + responsiveness_to_critique: 5, + internal_consistency: 5, + decision_usefulness: 4, + weighted_total: 4.6, + }, + }, + dimension_rationales: { + claim_quality: 'Side B made the stronger core recommendation.', + evidence_discipline: 'Both sides were similar on evidence use.', + responsiveness_to_critique: 'Side B responded more directly to the critique.', + internal_consistency: 'Side B was more internally coherent.', + decision_usefulness: 'Side B was more actionable overall.', + }, + side_summaries: { + side_a: { + strengths: ['Clear initial structure.'], + weaknesses: ['Did not fully resolve the critique.'], + }, + side_b: { + strengths: ['More complete final answer.'], + weaknesses: ['Still had minor ambiguity.'], + }, + }, + decisive_dimension: 'decision_usefulness', + decisive_findings: ['Side B was stronger.'], + judge_confidence: 'medium', + 
needs_human_review: false, + rationale: 'Side B wins.', + }, + }; + } + + throw new Error(`Unexpected phase: ${options.phase}`); + }, + }); + + assert.equal(firstPassAttempts, 2); + assert.ok(prompts[1].includes('Repair instruction: remove any provider self-identification')); + } finally { + await rm(rootDir, { recursive: true, force: true }); + } +}); + +/* Stub turnRunner makes every side_a first_pass cite Side B in its markdown and throws for any phase other than first_pass. Expects the harness promise to reject with a Protocol violation message, two side_a prompts with the second carrying the unseen-opponent repair instruction, and the run.json and state.json files read from the conflict-smoke directory for this runId to both record status protocol_violation, with run.json holding at least one audit.protocol_violations entry. */test('runConflictHarness terminates with protocol_violation after repeated unseen-opponent references', async () => { + const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-conflict-protocol-')); + const prompts = []; + const runId = 'conflict-protocol-run'; + const transcriptDir = path.join(rootDir, 'conflict-smoke', runId); + + try { + await assert.rejects( + runConflictHarness({ + casePacket: createCasePacket(), + outDir: rootDir, + runId, + turnRunner: async (options) => { + if (options.phase === 'first_pass' && options.sideId === 'side_a') { + prompts.push(options.prompt); + return { + packet: { + run_id: runId, + side_id: 'side_a', + round: 'first_pass', + artifact_markdown: '# As Side B argued, the recommendation should narrow the scope', + claims: [ + { + claim_id: 'side_a-claim-1', + summary: 'Protocol violation claim', + evidence_refs: ['ctx-1'], + is_major: true, + }, + ], + citations: ['ctx-1'], + conclusion_confidence: 'medium', + open_questions: [], + critique_responses: [], + }, + usage: { estimatedCostUsd: 0 }, + }; + } + + if (options.phase === 'first_pass') { + return { + packet: { + run_id: runId, + side_id: options.sideId, + round: 'first_pass', + artifact_markdown: `# ${options.sideId} first pass`, + claims: [ + { + claim_id: `${options.sideId}-claim-1`, + summary: `${options.sideId} claim`, + evidence_refs: ['ctx-1'], + is_major: true, + }, + ], + citations: ['ctx-1'], + conclusion_confidence: 'medium', + open_questions: [], + critique_responses: [], + }, + usage: { estimatedCostUsd: 0 }, + }; + } + + throw new Error(`Unexpected phase: ${options.phase}`); + }, + }), + /Protocol 
violation/, + ); + + const persistedRun = JSON.parse(await readFile(path.join(transcriptDir, 'run.json'), 'utf8')); + const persistedState = JSON.parse(await readFile(path.join(transcriptDir, 'state.json'), 'utf8')); + + assert.equal(prompts.length, 2); + assert.ok(prompts[1].includes('Repair instruction: remove any mention of unseen opponent content')); + assert.equal(persistedRun.status, 'protocol_violation'); + assert.equal(persistedState.status, 'protocol_violation'); + assert.ok(persistedRun.audit.protocol_violations.length > 0); + } finally { + await rm(rootDir, { recursive: true, force: true }); + } +});