diff --git a/.github/skills/harness/SKILL.md b/.github/skills/harness/SKILL.md new file mode 100644 index 00000000..53daf8bf --- /dev/null +++ b/.github/skills/harness/SKILL.md @@ -0,0 +1,160 @@ +--- +name: harness +description: > + Universal harness engineering for AI agent projects. Initialize, configure, and manage + project harnesses that make AI agent work reproducible, measurable, and CI-ready. + Use when setting up a new AI agent project, retrofitting harness to existing project, + creating harness.yaml config, running `sk harness init`, debugging non-reproducible + agent results, defining success criteria for AI tasks, or wiring harness to GitHub Actions CI. + Trigger phrases: "harness init", "setup harness", "harness engineering", "agent evaluation", + "success criteria", "sk harness", "harness.yaml", "make agent work reproducible". +--- + +# Harness Skill + +Make AI agent work reproducible, measurable, and CI-ready using `sk harness`. + +## Quick Start + +```bash +sk harness init # scaffold — auto-detects project language +sk harness doctor # verify config is valid +sk harness show # list registered commands +SK_HARNESS=1 sk briefing "task" # enable dispatch middleware +``` + +## What This Skill Does + +1. **Initialize** `harness.yaml` and `.harness/` for any project +2. **Configure** success criteria that gate AI agent merges +3. **Enable** dispatch middleware (`SK_HARNESS=1`) for telemetry and hooks +4. **Wire** harness checks into GitHub Actions CI +5. **Debug** non-reproducible agent results with structured evidence + +Core insight: harness configuration produces a 36% performance gap with the same AI model +(CORE benchmark, arXiv 2412.04524). A well-structured harness is worth more than a model upgrade. 
+ +## When to Use + +| Situation | What to do | +|-----------|-----------| +| New project, no tests yet | `sk harness init --skeleton-only` | +| New project with tests | `sk harness init` (auto-detects commands) | +| Existing project | `sk harness init` (detects existing test/lint commands) | +| >1 AI agent on same codebase | Full harness + `sk tentacle` for orchestration | +| CI failing from agent changes | Retrofit with `required: true` success criteria | +| One-off script, no recurrence | Skip harness; use `sk learn` to record the pattern | + +## Commands Reference + +### `sk harness init` — Scaffold Harness Config + +```bash +sk harness init # scaffold for current directory +sk harness init --target /path # scaffold for a specific path +sk harness init --name my-project # set project name in harness.yaml +sk harness init --skeleton-only # create .harness/ dirs only, no harness.yaml +sk harness init --ci # also create .github/workflows/harness-ci.yml +sk harness init --no-ci # skip CI workflow generation +sk harness init --force # overwrite existing harness.yaml +sk harness init --yes # non-interactive, accept all defaults +sk harness init --json # machine-readable output +``` + +Detects project type from: `pyproject.toml`, `setup.py`, `requirements.txt` (Python), +`package.json` (Node/bun/pnpm/yarn), `Cargo.toml` (Rust), `go.mod` (Go), `pom.xml`/`build.gradle` (Java). 
+ +### Other Commands + +```bash +sk harness show [--tag TAG] [--json] # list registered commands +sk harness check [--json] # verify scripts exist on disk +sk harness doctor [--json] # full self-check (scripts, DB, hooks) +sk harness config list|get|set # manage env vars (SK_HARNESS, SK_DRY_RUN, + # SK_DEBUG_TIMING, SK_TOOLS_DIR) +``` + +### Enable Dispatch Middleware + +```bash +SK_HARNESS=1 sk <command> # enable for one command +export SK_HARNESS=1 # enable for session +SK_DRY_RUN=1 SK_HARNESS=1 sk check # dry-run: test hooks without executing +SK_DEBUG_TIMING=1 SK_HARNESS=1 sk check # emit per-hook timing to stderr +``` + +## `harness.yaml` Schema + +```yaml +harness: + name: my-project + version: "1.0" + +environment: + type: python # python | node | rust | go | java | generic + setup: [] # commands to run before agent tasks + +commands: + test: python3 run_all_tests.py + lint: ruff check . + format_check: ruff format --check . + build: ~ + +success_criteria: + - id: tests-pass + command: python3 run_all_tests.py + required: true # gates CI merges + description: All tests must pass + - id: lint-clean + command: ruff check . + required: false # advisory only + description: Lint should be clean + +reporting: + format: jsonl + output_dir: .harness/reports/ + telemetry: true + +ci: + enabled: false + provider: github-actions + on: [push, pull_request] +``` + +## Typical Workflow + +```bash +# 1. Initialize harness +sk harness init --yes + +# 2. Review and adjust harness.yaml (add your real test commands) +$EDITOR harness.yaml + +# 3. Validate +sk harness doctor + +# 4. Run agent with harness middleware +SK_HARNESS=1 sk tentacle swarm my-feature --agent-type general-purpose --model claude-sonnet-4.6 + +# 5. Check criteria passed +sk harness check + +# 6. Record outcome +sk learn --pattern "Harness retrofit" "harness init detected Python correctly" --tags "harness" +``` + +For CI: `sk harness init --ci` generates `.github/workflows/harness-ci.yml`. 
Edit it to add +secrets, matrix builds, or additional steps as needed. + +## Limitations + +- **Non-determinism**: same harness + same model ≠ same result; use pass@k for measurement +- **API cost**: every harness-wrapped invocation may call APIs; use `SK_DRY_RUN=1` to test +- **Overhead**: ~50–100ms per dispatch; negligible for interactive use +- **Scope**: `SK_HARNESS=1` only wraps the Python shim; native Rust binary bypasses it +- **No containers**: tasks run in current environment (no Docker isolation like SWE-bench) +- **CI**: only GitHub Actions is auto-generated; other providers need manual adaptation +- **Concurrency**: use `sk tentacle` for multi-agent runs; concurrent bare harness runs can race + +> See [docs/HARNESS-PHILOSOPHY.md](../../../docs/HARNESS-PHILOSOPHY.md) for full rationale. +> See [docs/HARNESS.md](../../../docs/HARNESS.md) for API reference. diff --git a/docs/HARNESS-PHILOSOPHY.md b/docs/HARNESS-PHILOSOPHY.md new file mode 100644 index 00000000..d5c6a9fd --- /dev/null +++ b/docs/HARNESS-PHILOSOPHY.md @@ -0,0 +1,552 @@ +# Harness Philosophy + +> Canonical philosophy and design rationale for `sk harness` — the universal harness +> engineering layer for AI agent projects. +> +> **API reference:** [docs/HARNESS.md](HARNESS.md) · **Skill:** [.github/skills/harness/SKILL.md](../.github/skills/harness/SKILL.md) + +--- + +## 1. Why Harness Engineering? + +### The 36% Performance Gap + +The most important empirical finding in AI agent evaluation is not about model size or +parameter count — it is about the harness. The CORE benchmark (arXiv 2412.04524) +demonstrated that **the same AI model, evaluated across different harness configurations, +produces a 36% performance gap**. The model didn't change. The prompts didn't change. +Only the harness changed. + +This means: + +- A team using a well-structured harness gets the equivalent of a model upgrade for free. +- A team skipping harness engineering is systematically leaving performance on the table. 
+- Harness configuration is a first-class engineering concern, not an afterthought. + +### AI Agents Are Non-Deterministic; Harnesses Make Outcomes Reproducible + +An AI agent asked to "fix the bug" will produce a different output on each run. This +non-determinism is fundamental — it is not a defect to be fixed, it is a property to be +managed. Harness engineering accepts this property and builds a structured envelope around +it: + +- **Input control**: the agent always receives the same task definition, context, and + constraints. +- **Environment reproducibility**: dependencies, config, and test data are locked. +- **Output capture**: agent output is captured in a structured, comparable format. +- **Success criteria**: objective measures (tests pass, lint clean, file created) are + defined before the agent runs. + +With these four properties, a harness transforms non-deterministic agent runs into a +measurable distribution. You may not know which specific output the agent will produce, +but you can measure whether it satisfies your criteria. + +### Harness Is Infrastructure for AI Work, Like CI Is Infrastructure for Code + +Before CI existed, developers ran tests manually — sometimes, inconsistently, and often +not at all. The quality of a codebase depended on individual discipline rather than +systemic enforcement. CI changed this: tests run on every commit, automatically, and the +result gates the merge. 
+ +**Harness engineering is the same transition for AI agent work.** + +| Without Harness | With Harness | +|----------------|-------------| +| Ad-hoc prompting, inconsistent results | Structured task definition, reproducible inputs | +| No way to measure improvement | Quantified success criteria, regression detection | +| Agent outputs lost or untracked | JSONL telemetry, structured reports | +| "It worked on my machine" | Locked environment, CI-ready gates | +| Vibe coding — merge if it looks right | Evidence-based merges — merge only when gates pass | + +The progression is direct: **ad-hoc prompting → harness → measurable AI engineering**. + +--- + +## 2. What Is a Harness? + +A harness is a **structured runtime environment** that controls how AI agents receive +tasks, execute work, and report results. It is the glue between AI capability and +measured outcomes. + +### Components + +#### Task Definition +What the agent must do. Includes: +- Input specification (problem statement, file scope, constraints) +- Expected output description (file created, test passing, JSON produced) +- Success criteria (objective, checkable conditions) +- Metadata (name, tags, timeout, required/optional) + +#### Environment Setup +A reproducible environment for every run: +- Locked dependencies (`requirements.txt`, `package.json`, `Cargo.lock`) +- Deterministic configuration (env vars, feature flags) +- Test data and fixtures +- Optional: containerization (Docker, devcontainer) + +#### Agent Runner +Dispatches the agent and captures output: +- Wraps the agent invocation with pre/post hooks +- Captures stdout, stderr, timing, exit code +- Injects context (briefing, knowledge entries) +- Supports dry-run mode for testing the harness itself + +#### Success Criteria +Objective, checkable conditions that define a passing run: +- Test suite passes (`python3 test_fixes.py`) +- Lint is clean (`ruff check .`) +- Required file exists and is syntactically valid +- No regressions introduced 
(existing pass tests still pass) + +#### Reporter +Aggregates results across runs and surfaces evidence: +- Detects regressions (tests that were passing are now failing) +- Aggregates pass/fail statistics +- Produces evidence for closeout (required by Rule 9) +- Writes to `~/.copilot/markers/harness-telemetry.jsonl` + +#### Hooks +Lifecycle events for instrumentation and enforcement: +- `preDispatch`: inject context, validate environment, record start +- `postDispatch`: capture output, run success checks, record end +- `sessionStart`: resume banner, paused-goal detection +- `sessionEnd`: write telemetry, update knowledge + +### What a Harness Is Not + +A harness is not: +- **A prompt template**: prompts are inputs; the harness is the container for prompt delivery + and result capture. +- **A test framework**: tests measure code correctness; the harness measures whether the + agent produced correct code. +- **A CI system**: CI runs tests; the harness runs agents that produce the code CI then tests. +- **A model**: the model is the cognitive engine; the harness is the mechanical infrastructure + around it. + +--- + +## 3. How `sk harness` Works + +### Existing Infrastructure + +#### `SK_HARNESS=1` — Dispatch Middleware + +Setting `SK_HARNESS=1` in the environment enables the Python dispatch middleware in +`harness/dispatch.py`. Every `sk` command that passes through the shim gets wrapped with: + +1. **Pre-dispatch hook pipeline**: timing start, context injection, dry-run check +2. **Command execution**: the actual `sk` subcommand runs +3. **Post-dispatch hook pipeline**: timing end, output capture, telemetry write + +```bash +SK_HARNESS=1 sk briefing "my task" +# → preDispatch hook fires +# → briefing runs with context +# → postDispatch hook fires, writes JSONL telemetry +``` + +The middleware is **fail-open**: if `harness/dispatch.py` crashes, the command still runs. +Hook failures are logged, not fatal. 
+ +#### `harness-manifest.json` — Command Registry + +The manifest registers 77+ `sk` commands with metadata: +- Command name, description, tags +- Input/output types +- Required/optional flags +- Hook points (which lifecycle events apply) + +Used by `sk harness show`, `sk harness check`, and `sk harness doctor` to validate and +display the harness configuration. + +#### `DispatchContext` — Per-Invocation State + +```python +@dataclass(frozen=True) +class DispatchContext: + command: str # e.g. "briefing" + args: list[str] # e.g. ["my task", "--compact"] + dry_run: bool # SK_DRY_RUN=1 + debug_timing: bool # SK_DEBUG_TIMING=1 + tools_dir: Path # SK_TOOLS_DIR or default +``` + +Each `sk` invocation gets a fresh `DispatchContext`. State does not leak between +invocations (cf. openai/evals fresh Solver pattern). + +#### `CommandMeta` — Registry Entry + +```python +@dataclass(frozen=True) +class CommandMeta: + name: str + description: str + tags: list[str] + hooks: list[str] +``` + +Frozen dataclass: immutable once loaded, safe for concurrent access. + +### New: `sk harness init` + +`sk harness init` scaffolds a universal harness configuration for any project. It: + +1. **Detects project type** — Python (`pyproject.toml`, `setup.py`, `requirements.txt`), + Node (`package.json`), Rust (`Cargo.toml`), Go (`go.mod`), Java (`pom.xml`, `build.gradle`) +2. **Auto-detects commands** — infers test, lint, format, build commands from project type +3. **Creates `harness.yaml`** — universal config at the project root +4. **Creates `.harness/` structure** — local state directories +5. 
**Optionally creates CI** — `.github/workflows/harness-ci.yml` with `--ci` flag + +```bash +# Scaffold harness for current project +sk harness init + +# Scaffold for a specific path with a name +sk harness init --target /path/to/project --name my-project + +# Create .harness/ structure only (no harness.yaml) +sk harness init --skeleton-only + +# Include GitHub Actions CI workflow +sk harness init --ci + +# Non-interactive (accept all defaults) +sk harness init --yes +``` + +### Environment Variables + +| Variable | Default | Effect | +|----------|---------|--------| +| `SK_HARNESS` | `0` | Set to `1` to enable dispatch middleware | +| `SK_DRY_RUN` | `0` | Set to `1` to skip actual command execution | +| `SK_DEBUG_TIMING` | `0` | Set to `1` to emit per-hook timing to stderr | +| `SK_TOOLS_DIR` | `~/.copilot/tools` | Override tools directory path | + +--- + +## 4. When to Use Harness Engineering + +### Decision Matrix + +| Situation | Recommended Approach | +|-----------|---------------------| +| New project, no tests yet | `sk harness init --skeleton-only` to create structure; add test commands as tests are written | +| New project, starting with tests | `sk harness init` for full config with auto-detected commands | +| Existing project with tests | `sk harness init` — detects existing test/lint commands; review and adjust `harness.yaml` | +| Existing project, no tests | `sk harness init --skeleton-only`; add tests before defining `success_criteria` | +| AI agent development (evals) | Full harness with `success_criteria`, pass@k, LLM-as-judge | +| Simple one-off script | Skip harness; use `sk learn` to record the pattern after it works | +| Multi-agent orchestration | Full harness + `sk tentacle` for orchestration coordination | +| CI failing from agent changes | Retrofit harness with `required: true` success criteria to gate merges | + +### Signals That You Need a Harness + +**You should start harness engineering when:** + +- You have more than one AI agent working 
on the same codebase and cannot compare their + results objectively. +- You cannot reproduce an AI agent's results — same prompt, different output, no way to + understand why. +- CI keeps failing from agent-introduced changes that were "tested" locally but not + systematically. +- You are making a decision about which AI model or configuration to use, and you have + no quantitative basis for the comparison. +- You are onboarding a new team member who needs to understand what "a passing agent run" + looks like. + +**You can skip harness engineering when:** + +- The task is a one-off with no recurrence (no need to reproduce). +- The agent output is purely informational (no code changes, no CI gates). +- You are prototyping and not yet ready to define success criteria. + +--- + +## 5. Architecture & Components (for `sk`) + +### File Map + +``` +~/.copilot/tools/ +├── harness/ +│ ├── __init__.py # Package init, exports +│ ├── dispatch.py # DispatchContext, run_with_hooks(), hook pipeline +│ ├── manifest.py # lru_cache manifest loader +│ └── meta.py # CommandMeta dataclass (frozen) +├── harness-manifest.json # Command registry (77+ entries) +├── harness-init.py # sk harness init implementation +│ +# Per-project (created by sk harness init): +<project-root>/ +├── harness.yaml # Universal harness config +└── .harness/ + ├── tasks/ # Task definition files (.yaml) + ├── reports/ # Agent run reports (.jsonl) + └── state/ # Harness runtime state +``` + +### `harness/dispatch.py` — Dispatch Middleware + +```python +def run_with_hooks(ctx: DispatchContext, fn: Callable) -> int: + """ + Run fn wrapped with pre/post hook pipeline. + Fail-open: hook failures are logged, fn always runs. + Writes JSONL record to harness-telemetry.jsonl on completion. 
+ """ +``` + +Key behaviors: +- Fresh `DispatchContext` per invocation (no state leakage) +- JSONL telemetry written to `~/.copilot/markers/harness-telemetry.jsonl` +- `SK_DRY_RUN=1` skips `fn` but still runs hooks (for testing the hook pipeline itself) +- `SK_DEBUG_TIMING=1` emits per-hook wall-clock timing to stderr + +### `harness/manifest.py` — Manifest Loader + +```python +@lru_cache(maxsize=1) +def load_manifest() -> dict: + """ + Load harness-manifest.json once and cache. + Thread-safe via lru_cache. + """ +``` + +### `harness/meta.py` — CommandMeta + +```python +@dataclass(frozen=True) +class CommandMeta: + name: str + description: str + tags: list[str] + hooks: list[str] +``` + +Frozen dataclass: fields cannot be reassigned, so it is safe for concurrent reads (the `list` fields remain mutable in place and make instances unhashable). + +### `harness.yaml` — Per-Project Config + +The universal harness configuration file. See [HARNESS.md](HARNESS.md) for full schema. +Key sections: + +```yaml +harness: + name: my-project + version: "1.0" + +environment: + type: python # auto-detected: python | node | rust | go | java | generic + setup: [] # setup commands to run before agent tasks + +commands: + test: python3 run_all_tests.py + lint: ruff check . + format_check: ruff format --check . + build: ~ + +success_criteria: + - id: tests-pass + command: python3 run_all_tests.py + required: true + - id: lint-clean + command: ruff check . + required: false + +ci: + enabled: false + provider: github-actions +``` + +--- + +## 6. Limitations + +### Non-Determinism Is Irreducible + +A harness does not eliminate non-determinism — it manages it. The same harness, same model, +same prompt will produce different outputs across runs. Strategies to manage this: + +- **Temperature=0**: reduces (but does not eliminate) variance for code generation tasks. +- **pass@k**: run the agent k times and accept if any run passes; report pass rate. +- **Majority vote**: for classification or structured outputs, take the modal answer across + k runs. 
+- **LLM-as-judge**: use a strong model (Opus, GPT-4) to evaluate quality dimensions that + cannot be expressed as binary tests. + +A harness makes these strategies tractable by providing reproducible inputs and structured +output capture. It does not make the agent deterministic. + +### API Cost + +Running agents in a harness costs API credits. Every `SK_HARNESS=1` invocation may +trigger pre/post hooks that themselves call APIs. Budget accordingly: + +- Use `SK_DRY_RUN=1` to test the harness pipeline without running the agent. +- Limit pass@k runs in development; use higher k only for benchmark comparisons. +- Cache model outputs where possible (openai/evals-style JSONL caching). + +### Dispatch Overhead + +The Python dispatch middleware adds approximately 50–100ms per `sk` command invocation. +This is negligible for interactive use and most automation. It becomes visible at scale +(e.g., running 1000 agent tasks in a loop). For high-frequency automation, consider: + +- Batching tasks and using a single harness wrapper rather than per-command hooks. +- Profiling with `SK_DEBUG_TIMING=1` to identify slow hooks. + +### Multi-Agent Race Conditions + +Concurrent harness runs against the same `.harness/` directory will race on: +- JSONL telemetry append (safe: append-only files are race-resistant on most filesystems) +- State files in `.harness/state/` (unsafe: use file locks or `O_CREAT | O_EXCL`) + +For multi-agent orchestration, use `sk tentacle` which owns coordination, locking, and +handoff sequencing. Do not attempt concurrent harness runs without tentacle coordination. + +### LLM-as-Judge Quality Ceiling + +When using an LLM to grade agent output quality (as opposed to binary test pass/fail), +the judge's quality ceiling is the judge model's capability. A weak judge gives noisy +grades. Best practices: + +- Use the strongest available model for judging. +- Define the rubric explicitly in the judge prompt. 
+- Cross-validate judge grades with human spot-checks when the rubric matters. +- Prefer binary/checkable success criteria over LLM judgment where possible. + +### Scope: Python Middleware Only + +`SK_HARNESS=1` enables the Python dispatch middleware in `harness/dispatch.py`. It +wraps `sk` commands that flow through the Python `sk.py` shim. It does **not** wrap: + +- The native Rust `sk` binary (compiled `sk watch`, `sk hooks run`, `sk index embed`, + `sk sync run`): these bypass the Python shim entirely. +- Direct script invocations: `python3 briefing.py` does not trigger harness hooks. +- Shell commands that are not `sk` subcommands. + +If your workflow relies on native Rust paths, harness telemetry will be incomplete. +Use the Python shim explicitly (`python3 sk.py <command>`) or accept partial coverage. + +### No Container Isolation + +`sk harness` does not spin up Docker containers. Agent tasks run in the current shell +environment, sharing the filesystem, environment variables, and process space. This is +intentional (lower overhead, simpler setup) but means: + +- Environment drift between runs is possible (e.g., a previous run modifies a file that + affects the next run). +- Parallel runs can interfere. +- Reproducibility depends on clean environment discipline, not container isolation. + +For full isolation (as in SWE-bench's three-layer Docker hierarchy), `sk harness` is not +the right tool. Use a dedicated eval framework with container support. + +### GitHub Actions–Only CI + +`harness.yaml` CI integration targets GitHub Actions (`sk harness init --ci`). Projects +hosted on other CI platforms (GitLab CI, CircleCI, Jenkins) will need to adapt the +generated workflow manually. There is currently no `--ci-provider` flag. + +--- + +## 7. Best Practices + +### Starting Out + +1. **Use `--skeleton-only` for new projects** before you have tests. Create the `.harness/` + directory structure now; add `success_criteria` once you have something to test. 
+ ```bash + sk harness init --skeleton-only + ``` + +2. **Define `success_criteria` before running agents**, not after. Defining criteria after + seeing agent output introduces confirmation bias. Write the criteria first, then run. + +3. **Start with `required: false`** for criteria you are not yet confident are stable. Promote + to `required: true` once you have verified them across multiple runs. + +### During Agent Work + +4. **Use `sk briefing` inside agent tasks** to inject past knowledge. The harness controls + task dispatch; `sk briefing` injects context that improves agent quality. + ```bash + sk briefing "my task description" --compact + ``` + +5. **Record learnings with `sk learn` after agent runs**. A harness measures outcomes; + `sk learn` preserves the understanding of why those outcomes happened. + ```bash + sk learn --pattern "Harness init pattern" "sk harness init --yes detects Python correctly" --tags "harness,python" + ``` + +6. **Use `SK_DRY_RUN=1` to test the harness itself** before running expensive agents. + Verify that hooks fire, context is injected, and telemetry is written. + ```bash + SK_HARNESS=1 SK_DRY_RUN=1 sk briefing "test" + ``` + +### Measurement and Review + +7. **Review harness telemetry weekly**. Telemetry accumulates in + `~/.copilot/markers/harness-telemetry.jsonl`. Review pass/fail rates, hook timing, + and recurring failures. + +8. **Gate merges on `required: true` success criteria**. Configure `sk harness init --ci` + to generate a GitHub Actions workflow that runs success criteria checks on every PR. + Only merge when required criteria pass. + +9. **Use `sk harness doctor`** to validate your harness configuration before a major agent + run. It checks that commands resolve, criteria are reachable, and environment setup + looks correct. + ```bash + sk harness doctor --json + ``` + +### Multi-Agent Work + +10. **Use `sk tentacle` for multi-agent orchestration**. 
Do not run concurrent harness + agents without tentacle coordination. The tentacle owns locking, scope, and handoff + sequencing. + +11. **One harness per project, not per agent**. Multiple agents working on the same project + share the same `harness.yaml`. This enforces consistent success criteria across all + agents. + +--- + +## 8. Further Reading + +### sk Documentation +- **[docs/HARNESS.md](HARNESS.md)** — API reference for `harness/dispatch.py`, + `harness/manifest.py`, `harness/meta.py`, and `harness.yaml` schema. +- **[docs/ARCHITECTURE.md](ARCHITECTURE.md)** — Full Python/Rust boundary table and + script inventory. +- **[docs/AGENT-RULES.md](AGENT-RULES.md)** — Mandatory agent rules, including Rule 9 + (claims require evidence) and Rule 3 (test after every change). + +### Benchmark Papers +- **SWE-bench** — arXiv 2310.06770 — Three-layer Docker hierarchy, TestSpec, Fail-To-Pass + + Pass-To-Pass dual metrics. The foundational agent evaluation benchmark. +- **AgentBench** — arXiv 2308.03688 — JSONL task definitions, multi-environment agent + evaluation (OS, DB, web, game environments). +- **CORE benchmark** — arXiv 2412.04524 — The source of the 36% harness performance gap + finding. Directly relevant to harness design choices. + +### Reference Implementations +- **openai/evals** — Registry + Eval + Solver pattern; TaskState interface; fresh Solver + per sample. Clean separation of task definition from solver implementation. +- **harness-boot (qwerfunch)** — `spec.yaml + harness.yaml + state.yaml`, idempotent init, + `.harness/` directory structure. Closest in spirit to `sk harness init`. +- **moai-adk** — Levels pattern (minimal/standard/thorough) with auto-detection based on + project complexity. Inspiration for `--skeleton-only` and progressive harness adoption. +- **Harness Protocol v1 (harness-kit)** — Plugins + mcp-servers + instructions in + `harness.yaml`. Future direction for `sk harness` plugin extensibility. 
+ +--- + +*This document describes the philosophy and rationale behind `sk harness`. For commands, +flags, and schema reference, see [docs/HARNESS.md](HARNESS.md).* diff --git a/docs/HARNESS.md b/docs/HARNESS.md index a1e4891d..48b9ef23 100644 --- a/docs/HARNESS.md +++ b/docs/HARNESS.md @@ -2,6 +2,8 @@ > Dispatch middleware for `sk` — pre/post hooks controlled by `SK_HARNESS=1`. > Source: `harness/dispatch.py`, `harness/meta.py`, `harness/__init__.py` +> +> **Philosophy & motivation:** See [HARNESS-PHILOSOPHY.md](./HARNESS-PHILOSOPHY.md) for why harness engineering matters, when to use it, and its limitations. --- diff --git a/harness-init.py b/harness-init.py new file mode 100644 index 00000000..25d488da --- /dev/null +++ b/harness-init.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 +"""sk harness init — Scaffold a universal project harness.""" + +import argparse +import json +import os +import sys + +if os.name == "nt": + sys.stdout.reconfigure(encoding="utf-8") + +HARNESS_YAML_TEMPLATE = """\ +# harness.yaml — Universal project harness config +# Generated by: sk harness init +# Docs: https://github.com/magicpro97/copilot-session-knowledge/blob/main/docs/HARNESS-PHILOSOPHY.md +version: "1.0" + +project: + name: "{name}" + type: "{type}" + description: "" + root: "." 
+ +commands: + test: "{test_cmd}" + lint: "{lint_cmd}" + +success_criteria: + - id: "tests-pass" + description: "All tests pass" + verify_cmd: "{test_cmd}" + required: true + - id: "lint-clean" + description: "No lint errors" + verify_cmd: "{lint_cmd}" + required: false + +reporting: + format: "jsonl" + output_dir: ".harness/reports/" + telemetry: true + +# Uncomment to enable SK_HARNESS dispatch middleware: +# harness_middleware: true +""" + +CI_YAML_TEMPLATE = """\ +# .github/workflows/harness-ci.yml +# Generated by: sk harness init +name: Harness CI + +on: + push: + branches: [main, develop] + pull_request: + branches: [main] + +jobs: + harness-gates: + name: Harness Quality Gates + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run tests + run: {test_cmd} + - name: Run lint + run: {lint_cmd} + continue-on-error: true +""" + +GITIGNORE_LINES = [ + "# Harness reports", + ".copilot/harness-reports/", + ".harness/reports/", +] + + +def _detect_node_pm(target_dir: str) -> str: + for lockfile, pm in [ + ("bun.lockb", "bun"), + ("bun.lock", "bun"), + ("pnpm-lock.yaml", "pnpm"), + ("yarn.lock", "yarn"), + ]: + if os.path.exists(os.path.join(target_dir, lockfile)): + return pm + return "npm" + + +def _detect_project(target_dir: str) -> dict: + markers = [ + ("pyproject.toml", "python-uv", "python3 -m pytest", "python3 -m ruff check ."), + ("requirements.txt", "python-pip", "python3 -m pytest", "python3 -m ruff check ."), + ("setup.py", "python-pip", "python3 -m pytest", "python3 -m ruff check ."), + ("Cargo.toml", "rust", "cargo test", "cargo clippy -- -D warnings"), + ("go.mod", "go", "go test ./...", "go vet ./..."), + ("pom.xml", "java-maven", "mvn test -q", "mvn checkstyle:check -q"), + ("build.gradle", "java-gradle", "./gradlew test -q", "./gradlew check -q"), + ] + + proj_type = "unknown" + test_cmd = "echo 'no tests configured'" + lint_cmd = "echo 'no lint configured'" + + for filename, ptype, tcmd, lcmd in markers: + if 
os.path.exists(os.path.join(target_dir, filename)): + proj_type = ptype + test_cmd = tcmd + lint_cmd = lcmd + break + + if os.path.exists(os.path.join(target_dir, "package.json")): + pm = _detect_node_pm(target_dir) + proj_type = "node" + test_cmd = f"{pm} test" + lint_cmd = f"{pm} run lint" + + return { + "type": proj_type, + "test_cmd": test_cmd, + "lint_cmd": lint_cmd, + "has_git": os.path.isdir(os.path.join(target_dir, ".git")), + "has_github": os.path.isdir(os.path.join(target_dir, ".github")), + "has_copilot": os.path.isdir(os.path.join(target_dir, ".copilot")), + "has_claude": ( + os.path.isdir(os.path.join(target_dir, ".claude")) or os.path.exists(os.path.join(target_dir, "CLAUDE.md")) + ), + } + + +def _write_file(path: str, content: str) -> None: + try: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + f.write(content) + except PermissionError as exc: + print(f"✗ Permission denied writing {path}: {exc}") + print(" Try: sudo sk harness init") + raise + + +def _create_harness_dirs(target_dir: str) -> None: + for subdir in ["", "tasks", "reports"]: + dirpath = os.path.join(target_dir, ".harness", subdir) if subdir else os.path.join(target_dir, ".harness") + os.makedirs(dirpath, exist_ok=True) + gitkeep = os.path.join(dirpath, ".gitkeep") + if not os.path.exists(gitkeep): + open(gitkeep, "w").close() + + +def _update_gitignore(target_dir: str) -> None: + gitignore_path = os.path.join(target_dir, ".gitignore") + existing = "" + if os.path.exists(gitignore_path): + try: + with open(gitignore_path, encoding="utf-8") as f: + existing = f.read() + except OSError: + pass + + missing = [ln for ln in GITIGNORE_LINES if ln not in existing] + if not missing: + return + + try: + with open(gitignore_path, "a", encoding="utf-8") as f: + if existing and not existing.endswith("\n"): + f.write("\n") + f.write("\n".join(missing) + "\n") + except PermissionError as exc: + print(f" Warning: could not update .gitignore: {exc}") 
+ + +def _confirm(prompt: str, default: bool = True) -> bool: + hint = "[Y/n]" if default else "[y/N]" + try: + ans = input(f"{prompt} {hint}: ").strip().lower() + except (EOFError, KeyboardInterrupt): + return default + if ans == "": + return default + return ans in ("y", "yes") + + +def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog="sk harness init", + description="Scaffold a universal project harness", + ) + parser.add_argument("--target", default=None, help="Target directory (default: cwd)") + parser.add_argument("--name", default=None, help="Project name (default: directory basename)") + parser.add_argument("--skeleton-only", action="store_true", help="Create .harness/ dirs only") + parser.add_argument("--ci", action="store_true", help="Create GitHub Actions CI workflow") + parser.add_argument("--no-ci", action="store_true", help="Skip CI workflow creation") + parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive (accept defaults)") + parser.add_argument("--force", action="store_true", help="Overwrite existing harness.yaml") + parser.add_argument("--json", action="store_true", dest="output_json", help="Output result as JSON") + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = _parse_args(argv) + + target = os.path.abspath(args.target or os.getcwd()) + if not os.path.isdir(target): + print(f"✗ Target directory does not exist: {target}") + return 1 + + name = args.name or os.path.basename(target) + info = _detect_project(target) + + harness_yaml_path = os.path.join(target, "harness.yaml") + if os.path.exists(harness_yaml_path) and not args.force and not args.skeleton_only: + if args.output_json: + print(json.dumps({"status": "error", "error": "harness.yaml already exists", "path": harness_yaml_path})) + else: + print(f"✗ harness.yaml already exists at {harness_yaml_path}") + print(" Use --force to overwrite.") + return 1 + + if not 
args.yes: + print(f"Detected project type: {info['type']}") + print(f" test_cmd: {info['test_cmd']}") + print(f" lint_cmd: {info['lint_cmd']}") + if not _confirm(f"Initialize harness in {target}?", default=True): + print("Aborted.") + return 0 + + created: list[str] = [] + + # Skeleton directories + try: + _create_harness_dirs(target) + created.append(".harness/") + except OSError as exc: + print(f"✗ Failed to create .harness/ dirs: {exc}") + return 1 + + if args.skeleton_only: + if args.output_json: + print(json.dumps({"status": "ok", "target": target, "created": [".harness/"], "project": info})) + else: + print("✓ .harness/ directory initialized (skeleton-only)") + return 0 + + # harness.yaml + yaml_content = HARNESS_YAML_TEMPLATE.format( + name=name, + type=info["type"], + test_cmd=info["test_cmd"], + lint_cmd=info["lint_cmd"], + ) + try: + _write_file(harness_yaml_path, yaml_content) + created.append("harness.yaml") + except OSError as exc: + print(f"✗ Failed to write harness.yaml: {exc}") + return 1 + + # CI workflow + create_ci = False + if args.ci: + create_ci = True + elif not args.no_ci: + show_ci_prompt = info["has_github"] or args.ci + if show_ci_prompt and not args.yes: + create_ci = _confirm("Create .github/workflows/harness-ci.yml?", default=True) + elif args.yes and info["has_github"]: + create_ci = True + + if create_ci: + ci_path = os.path.join(target, ".github", "workflows", "harness-ci.yml") + ci_content = CI_YAML_TEMPLATE.format( + test_cmd=info["test_cmd"], + lint_cmd=info["lint_cmd"], + ) + try: + _write_file(ci_path, ci_content) + created.append(".github/workflows/harness-ci.yml") + except OSError as exc: + print(f" Warning: could not write CI workflow: {exc}") + + # .gitignore + _update_gitignore(target) + + if args.output_json: + print(json.dumps({"status": "ok", "target": target, "created": created, "project": info})) + return 0 + + print(f"✓ harness.yaml created at {harness_yaml_path}") + print("✓ .harness/ directory initialized") + if 
".github/workflows/harness-ci.yml" in created: + print("✓ .github/workflows/harness-ci.yml created") + print() + print("Next steps:") + print(" export SK_HARNESS=1 # Enable dispatch middleware") + print(" sk harness doctor # Verify setup") + print(" sk harness check # Verify all scripts exist") + print() + print("Docs: docs/HARNESS-PHILOSOPHY.md") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/harness-manifest.json b/harness-manifest.json index 5c03395c..870b9518 100644 --- a/harness-manifest.json +++ b/harness-manifest.json @@ -656,6 +656,57 @@ "events" ], "group": "events" + }, + "harness init": { + "script": "harness-init.py", + "description": "Scaffold universal project harness (harness.yaml + .harness/ + CI workflow)", + "tags": [ + "harness", + "setup", + "init", + "scaffold" + ], + "group": "harness" + }, + "harness show": { + "script": "sk.py", + "description": "List all registered commands with metadata", + "tags": [ + "harness", + "show", + "list" + ], + "group": "harness" + }, + "harness check": { + "script": "sk.py", + "description": "Verify all registered scripts exist on disk", + "tags": [ + "harness", + "check", + "verify" + ], + "group": "harness" + }, + "harness doctor": { + "script": "sk.py", + "description": "Comprehensive self-check (scripts, DB, hooks, Python)", + "tags": [ + "harness", + "doctor", + "health" + ], + "group": "harness" + }, + "harness config": { + "script": "sk.py", + "description": "Manage harness env vars (SK_HARNESS, SK_DRY_RUN, SK_DEBUG_TIMING, SK_TOOLS_DIR)", + "tags": [ + "harness", + "config", + "env" + ], + "group": "harness" } } } diff --git a/install.py b/install.py index d4243570..9df0f661 100644 --- a/install.py +++ b/install.py @@ -236,6 +236,7 @@ def _register_project(project_root: Path) -> None: "preset-manager.py", "install.py", "statusline.py", + "harness-init.py", ] SUPPORT_FILES = [ diff --git a/sk.py b/sk.py index e7419f66..b2b188b8 100644 --- a/sk.py +++ b/sk.py @@ -870,6 +870,8 
# ---------------------------------------------------------------------------
# === I731: sk harness init ===
# Tests for harness-init.py: project detection, harness.yaml generation, idempotency, skeleton-only
#
# Every result goes through _t731 so that, if the section aborts with an
# exception, only the checks that never ran are reported as failed. Checks
# that already recorded a result are not reported a second time.
_I731_IDS = (
    "1", "2", "3", "4", "5",
    "6a", "6b", "6c", "6d",
    "7", "8",
    "9a", "9b", "9c",
    "10a", "10b",
    "11a", "11b",
    "12",
)
_i731_done = set()


def _t731(sfx, label, ok, detail):
    """Record check I731-<sfx> via test() and remember that it was reported."""
    _i731_done.add(sfx)
    test(f"I731-{sfx}: {label}", ok, detail)


def _touch731(directory, filename):
    """Create an empty marker file without leaking the file handle."""
    with open(os.path.join(directory, filename), "w", encoding="utf-8"):
        pass


try:
    import importlib.util as _iutil731
    import json as _json731
    import tempfile as _tmpmod731

    _hi731_spec = _iutil731.spec_from_file_location("harness_init", REPO / "harness-init.py")
    _hi731 = _iutil731.module_from_spec(_hi731_spec)
    _hi731_spec.loader.exec_module(_hi731)

    # I731-1: reaching this line means the module loaded without raising
    _t731("1", "harness-init.py importable", True, "imported OK")

    # I731-2: project detection — python-uv
    with _tmpmod731.TemporaryDirectory() as _d731a:
        _touch731(_d731a, "pyproject.toml")
        _info731 = _hi731._detect_project(_d731a)
        _t731("2", "detect python-uv from pyproject.toml", _info731["type"] == "python-uv", str(_info731))

    # I731-3: project detection — unknown
    with _tmpmod731.TemporaryDirectory() as _d731b:
        _info731b = _hi731._detect_project(_d731b)
        _t731("3", "detect unknown when no markers", _info731b["type"] == "unknown", str(_info731b))

    # I731-4: project detection — node (package.json)
    with _tmpmod731.TemporaryDirectory() as _d731c:
        _touch731(_d731c, "package.json")
        _info731c = _hi731._detect_project(_d731c)
        _t731("4", "detect node from package.json", _info731c["type"] == "node", str(_info731c))

    # I731-5: project detection — rust (Cargo.toml)
    with _tmpmod731.TemporaryDirectory() as _d731d:
        _touch731(_d731d, "Cargo.toml")
        _info731d = _hi731._detect_project(_d731d)
        _t731("5", "detect rust from Cargo.toml", _info731d["type"] == "rust", str(_info731d))

    # I731-6: harness-init creates harness.yaml with correct type
    with _tmpmod731.TemporaryDirectory() as _d731e:
        _touch731(_d731e, "pyproject.toml")
        _rc731 = _hi731.main(["--target", _d731e, "--yes", "--no-ci"])
        _hy731_path = os.path.join(_d731e, "harness.yaml")
        _hy731_exists = os.path.exists(_hy731_path)
        _hy731_content = ""
        if _hy731_exists:
            with open(_hy731_path, encoding="utf-8") as _hy731_fh:
                _hy731_content = _hy731_fh.read()
        _t731("6a", "harness init exits 0", _rc731 == 0, f"rc={_rc731}")
        _t731("6b", "harness.yaml created", _hy731_exists, "harness.yaml not found")
        _t731("6c", "harness.yaml has python-uv type", "python-uv" in _hy731_content, _hy731_content[:200])
        _t731("6d", "harness.yaml has pytest test_cmd", "pytest" in _hy731_content, _hy731_content[:200])

    # I731-7: idempotency — second run without --force fails with rc=1
    with _tmpmod731.TemporaryDirectory() as _d731f:
        _touch731(_d731f, "pyproject.toml")
        _hi731.main(["--target", _d731f, "--yes", "--no-ci"])
        _rc731_idem = _hi731.main(["--target", _d731f, "--yes", "--no-ci"])
        _t731("7", "second init without --force returns 1", _rc731_idem == 1, f"rc={_rc731_idem}")

    # I731-8: --force overwrites existing harness.yaml
    with _tmpmod731.TemporaryDirectory() as _d731g:
        _touch731(_d731g, "pyproject.toml")
        _hi731.main(["--target", _d731g, "--yes", "--no-ci"])
        _rc731_force = _hi731.main(["--target", _d731g, "--yes", "--no-ci", "--force"])
        _t731("8", "--force overwrite exits 0", _rc731_force == 0, f"rc={_rc731_force}")

    # I731-9: --skeleton-only skips harness.yaml
    with _tmpmod731.TemporaryDirectory() as _d731h:
        _rc731_skel = _hi731.main(["--target", _d731h, "--skeleton-only", "--yes"])
        _hy731h_missing = not os.path.exists(os.path.join(_d731h, "harness.yaml"))
        _harness_dir_731 = os.path.isdir(os.path.join(_d731h, ".harness"))
        _t731("9a", "skeleton-only exits 0", _rc731_skel == 0, f"rc={_rc731_skel}")
        _t731("9b", "skeleton-only no harness.yaml", _hy731h_missing, "harness.yaml should not exist")
        _t731("9c", "skeleton-only creates .harness/", _harness_dir_731, ".harness/ not created")

    # I731-10: .harness/ subdirs exist
    with _tmpmod731.TemporaryDirectory() as _d731i:
        _hi731.main(["--target", _d731i, "--yes", "--no-ci"])
        _t731("10a", ".harness/tasks/ exists", os.path.isdir(os.path.join(_d731i, ".harness", "tasks")), "missing")
        _t731("10b", ".harness/reports/ exists", os.path.isdir(os.path.join(_d731i, ".harness", "reports")), "missing")

    # I731-11: manifest has harness group
    with open(REPO / "harness-manifest.json", encoding="utf-8") as _mf731:
        _manifest731 = _json731.load(_mf731)
    _harness_cmds731 = [k for k, v in _manifest731["commands"].items() if v.get("group") == "harness"]
    _t731("11a", "manifest has harness group entries", len(_harness_cmds731) > 0, str(_harness_cmds731))
    _t731("11b", "manifest has harness init entry", "harness init" in _manifest731["commands"], str(_harness_cmds731))

    # I731-12: sk.py routes init to harness-init.py.
    # Match the dispatch call itself: testing for "init" alongside
    # "harness-init.py" is vacuous because the filename already contains it.
    _sk731_src = (REPO / "sk.py").read_text(encoding="utf-8")
    _t731("12", "sk.py routes harness init", '_run("harness-init.py"' in _sk731_src, "routing not found")

except Exception as _e731:
    # Report only the checks that never got to run.
    for _sfx in _I731_IDS:
        if _sfx not in _i731_done:
            test(f"I731-{_sfx}: harness init", False, str(_e731))