Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions agent_fox/_templates/profiles/verifier.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,35 @@ Treat this file as executable workflow policy.
- Reference specific requirement IDs in your assessment.
- Output bare JSON only — no markdown fences, no surrounding prose.

## Verification Checklist

Your context includes a **Verification Checklist** section with two tables:

1. **Task Completion Audit** — every subtask checkbox from tasks.md with its
current state. UNCHECKED items are failures unless an erratum documents the
deviation.
2. **Requirement-to-Test Coverage** — maps each requirement ID to test files
that reference it. UNCOVERED requirements are critical findings.

Use this checklist as your primary verification structure. Walk through every
row and confirm or reject each item.

### Hard gates

- If any subtask is **UNCHECKED** and no erratum covers it → **FAIL** verdict
for the corresponding requirement.
- If any requirement is **UNCOVERED** (no test references it) → **FAIL**
verdict for that requirement.
- SKIPPED subtasks (marked `[-]` or `[~]`) are intentional and do not trigger
failure.

## Focus Areas

- **Requirements coverage:** For each requirement in scope, confirm it is
implemented and matches the acceptance criteria, including edge cases.
Cross-reference the Requirement-to-Test Coverage table.
- **Task completion:** Verify every subtask checkbox is checked or intentionally
skipped (`[-]` or `[~]`). For unchecked items, check whether an erratum in
`docs/errata/` or the errata DB table documents the deviation.
- **Test execution:** Run spec tests for the task group first, then the full
suite to check for regressions.
- **Code quality:** Does the implementation follow the design document's
Expand Down
1 change: 1 addition & 0 deletions agent_fox/engine/session_lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ def _build_prompts(
memory_facts=memory_facts,
conn=self._knowledge_db.connection,
project_root=Path.cwd(),
archetype=self._archetype,
)

system_prompt = build_system_prompt(
Expand Down
23 changes: 23 additions & 0 deletions agent_fox/session/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ def assemble_context(
*,
conn: duckdb.DuckDBPyConnection,
project_root: Path | None = None,
archetype: str | None = None,
) -> str:
"""Assemble task-specific context for a coding session.

Expand All @@ -300,6 +301,9 @@ def assemble_context(
.specs/steering.md after spec files and before memory facts
(64-REQ-2.1, 64-REQ-2.2).

When archetype is ``"verifier"``, appends a structured verification
checklist (task completion audit + requirement-to-test coverage).

Returns a formatted string with section headers.

Logs a warning for any missing spec file but does not raise.
Expand Down Expand Up @@ -392,6 +396,25 @@ def assemble_context(
task_group,
)

# Verification checklist for the verifier archetype
if archetype == "verifier":
try:
from agent_fox.spec.verification_checklist import (
build_verification_checklist,
render_checklist_markdown,
)

tests_dir = project_root / "tests" if project_root is not None else None
checklist = build_verification_checklist(spec_dir, conn, tests_dir=tests_dir)
checklist_md = render_checklist_markdown(checklist)
sections.append(checklist_md)
except Exception:
logger.warning(
"Failed to build verification checklist for %s",
spec_dir.name,
exc_info=True,
)

# 03-REQ-4.3: Return formatted string with section headers
return "\n\n---\n\n".join(sections)

Expand Down
285 changes: 285 additions & 0 deletions agent_fox/spec/verification_checklist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
"""Verification checklist builder for the verifier archetype.

Builds a structured checklist from tasks.md checkboxes, requirements.md
acceptance criteria, and errata — injected into the verifier's session
context so it can enforce task completion and requirement coverage.
"""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from agent_fox.spec._patterns import REQ_ID_BARE
from agent_fox.spec.parser import parse_tasks

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class SubtaskAuditEntry:
    """Audit entry for a single subtask checkbox.

    One entry is created per subtask returned by the tasks.md parser; the
    markdown renderer turns each entry into one row of the task audit table.
    """

    # Number of the task group that owns the subtask.
    group_number: int
    # Subtask identifier as reported by the task parser.
    subtask_id: str
    # Subtask title, shown verbatim in the audit table.
    title: str
    # Completion flag as reported by the task parser.
    checked: bool
    # When true the renderer reports SKIPPED, regardless of ``checked``.
    skipped: bool  # [-] or [~] markers


@dataclass(frozen=True)
class RequirementMapping:
    """Maps a requirement ID to its test coverage status."""

    # Requirement identifier as extracted from requirements.md.
    requirement_id: str
    # True when at least one scanned test file references the requirement.
    covered: bool
    # Test files that reference the requirement; empty when uncovered.
    # NOTE: ``frozen=True`` only blocks attribute reassignment -- the list
    # object itself remains mutable.
    test_files: list[str] = field(default_factory=list)


@dataclass(frozen=True)
class VerificationChecklist:
    """Complete verification checklist for a spec.

    Aggregates the task audit, the requirement coverage mapping, and the
    errata flag that the markdown renderer consumes.
    """

    # Name of the spec directory the checklist was built from.
    spec_name: str
    # One entry per subtask found in tasks.md.
    task_audit: list[SubtaskAuditEntry]
    # One mapping per requirement ID found in requirements.md.
    requirement_coverage: list[RequirementMapping]
    # True when the errata table holds at least one row for this spec.
    has_errata: bool


def build_verification_checklist(
    spec_dir: Path,
    conn: Any,
    *,
    tests_dir: Path | None = None,
) -> VerificationChecklist:
    """Assemble the verification checklist for one spec.

    Combines three independent sources: the subtask checkbox audit from
    tasks.md, the errata lookup in the database, and the scan that maps
    requirement IDs to test files.

    Args:
        spec_dir: Path to the spec directory (e.g. .agent-fox/specs/10_my_spec).
            Its final path component is used as the spec name.
        conn: DuckDB connection for errata queries.
        tests_dir: Path to the tests directory for requirement-to-test scanning.

    Returns:
        A populated VerificationChecklist.
    """
    name = spec_dir.name
    # Keyword arguments are evaluated in the order written, so the three
    # collectors still run as: task audit, errata lookup, coverage scan.
    return VerificationChecklist(
        spec_name=name,
        task_audit=_audit_task_checkboxes(spec_dir),
        has_errata=_check_errata_exist(conn, name),
        requirement_coverage=scan_requirement_test_coverage(spec_dir, tests_dir),
    )


def _audit_task_checkboxes(spec_dir: Path) -> list[SubtaskAuditEntry]:
    """Parse tasks.md and audit every subtask checkbox state.

    Args:
        spec_dir: Spec directory expected to contain ``tasks.md``.

    Returns:
        One SubtaskAuditEntry per subtask, in parser order. The list is empty
        when tasks.md is missing or cannot be parsed.
    """
    tasks_path = spec_dir / "tasks.md"
    if not tasks_path.is_file():
        return []

    try:
        groups = parse_tasks(tasks_path)
    except Exception:
        # Best effort: a broken tasks.md must not abort checklist assembly,
        # but keep the traceback so the parse failure can be diagnosed.
        logger.warning(
            "Failed to parse tasks.md for checklist audit in %s",
            spec_dir,
            exc_info=True,
        )
        return []

    # Collect every skip-marked subtask id in a single pass over the file
    # instead of re-reading tasks.md once per subtask.
    skip_matches = map(
        _SUBTASK_SKIP_PATTERN.match,
        tasks_path.read_text(encoding="utf-8").splitlines(),
    )
    skipped_ids = {m.group(2) for m in skip_matches if m}

    return [
        SubtaskAuditEntry(
            group_number=group.number,
            subtask_id=subtask.id,
            title=subtask.title,
            checked=subtask.completed,
            skipped=subtask.id in skipped_ids,
        )
        for group in groups
        for subtask in group.subtasks
    ]


# Matches an indented list item whose checkbox is "[-]" or "[~]".
# Group 1 is the marker character; group 2 is the subtask id that follows it
# ("<digits>.<digits>" or "<digits>.V").
_SUBTASK_SKIP_PATTERN = re.compile(r"^\s+- \[([~\-])\] (\d+\.(?:\d+|V))")


def _is_subtask_skipped(tasks_path: Path, subtask_id: str) -> bool:
    """Report whether *subtask_id* carries a [-] or [~] marker in tasks.md.

    Args:
        tasks_path: Path to the tasks.md file to scan.
        subtask_id: Subtask identifier to look for.

    Returns:
        True when a line matching the skip pattern names this subtask.
    """
    content = tasks_path.read_text(encoding="utf-8")
    hits = (_SUBTASK_SKIP_PATTERN.match(row) for row in content.splitlines())
    return any(hit is not None and hit.group(2) == subtask_id for hit in hits)


def _check_errata_exist(conn: Any, spec_name: str) -> bool:
    """Tell whether the errata table holds at least one row for *spec_name*.

    Args:
        conn: Database connection exposing ``execute(sql, params)``.
        spec_name: Spec name to match against the ``spec_name`` column.

    Returns:
        True when the count query reports one or more rows. Any exception
        raised by the query is logged at debug level and reported as False.
    """
    try:
        cursor = conn.execute(
            "SELECT COUNT(*) FROM errata WHERE spec_name = ?",
            [spec_name],
        )
        result = cursor.fetchone()
        if result is None:
            return False
        return result[0] > 0
    except Exception:
        logger.debug("Could not query errata for %s", spec_name)
        return False


def scan_requirement_test_coverage(
    spec_dir: Path,
    tests_dir: Path | None = None,
) -> list[RequirementMapping]:
    """Build the requirement-to-test-file coverage mapping for a spec.

    Requirement IDs are extracted from requirements.md, de-duplicated and
    sorted; each one is then looked up in the loaded test file contents.

    Args:
        spec_dir: Path to the spec directory containing requirements.md.
        tests_dir: Path to the project's tests directory. If None or
            non-existent, all requirements are marked uncovered.

    Returns:
        List of RequirementMapping, one per requirement ID. Empty when
        requirements.md is missing or contains no requirement IDs.
    """
    requirements_file = spec_dir / "requirements.md"
    if not requirements_file.is_file():
        return []

    unique_ids = set(REQ_ID_BARE.findall(requirements_file.read_text(encoding="utf-8")))
    if not unique_ids:
        return []

    contents_by_file = _load_test_file_contents(tests_dir)
    # Dicts preserve insertion order, so the result stays sorted by ID.
    hits_by_id = {
        rid: _find_test_files_for_req(rid, contents_by_file)
        for rid in sorted(unique_ids)
    }
    return [
        RequirementMapping(requirement_id=rid, covered=bool(hits), test_files=hits)
        for rid, hits in hits_by_id.items()
    ]


def _load_test_file_contents(tests_dir: Path | None) -> dict[str, str]:
    """Load all test file contents into a dict keyed by relative path.

    Keys are POSIX-style paths relative to *tests_dir*, so that test files
    sharing a base name in different sub-directories do not overwrite each
    other.

    Args:
        tests_dir: Root directory searched recursively for ``test_*.py``.

    Returns:
        Mapping of relative path to file text. Empty when *tests_dir* is None
        or not a directory. Files that cannot be read or decoded as UTF-8 are
        skipped.
    """
    if tests_dir is None or not tests_dir.is_dir():
        return {}
    contents: dict[str, str] = {}
    for test_file in tests_dir.rglob("test_*.py"):
        try:
            text = test_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # UnicodeDecodeError is not an OSError; skip the single bad file
            # rather than letting it abort the whole coverage scan.
            continue
        contents[test_file.relative_to(tests_dir).as_posix()] = text
    return contents


def _normalize_req_id_for_funcname(req_id: str) -> str:
    """Turn a requirement ID into its function-name form.

    A leading ``<digits>-REQ-`` prefix becomes ``req_<digits>_``; every
    remaining dot or hyphen becomes an underscore and the result is
    lower-cased, e.g. '10-REQ-1.1' -> 'req_10_1_1'.
    """
    separators_to_underscore = str.maketrans(".-", "__")
    prefixed = re.sub(r"^(\d+)-REQ-", r"req_\1_", req_id)
    return prefixed.translate(separators_to_underscore).lower()


def _find_test_files_for_req(
    req_id: str,
    test_content: dict[str, str],
) -> list[str]:
    """Find test files that reference a requirement ID.

    A file counts as a reference when it contains either the literal ID or
    its function-name form, and the match is not directly adjacent to another
    digit. The digit guards stop '10-REQ-1.1' from being reported as covered
    by a file that only mentions '10-REQ-1.10' or '110-REQ-1.1'.

    Args:
        req_id: Requirement identifier, e.g. '10-REQ-1.1'.
        test_content: Mapping of test file key to file text.

    Returns:
        Sorted list of the keys whose text references the requirement.
    """
    normalized = _normalize_req_id_for_funcname(req_id)
    reference = re.compile(
        rf"(?<!\d)(?:{re.escape(req_id)}|{re.escape(normalized)})(?!\d)"
    )
    return sorted(
        filename
        for filename, content in test_content.items()
        if reference.search(content)
    )


def _escape_table_cell(text: str) -> str:
    """Escape pipe characters so *text* cannot split a markdown table row."""
    return str(text).replace("|", "\\|")


def _task_audit_lines(task_audit: list[SubtaskAuditEntry]) -> list[str]:
    """Return the body lines of the task completion audit section."""
    if not task_audit:
        return ["No tasks found."]

    lines = [
        "| Group | Subtask | Title | Status |",
        "|-------|---------|-------|--------|",
    ]
    unchecked_count = 0
    for entry in task_audit:
        # A skip marker wins over the checked flag.
        if entry.skipped:
            status = "SKIPPED"
        elif entry.checked:
            status = "DONE"
        else:
            status = "**UNCHECKED**"
            unchecked_count += 1
        title = _escape_table_cell(entry.title)
        lines.append(f"| {entry.group_number} | {entry.subtask_id} | {title} | {status} |")
    lines.append("")
    if unchecked_count:
        lines.append(
            f"**{unchecked_count} unchecked subtask(s).** Each must be completed or documented in an erratum."
        )
    else:
        lines.append("All subtasks completed or intentionally skipped.")
    return lines


def _requirement_coverage_lines(coverage: list[RequirementMapping]) -> list[str]:
    """Return the body lines of the requirement-to-test coverage section."""
    if not coverage:
        return ["No requirements found to map."]

    lines = [
        "| Requirement | Status | Test Files |",
        "|-------------|--------|------------|",
    ]
    uncovered_count = 0
    for mapping in coverage:
        if mapping.covered:
            status = "COVERED"
            files = ", ".join(mapping.test_files)
        else:
            status = "**UNCOVERED**"
            files = "-"
            uncovered_count += 1
        lines.append(f"| {mapping.requirement_id} | {status} | {files} |")
    lines.append("")
    if uncovered_count:
        lines.append(
            f"**{uncovered_count} requirement(s) without test coverage.** "
            f"Each uncovered requirement is a critical finding."
        )
    else:
        lines.append("All requirements have test coverage.")
    return lines


def render_checklist_markdown(checklist: VerificationChecklist) -> str:
    """Render a verification checklist as markdown for context injection.

    The output has four parts, in order: a header naming the spec, the task
    completion audit table, the requirement-to-test coverage table, and a
    fixed list of enforcement rules. An errata notice is inserted between the
    two tables when the checklist reports errata. Pipe characters in subtask
    titles are escaped so a title cannot break its table row.

    Args:
        checklist: The checklist to render.

    Returns:
        The markdown text, with lines joined by newline characters.
    """
    lines = [
        "## Verification Checklist",
        "",
        f"Spec: `{checklist.spec_name}`",
        "",
    ]

    # Task completion audit
    lines += ["### Task Completion Audit", ""]
    lines += _task_audit_lines(checklist.task_audit)
    lines.append("")

    # Errata notice
    if checklist.has_errata:
        lines.append(
            "**Note:** Errata exist for this spec. Check `docs/errata/` "
            "and the errata DB table for documented deviations."
        )
        lines.append("")

    # Requirement-to-test coverage
    lines += ["### Requirement-to-Test Coverage", ""]
    lines += _requirement_coverage_lines(checklist.requirement_coverage)
    lines.append("")

    # Enforcement rules
    lines += [
        "### Enforcement Rules",
        "",
        "- Any **UNCHECKED** subtask without a corresponding erratum → FAIL verdict.",
        "- Any **UNCOVERED** requirement without test coverage → FAIL verdict.",
        "- Errata document intentional deviations — verify they are legitimate.",
    ]

    return "\n".join(lines)
9 changes: 9 additions & 0 deletions docs/memory.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

_3176 facts | last updated: 2026-04-23_

**2026-04-23 verifier checklist enforcement (issue #521):** Added
`spec/verification_checklist.py` — builds a structured verification
checklist from tasks.md checkboxes, requirements.md acceptance criteria,
and errata. Injected into verifier context via `assemble_context()`
(new `archetype` parameter). Verifier profile updated with hard gates:
unchecked subtasks without errata → FAIL, uncovered requirements → FAIL.
Requirement-to-test mapping scans test files for req ID references
(string match + normalized function name match). +22 tests (4364 total pass).

**2026-04-23 simplification pass 3:** Inlined core/llm_validation.py into
session/review_parser.py (single consumer), core/retry.py into core/client.py
(single consumer), merged nightshift/fix_types.py into nightshift/fix_pipeline.py,
Expand Down
Loading