From fe715fa5dce7f6b50a1832a48c75cc54d32083bd Mon Sep 17 00:00:00 2001 From: imer Date: Fri, 15 May 2026 16:54:57 +0200 Subject: [PATCH] feat(code-audit): add PR auditor as second code-audit slice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `openworkers audit pr ` extracts atomic claims from a PR description and verdicts each against the actual unified diff. Same pipeline shape as the README auditor (planner → deterministic researcher → checker + trust gate → critic), proving that the `SourceAdapter` abstraction generalises to a second backend. New modules: - core/sources/github.py — GitHubAdapter over a unified diff via a PrSpec value object; live fetch (fetch_pr_from_github) is a sibling helper, decoupled from the adapter so tests stay network-free. parse_pr_url + load_pr_fixture as supporting helpers. - core/orchestrator/pr_flow.py — PrAuditOrchestrator. - core/orchestrator/audit_prompts.py — shared audit-prompt loader (extracted from readme_flow.py so each new auditor registers its templates in one place). - prompts/code_audit/pr_planner.md + pr_checker.md — PR-specific claim types (add | remove | fix | refactor | test | behavior | doc | other) and diff-aware verdict rules. Schema: - core/schemas_audit.py exposes AuditClaim / AuditClaimList as aliases of ReadmeClaim / ReadmeClaimList so PR code reads cleanly without churning the README slice. AuditReport.target captures the audited artefact id (PR URL, etc.). Agents: - providers/code_audit_agents.py adds PrPlannerAgent + PrCheckerAgent alongside the README versions. The same _enforce_trust_gate runs after the LLM responds, so any claim with no diff evidence is forced to `unsupported` regardless of LLM output. AuditCriticAgent is reused as-is. CLI: - `openworkers audit pr ` (or `--fixture ` for offline runs) with optional `--token` falling back to GITHUB_TOKEN / GH_TOKEN. Tests: - tests/fixtures/sample_pr/ — canned PR JSON + unified diff with one verified, one drifted, one contradicted, one fabricated (no-evidence) claim. - tests/code_audit/test_pr_flow.py — URL parser, fixture loader, adapter grep, end-to-end audit with verdict distribution, and an explicit trust-gate-override assertion for the hallucinated WIDGETLIB_DEBUG claim. Docs: README.md, ROADMAP.md, CHANGELOG.md, AGENTS.md updated with the new slice, shared pipeline section, and PR-audit-flow walkthrough. Verification: 159/159 tests pass (153 existing + 6 new PR tests), mypy strict clean on new modules, ruff clean on new files, black formatted. CLI smoke-runs in text and JSON modes against the fixture. Co-Authored-By: Claude Opus 4.7 --- AGENTS.md | 38 +++- CHANGELOG.md | 1 + README.md | 8 +- ROADMAP.md | 2 +- apps/cli/main.py | 49 +++++ core/orchestrator/audit_prompts.py | 45 ++++ core/orchestrator/pr_flow.py | 206 ++++++++++++++++++ core/orchestrator/readme_flow.py | 36 +--- core/schemas_audit.py | 13 ++ core/sources/__init__.py | 18 +- core/sources/github.py | 318 ++++++++++++++++++++++++++++ prompts/code_audit/pr_checker.md | 53 +++++ prompts/code_audit/pr_planner.md | 52 +++++ providers/code_audit_agents.py | 119 +++++++++++ tests/code_audit/test_pr_flow.py | 279 ++++++++++++++++++++++++ tests/fixtures/sample_pr/diff.patch | 43 ++++ tests/fixtures/sample_pr/pr.json | 16 ++ 17 files changed, 1249 insertions(+), 47 deletions(-) create mode 100644 core/orchestrator/audit_prompts.py create mode 100644 core/orchestrator/pr_flow.py create mode 100644 core/sources/github.py create mode 100644 prompts/code_audit/pr_checker.md create mode 100644 prompts/code_audit/pr_planner.md create mode 100644 tests/code_audit/test_pr_flow.py create mode 100644 tests/fixtures/sample_pr/diff.patch create mode 100644 tests/fixtures/sample_pr/pr.json diff --git a/AGENTS.md b/AGENTS.md index e7eb004..159836c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -18,24 +18,29 @@ core/ blackboard/ # Redis-backed shared state (thesis-only for now) orchestrator/ thesis_flow.py # ThesisOrchestrator — legacy, do not break - readme_flow.py # ReadmeAuditOrchestrator — new code-audit slice + readme_flow.py # ReadmeAuditOrchestrator — README slice + pr_flow.py # PrAuditOrchestrator — PR slice + audit_prompts.py # Shared template loader for all audit prompts compiler.py # PromptCompiler for thesis (blackboard → prompt vars) router/ # Provider tier routing (quality/balanced/cheap) memory/episodic.py # Qdrant episodic memory (thesis) schemas.py # Thesis Pydantic models - schemas_audit.py # Code-audit Pydantic models (kept separate on purpose) + schemas_audit.py # Code-audit Pydantic models (AuditClaim aliases ReadmeClaim) sources/ base.py # SourceAdapter ABC — the new evidence-backend contract local_repo.py # LocalRepoAdapter — grep over a local repo + github.py # GitHubAdapter (over PR diff) + PrSpec + live fetcher providers/ unified.py # UnifiedLLM: provider fallback, breakers, DRY_RUN path thesis_agents.py # Thesis agent suite — untouched, keep passing - code_audit_agents.py # README planner, checker, critic + trust gate + code_audit_agents.py # README + PR planners/checkers + trust gate + critic budget.py # BudgetGuard (contextvars-scoped session ceiling) resilience.py # Tenacity + pybreaker glue prompts/ - *.md # Thesis templates (head_planner, specialist_*, ...) - code_audit/*.md # Audit templates (readme_planner, readme_checker, ...) + *.md # Thesis templates (head_planner, specialist_*, ...) + code_audit/readme_*.md # README auditor templates + code_audit/pr_*.md # PR auditor templates + code_audit/audit_critic.md # Shared critic template (used by both) tools/mcp/ # Literature MCP tools; will migrate behind SourceAdapter apps/ cli/main.py # Single argparse CLI for both `thesis ...` and `audit ...` @@ -65,7 +70,18 @@ This overwrites whatever the LLM said. A confidently hallucinating checker that Mirror this pattern when you add new auditors (PR auditor, compliance auditor, etc.): keep the LLM creative, but enforce the trust invariant in Python. -## The README-audit flow (current slice) +## Shared audit pipeline (README and PR slices) + +Both auditors follow the same four-stage shape, parameterised by adapter and prompts: + +1. **Planner (LLM)** — extracts atomic factual claims as `AuditClaim` (alias of `ReadmeClaim`). The README planner reads `claim_type ∈ {feature, install, usage, requirement, metric, api, other}`; the PR planner reads `claim_type ∈ {add, remove, fix, refactor, test, behavior, doc, other}`. Same schema, domain-specific enums in the prompt. +2. **Researcher (deterministic Python)** — runs `adapter.search_any(claim.search_hints, limit=N)` over whichever `SourceAdapter` the orchestrator is bound to. README slice uses `LocalRepoAdapter`; PR slice uses `GitHubAdapter`. Adapters never call an LLM and never reach the network at audit time (the PR adapter does its fetch ahead, returning a `PrSpec`). +3. **Checker (LLM + trust gate)** — judges each `(claim, evidence)` pair. Trust gate runs after the LLM responds. +4. **Critic (LLM)** — adversarial pass. `AuditCriticAgent` is shared across slices. + +`core/orchestrator/audit_prompts.py` holds the template registry — adding a new auditor just means registering its prompts there. `core/schemas_audit.py` exposes `AuditClaim` / `AuditClaimList` as aliases of the (legacy-named) `ReadmeClaim` / `ReadmeClaimList`. Rename can happen when the third auditor lands; until then the alias keeps PR code readable. + +## The README-audit flow (slice 1) 1. **Planner (LLM)** — reads the README, extracts atomic factual claims with verbatim quotes + grep-friendly search hints. Schema: `ReadmeClaimList`. 2. **Researcher (deterministic Python)** — uses `LocalRepoAdapter.search_any(hints)` to retrieve evidence snippets from the repo. **No LLM call here** — it's just a filesystem grep with safety rails (path traversal guard, file-size cap, dir excludes). @@ -74,6 +90,13 @@ Mirror this pattern when you add new auditors (PR auditor, compliance auditor, e The audited README is **excluded** from its own evidence pool — otherwise every fabricated claim could "verify itself" against the README quote. See `ReadmeAuditOrchestrator.audit` for the exclusion logic. +## The PR-audit flow (slice 2) + +1. **Fetcher** — `fetch_pr_from_github(url, token)` hits the GitHub REST API for PR metadata + unified diff + changed files, returning a `PrSpec`. Tests skip this and use `load_pr_fixture(directory)` to build `PrSpec` from `pr.json` + `diff.patch`. Anonymous requests work for public repos but hit the 60/hour rate limit fast; pass a token explicitly or via `GITHUB_TOKEN` / `GH_TOKEN`. +2. **Adapter** — `GitHubAdapter(pr_spec)` parses the diff once into `DiffHunk` objects (path + new-file line offset + hunk text). `search_any(terms)` greps hunk bodies, stripping the leading `+`/`-`/space marker before matching so a hint of `+` doesn't spuriously match every added line. One hit per hunk to avoid duplicate snippets. +3. **Planner / Checker** — `PrPlannerAgent` + `PrCheckerAgent` mirror their README cousins but use PR-specific prompts. The checker's prompt instructs it to interpret `+` lines as additions, `-` as removals — semantic decisions live in the prompt; mechanical retrieval stays in the adapter. +4. **No artefact-exclusion needed** — the PR description is never part of the diff, so the self-evidence problem doesn't arise here. Different shape from README, same trust invariant. + ## Coexistence rules - **Do not break the thesis path.** The full thesis test suite (`tests/test_*.py` minus `tests/code_audit/`) must stay green. Thesis is being deprecated *gradually*, not yanked. @@ -100,7 +123,8 @@ The audited README is **excluded** from its own evidence pool — otherwise ever See `ROADMAP.md` for the full picture. Short version: - ✅ Slice 1 (shipped): README auditor. -- 🚧 Next slices: PR auditor (`audit pr `), compliance auditor, architecture auditor. All slot in behind the same `SourceAdapter` + agent-suite + trust-gate pattern. +- ✅ Slice 2 (shipped): PR auditor (`audit pr `). +- 🚧 Next slices: compliance auditor, architecture auditor. All slot in behind the same `SourceAdapter` + agent-suite + trust-gate pattern. - 🚧 Layered source adapters: repo (highest trust) → language specs / RFCs → dependency source. The literature MCP tools will migrate behind the same contract. - 🚧 Cherry-picked from the v1.0 plan: tool/source registry, light provider-registry abstraction (Ollama later for local inference on private repos), structlog audit trail. - ⏸️ Deferred: PyPI packaging, Typer CLI rewrite, OTel, Smart truncation, Ollama. Not blocking the audit-track expansion. diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e708e4..ec6d024 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to OpenWorkers are documented here. The format is loosely ba ## [Unreleased] ### Added +- **Code-audit track — PR auditor (second slice).** New `openworkers audit pr ` CLI subcommand verifies a pull request description against its actual diff. Same pipeline shape as the README auditor (planner → deterministic researcher → checker + trust gate → critic), with two new layers: `core/sources/github.py` (`GitHubAdapter` implementing `SourceAdapter` over a unified diff, plus `PrSpec` value object, `parse_pr_url`, `fetch_pr_from_github` for live GitHub fetch with optional `GITHUB_TOKEN`/`GH_TOKEN`, and `load_pr_fixture` for offline tests) and `core/orchestrator/pr_flow.py` (`PrAuditOrchestrator`). New agents `PrPlannerAgent` and `PrCheckerAgent` in `providers/code_audit_agents.py`; `AuditCriticAgent` reused as-is. New prompts `prompts/code_audit/pr_planner.md` + `pr_checker.md` with PR-specific claim types (`add | remove | fix | refactor | test | behavior | doc | other`) and diff-aware verdict rules. Audit-prompt rendering extracted from `readme_flow.py` into `core/orchestrator/audit_prompts.py` so each new auditor can register templates without touching unrelated modules. `tests/fixtures/sample_pr/` contains a canned PR (json + unified diff) with verified / drifted / contradicted / fabricated claims; `tests/code_audit/test_pr_flow.py` asserts verdict distribution and an explicit trust-gate override. - **Code-audit track — README auditor (first slice).** New `openworkers audit readme ` CLI subcommand verifies every factual claim in a README against the actual repository, emitting `verified | drifted | unsupported | contradicted` verdicts with cited file paths. Pipeline: planner (LLM) → researcher (deterministic grep via new `LocalRepoAdapter`) → checker (LLM + post-LLM trust gate) → critic (LLM adversarial pass). New modules: `core/sources/` (`SourceAdapter` ABC + `LocalRepoAdapter`), `core/schemas_audit.py` (Pydantic audit models), `core/orchestrator/readme_flow.py` (`ReadmeAuditOrchestrator`), `providers/code_audit_agents.py` (planner / checker / critic + `_enforce_trust_gate` invariant), `prompts/code_audit/*.md` (audit templates). The trust gate is enforced in code, not delegated to prompts: any claim with no retrieved evidence is forced to `unsupported` regardless of LLM output. The audited README is excluded from its own evidence pool so fabricated claims cannot self-verify. `tests/code_audit/test_readme_flow.py` exercises the full flow with a stubbed `UnifiedLLM.generate_fn` and an `tests/fixtures/sample_repo/` containing a deliberate mix of verified / drifted / contradicted / fabricated claims. Thesis pipeline untouched. - **Contributor onboarding doc** `AGENTS.md` capturing project DNA, code-audit slice design, trust-gate invariant, conventions, and the recipe for adding new auditors. - **RAG over user PDFs** (first incremental v1.0 slice). New `tools/mcp/rag.py` with sentence-aware chunker, `RAGIndexer` (PDF/text → Qdrant via PyMuPDF + FastEmbed `BAAI/bge-small-en-v1.5`), and `RAGSearchTool` (registered as `rag_search` in `ToolRegistry`). Collections namespaced under `rag_*` so they cannot collide with `thesis_corpus` or `episodes`. New CLI: `thesis ingest add|list|delete`. New flag: `thesis research ... --rag-collection ` makes the researcher pull from the user collection alongside arXiv/SS. New field: `ResearchContext.rag_collection`. `tests/test_rag.py` covers chunking edge cases, BOM/text extraction, collection naming, indexer round-trip, privacy gating, and idempotent re-ingest. diff --git a/README.md b/README.md index 1864570..de4ac04 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Both domains share the same DNA: a hierarchical pipeline (planner → researcher ## Code audit *(new track)* -`openworkers audit readme ` extracts every factual claim from a README and verdicts each one against the actual repository: +`openworkers audit readme ` and `openworkers audit pr ` extract every factual claim from a README or PR description and verdict each one against the actual repository / diff: | Verdict | Meaning | |---|---| @@ -26,9 +26,11 @@ Both domains share the same DNA: a hierarchical pipeline (planner → researcher | `contradicted` | Code directly disproves the claim | | `unsupported` | No evidence in the repo — enforced in code, not delegated to the LLM | -The pipeline is planner (LLM extracts claims) → researcher (deterministic grep via `LocalRepoAdapter`) → checker (LLM judges + trust gate forces `unsupported` when evidence is empty) → critic (adversarial pass). The audited README is excluded from its own evidence pool, so fabricated claims cannot verify themselves. +Both auditors use the same pipeline: planner (LLM extracts claims) → researcher (deterministic grep via a `SourceAdapter` — `LocalRepoAdapter` for repos, `GitHubAdapter` for PR diffs) → checker (LLM judges + trust gate forces `unsupported` when evidence is empty) → critic (adversarial pass). The audited artefact is excluded from its own evidence pool, so fabricated claims cannot verify themselves. -Roadmap for this track: PR auditor (PR description vs. diff), compliance auditor (security/policy claims vs. code), architecture auditor (design doc vs. implementation). See [AGENTS.md](AGENTS.md) for the contributor recipe. +`audit pr` reads `GITHUB_TOKEN` or `GH_TOKEN` for higher rate limits; offline testing uses canned fixtures via `--fixture ` (see `tests/fixtures/sample_pr/`). + +Roadmap for this track: compliance auditor (security/policy claims vs. code), architecture auditor (design doc vs. implementation), layered source adapters (specs / RFCs / dependency source), tool/source registry, Ollama for local inference on private repos. See [AGENTS.md](AGENTS.md) for the contributor recipe. ## What it does diff --git a/ROADMAP.md b/ROADMAP.md index 1a6ef3d..6ba201a 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -24,7 +24,7 @@ A second domain alongside the thesis assistant: audit factual claims in technical artefacts against the codebase. Same DNA — multi-agent, structured JSON, never fabricates, trust gate refuses verdicts without evidence — applied to a domain where trustworthy automated review matters to OSS maintainers and contributors. See [AGENTS.md](AGENTS.md) for the contributor recipe and trust-gate invariant. - ✅ **README auditor** *(first slice, shipped)*. `openworkers audit readme ` extracts atomic claims from a README and verdicts each one against the actual codebase as `verified | drifted | unsupported | contradicted`. Trust gate is enforced in `providers/code_audit_agents.py::_enforce_trust_gate`, not in prompts. The audited README is excluded from its own evidence pool. New `SourceAdapter` abstraction (`core/sources/`) with `LocalRepoAdapter`. -- 🚧 **PR auditor** — `openworkers audit pr `. Verify the PR description against the actual diff; flag scope creep, missing tests, undocumented changes. Needs a `GitHubAdapter` implementing `SourceAdapter`. +- ✅ **PR auditor** *(second slice, shipped)*. `openworkers audit pr ` extracts atomic claims from a PR description and verdicts each against the actual unified diff. New `GitHubAdapter` (in `core/sources/github.py`) implements `SourceAdapter` over a unified diff via a `PrSpec` value object; live fetch lives in `fetch_pr_from_github` and is decoupled from the adapter for offline testing. PR-specific prompts under `prompts/code_audit/pr_*.md` use diff-aware verdict rules (e.g., `add` claims must match `+`-prefixed hunks). - 🚧 **Compliance auditor** — `openworkers audit compliance `. Verify security/policy claims ("inputs sanitized", "no secrets", "auth required on X") against the code. - 🚧 **Architecture auditor** — verify RFC / design-doc claims against implementation, language specs, and dependency source. - 🚧 **Layered source adapters** — repo (highest trust) → language specs / RFCs (`SpecAdapter`) → dependency source (`DependencyAdapter`). The existing literature MCP tools (arXiv / Semantic Scholar / CrossRef) will migrate behind the same `SourceAdapter` contract. diff --git a/apps/cli/main.py b/apps/cli/main.py index 5998922..7f23796 100644 --- a/apps/cli/main.py +++ b/apps/cli/main.py @@ -9,6 +9,7 @@ format_session_text, ) from core.memory.episodic import EpisodicMemory +from core.orchestrator.pr_flow import PrAuditOrchestrator, format_pr_report_text from core.orchestrator.readme_flow import ReadmeAuditOrchestrator, format_report_text from core.orchestrator.thesis_flow import ThesisOrchestrator from core.router.engine import Router @@ -235,9 +236,32 @@ async def cmd_audit_dispatch(args): """Route `audit ` to its handler.""" if args.audit_action == "readme": return await cmd_audit_readme(args) + if args.audit_action == "pr": + return await cmd_audit_pr(args) raise SystemExit(f"Unknown audit action: {args.audit_action}") +async def cmd_audit_pr(args): + """Run the PR auditor against a GitHub PR URL (or a local fixture dir).""" + from core.sources.github import fetch_pr_from_github, load_pr_fixture + + unified = create_unified_llm() + orch = PrAuditOrchestrator(unified=unified) + + if args.fixture: + pr_spec = load_pr_fixture(args.fixture) + else: + pr_spec = await asyncio.to_thread(fetch_pr_from_github, args.url, args.token) + + report, critique = await orch.audit(pr_spec) + if args.format == "json": + payload = {"report": report.model_dump(), "critique": critique.model_dump()} + _output(payload, "json", args.output) + else: + _output(format_pr_report_text(report, critique), "text", args.output) + return report + + async def cmd_audit_readme(args): """Run the README auditor on a local repo.""" unified = create_unified_llm() @@ -399,6 +423,31 @@ def build_parser() -> argparse.ArgumentParser: ) add_output_args(p_audit_readme) + p_audit_pr = audit_sub.add_parser( + "pr", + help="Verify a PR description against the actual diff", + ) + p_audit_pr.add_argument( + "url", + type=str, + nargs="?", + default="", + help="GitHub PR URL (https://github.com/owner/repo/pull/N)", + ) + p_audit_pr.add_argument( + "--fixture", + type=str, + default=None, + help="Audit a fixture directory containing pr.json + diff.patch instead of hitting GitHub", + ) + p_audit_pr.add_argument( + "--token", + type=str, + default=None, + help="GitHub token (default: $GITHUB_TOKEN or $GH_TOKEN)", + ) + add_output_args(p_audit_pr) + p_ingest = sub.add_parser("ingest", help="Manage user RAG collections (PDF/text -> Qdrant)") ingest_sub = p_ingest.add_subparsers(dest="ingest_action", required=True) diff --git a/core/orchestrator/audit_prompts.py b/core/orchestrator/audit_prompts.py new file mode 100644 index 0000000..edc6f44 --- /dev/null +++ b/core/orchestrator/audit_prompts.py @@ -0,0 +1,45 @@ +"""Shared prompt loader for the code-audit family of orchestrators. + +Lives outside any single flow module so each new auditor can register +its templates without circular imports. Deliberately not reusing +``PromptCompiler``: that compiler is wired to extract thesis blackboard +state, which audit flows don't use. Audit templates only need +``{{ var }}`` substitution. +""" + +from __future__ import annotations + +import os +from typing import Any + +_AUDIT_PROMPT_DIR = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(__file__))), + "prompts", + "code_audit", +) + +# Registry of template name → filename under ``prompts/code_audit/``. +# Each new auditor appends its entries here; the renderer accepts any +# registered name without further changes elsewhere. +TEMPLATE_FILES: dict[str, str] = { + "readme_planner": "readme_planner.md", + "readme_checker": "readme_checker.md", + "pr_planner": "pr_planner.md", + "pr_checker": "pr_checker.md", + "audit_critic": "audit_critic.md", +} + + +def render_audit_prompt(name: str, variables: dict[str, Any]) -> str: + filename = TEMPLATE_FILES.get(name) + if not filename: + raise ValueError(f"Unknown audit template: {name}") + path = os.path.join(_AUDIT_PROMPT_DIR, filename) + try: + with open(path, encoding="utf-8") as f: + template = f.read() + except OSError: + return f"[Template {name} not found at {path}]" + for key, value in variables.items(): + template = template.replace("{{ " + key + " }}", str(value)) + return template diff --git a/core/orchestrator/pr_flow.py b/core/orchestrator/pr_flow.py new file mode 100644 index 0000000..852bb4c --- /dev/null +++ b/core/orchestrator/pr_flow.py @@ -0,0 +1,206 @@ +"""PR audit orchestrator. + +Verifies a pull request's description against its actual diff. Same +pipeline shape as ``ReadmeAuditOrchestrator``: + + planner (LLM) → researcher (deterministic grep over diff) + → checker (LLM + trust gate) + → critic (LLM adversarial pass) + +The "researcher" is a content-aware grep on the unified diff via +``GitHubAdapter`` — no LLM call there, no network call once we have the +``PrSpec``. Token-free for tests. +""" + +from __future__ import annotations + +import asyncio +import logging +import time +from collections import Counter +from typing import Any, Callable + +from core.orchestrator.audit_prompts import render_audit_prompt as _render_audit_prompt +from core.schemas_audit import ( + ALL_VERDICTS, + VERDICT_UNSUPPORTED, + AuditClaim, + AuditCritique, + AuditReport, + ClaimEvidence, + ClaimVerdict, + EvidenceRef, +) +from core.sources.github import GitHubAdapter, PrSpec +from providers.code_audit_agents import ( + AuditCriticAgent, + PrCheckerAgent, + PrPlannerAgent, +) +from providers.unified import UnifiedLLM + +logger = logging.getLogger(__name__) + + +class PrAuditOrchestrator: + """Audit a PR description against the diff it carries.""" + + def __init__( + self, + unified: UnifiedLLM, + prompt_renderer: Callable[[str, dict[str, Any]], str] | None = None, + max_evidence_per_claim: int = 5, + ) -> None: + self.unified = unified + self.render = prompt_renderer or _render_audit_prompt + self.max_evidence_per_claim = max_evidence_per_claim + self.planner = PrPlannerAgent(unified=unified, prompt_renderer=self.render) + self.checker = PrCheckerAgent(unified=unified, prompt_renderer=self.render) + self.critic = AuditCriticAgent(unified=unified, prompt_renderer=self.render) + + async def audit(self, pr: PrSpec) -> tuple[AuditReport, AuditCritique]: + start = time.time() + adapter = GitHubAdapter(pr) + errors: list[str] = [] + + description = pr.description + target = pr.url or f"{pr.owner}/{pr.repo}#{pr.number}" + + if not description.strip(): + empty = AuditReport( + repo_path=f"{pr.owner}/{pr.repo}", + target=target, + verdicts=[], + summary=dict.fromkeys(ALL_VERDICTS, 0), + errors=["PR description is empty — nothing to audit."], + ) + return empty, AuditCritique() + + # ── Stage 1: Planner extracts claims from PR title + body ── + try: + planner_result = await self.planner.execute( + pr_description=description, + pr_url=pr.url, + changed_files=pr.changed_files, + ) + claim_list = planner_result["output"] + except Exception as e: + errors.append(f"planner: {e}") + from core.schemas_audit import AuditClaimList + + claim_list = AuditClaimList(claims=[], readme_path=target) + + # ── Stage 2: Researcher retrieves diff evidence (deterministic) ── + evidence = await asyncio.to_thread(self._retrieve_all_evidence, claim_list.claims, adapter) + + # ── Stage 3: Checker renders verdicts (with trust gate) ── + if claim_list.claims: + try: + checker_result = await self.checker.execute(claim_list.claims, evidence) + verdicts: list[ClaimVerdict] = list(checker_result["output"].verdicts) + except Exception as e: + errors.append(f"checker: {e}") + verdicts = [ + ClaimVerdict( + claim_id=c.claim_id, + claim_text=c.claim_text, + verdict=VERDICT_UNSUPPORTED, + confidence=0.0, + evidence_paths=[], + notes=f"Checker failed: {e}", + ) + for c in claim_list.claims + ] + else: + verdicts = [] + + # ── Stage 4: Critic adversarial pass ── + try: + critic_result = await self.critic.execute(verdicts, description) + critique: AuditCritique = critic_result["output"] + except Exception as e: + errors.append(f"critic: {e}") + critique = AuditCritique() + + summary = Counter(v.verdict for v in verdicts) + report = AuditReport( + repo_path=f"{pr.owner}/{pr.repo}", + target=target, + verdicts=verdicts, + summary={v: int(summary.get(v, 0)) for v in ALL_VERDICTS}, + errors=errors, + ) + + elapsed_ms = int((time.time() - start) * 1000) + logger.info( + "pr_audit completed pr=%s claims=%d elapsed_ms=%d errors=%d", + target, + len(verdicts), + elapsed_ms, + len(errors), + ) + return report, critique + + def _retrieve_all_evidence( + self, + claims: list[AuditClaim], + adapter: GitHubAdapter, + ) -> list[ClaimEvidence]: + out: list[ClaimEvidence] = [] + for claim in claims: + snippets = adapter.search_any(claim.search_hints, limit=self.max_evidence_per_claim) + refs = [ + EvidenceRef( + path=s.path, + line_start=s.line_start, + line_end=s.line_end, + text=s.text, + source=s.source, + ) + for s in snippets + ] + out.append(ClaimEvidence(claim_id=claim.claim_id, snippets=refs)) + return out + + +def format_pr_report_text( + report: AuditReport, + critique: AuditCritique | None = None, +) -> str: + """Pretty-print a PR audit report for terminal output.""" + lines: list[str] = [] + lines.append(f"PR audit — {report.target or report.repo_path}") + lines.append("") + if report.summary: + summary_line = " ".join(f"{k}={v}" for k, v in report.summary.items()) + lines.append(f"Summary: {summary_line}") + lines.append("") + for v in report.verdicts: + marker = { + "verified": "✓", + "drifted": "≠", + "contradicted": "✗", + "unsupported": "?", + }.get(v.verdict, "·") + lines.append(f"{marker} [{v.verdict.upper():12s}] {v.claim_id}: {v.claim_text}") + for p in v.evidence_paths: + lines.append(f" ↳ {p}") + if v.notes: + lines.append(f" note: {v.notes}") + if report.errors: + lines.append("") + lines.append("Errors:") + for e in report.errors: + lines.append(f" - {e}") + if critique is not None: + lines.append("") + lines.append("Critic pass:") + for wv in critique.weak_verdicts: + lines.append(f" weak: {wv}") + for mc in critique.missed_claims: + lines.append(f" missed: {mc}") + for sg in critique.suggestions: + lines.append(f" suggest: {sg}") + if critique.overall_assessment: + lines.append(f" → {critique.overall_assessment}") + return "\n".join(lines) diff --git a/core/orchestrator/readme_flow.py b/core/orchestrator/readme_flow.py index 503419c..d446de9 100644 --- a/core/orchestrator/readme_flow.py +++ b/core/orchestrator/readme_flow.py @@ -20,12 +20,12 @@ import asyncio import logging -import os import time from collections import Counter from pathlib import Path from typing import Any, Callable +from core.orchestrator.audit_prompts import render_audit_prompt as _render_audit_prompt from core.schemas_audit import ( ALL_VERDICTS, VERDICT_UNSUPPORTED, @@ -48,40 +48,6 @@ logger = logging.getLogger(__name__) -_AUDIT_PROMPT_DIR = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(__file__))), - "prompts", - "code_audit", -) - -_TEMPLATE_FILES = { - "readme_planner": "readme_planner.md", - "readme_checker": "readme_checker.md", - "audit_critic": "audit_critic.md", -} - - -def _render_audit_prompt(name: str, variables: dict[str, Any]) -> str: - """Tiny placeholder-substitution renderer for audit prompts. - - Deliberately not reusing PromptCompiler: that compiler is wired to - extract blackboard state, which this slice doesn't use. Audit - templates only need ``{{ var }}`` substitution. - """ - filename = _TEMPLATE_FILES.get(name) - if not filename: - raise ValueError(f"Unknown audit template: {name}") - path = os.path.join(_AUDIT_PROMPT_DIR, filename) - try: - with open(path, encoding="utf-8") as f: - template = f.read() - except OSError: - return f"[Template {name} not found at {path}]" - for key, value in variables.items(): - template = template.replace("{{ " + key + " }}", str(value)) - return template - - class ReadmeAuditOrchestrator: """Run a README audit end-to-end against a local repo.""" diff --git a/core/schemas_audit.py b/core/schemas_audit.py index 577b195..aadd022 100644 --- a/core/schemas_audit.py +++ b/core/schemas_audit.py @@ -41,6 +41,15 @@ class ReadmeClaimList(BaseModel): readme_path: str = "" +# Cross-domain aliases. README and PR claims share the same shape; the +# differing-in-name dance is purely for call-site readability. When a +# third auditor lands we'll rename ReadmeClaim → AuditClaim cleanly; +# until then the alias keeps PR code self-explanatory without churning +# the README slice's imports. +AuditClaim = ReadmeClaim +AuditClaimList = ReadmeClaimList + + class EvidenceRef(BaseModel): """Adapter-agnostic citation handle. Mirrors ``EvidenceSnippet`` in shape but is the JSON-serialisable form passed across the @@ -77,6 +86,10 @@ class ClaimVerdictList(BaseModel): class AuditReport(BaseModel): repo_path: str readme_path: str = "" + # Free-form identifier of the audited artefact (PR URL, doc path, …). + # Kept optional so the README slice can leave it blank and stay + # backwards-compatible with the prior shape. + target: str = "" verdicts: list[ClaimVerdict] = Field(default_factory=list) summary: dict[str, int] = Field(default_factory=dict) errors: list[str] = Field(default_factory=list) diff --git a/core/sources/__init__.py b/core/sources/__init__.py index c384f96..fbb0660 100644 --- a/core/sources/__init__.py +++ b/core/sources/__init__.py @@ -8,6 +8,22 @@ """ from core.sources.base import EvidenceSnippet, SourceAdapter +from core.sources.github import ( + GitHubAdapter, + PrSpec, + fetch_pr_from_github, + load_pr_fixture, + parse_pr_url, +) from core.sources.local_repo import LocalRepoAdapter -__all__ = ["EvidenceSnippet", "LocalRepoAdapter", "SourceAdapter"] +__all__ = [ + "EvidenceSnippet", + "GitHubAdapter", + "LocalRepoAdapter", + "PrSpec", + "SourceAdapter", + "fetch_pr_from_github", + "load_pr_fixture", + "parse_pr_url", +] diff --git a/core/sources/github.py b/core/sources/github.py new file mode 100644 index 0000000..5cf41e8 --- /dev/null +++ b/core/sources/github.py @@ -0,0 +1,318 @@ +"""GitHub PR adapter. + +Implements ``SourceAdapter`` over the **unified diff** of a pull +request. The adapter is intentionally constructed from a ``PrSpec`` +value object rather than reaching out to the GitHub API in its +constructor: + +- Tests stay pure: build a ``PrSpec`` from a canned fixture and audit + it without touching the network. +- Future-proof: a GitLab / Gerrit adapter can share the same diff + grepping logic by handing us a ``PrSpec``. + +The network fetch (``fetch_pr_from_github``) lives in this module as a +sibling helper, separate from the adapter, so the adapter has no +dependency on httpx and remains trivially testable. +""" + +from __future__ import annotations + +import json +import os +import re +from collections.abc import Iterable +from dataclasses import dataclass, field + +import httpx + +from core.sources.base import EvidenceSnippet, SourceAdapter + +_PR_URL_RE = re.compile( + r"^https?://github\.com/(?P[^/]+)/(?P[^/]+)/pull/(?P\d+)(?:[/?#].*)?$", + re.IGNORECASE, +) + +_HUNK_HEADER_RE = re.compile(r"^@@ -\d+(?:,\d+)? \+(?P\d+)(?:,\d+)? @@") +_DIFF_FILE_HEADER_RE = re.compile(r"^diff --git a/(?P.+) b/(?P.+)$") + +_MAX_DIFF_BYTES = 1_000_000 # GitHub already caps; this is a defence-in-depth bound. + + +@dataclass +class DiffHunk: + """A single hunk lifted from a unified diff. + + ``new_start`` is the 1-based line number the hunk applies to in the + new file — useful for citation handles like ``foo.py:42``. + """ + + path: str + new_start: int + text: str + + +@dataclass +class PrSpec: + """The minimal projection of a PR that the auditor needs. + + Network-free: construct directly in tests; ``fetch_pr_from_github`` + populates it in production. + """ + + owner: str + repo: str + number: int + title: str + body: str + base_sha: str = "" + head_sha: str = "" + diff: str = "" + changed_files: list[str] = field(default_factory=list) + url: str = "" + + @property + def description(self) -> str: + """Title + body joined for claim extraction.""" + if not self.body: + return self.title + return f"{self.title}\n\n{self.body}" + + +def parse_pr_url(url: str) -> tuple[str, str, int]: + """Parse a GitHub PR URL into (owner, repo, number). + + Raises ``ValueError`` on anything that isn't a PR URL. We keep this + strict so a typo (e.g., ``/issues/`` instead of ``/pull/``) fails + loud rather than hitting a 404 mid-pipeline. + """ + match = _PR_URL_RE.match(url.strip()) + if not match: + raise ValueError(f"Not a GitHub PR URL: {url!r}") + return match["owner"], match["repo"], int(match["number"]) + + +class GitHubAdapter(SourceAdapter): + """Searches the unified diff of a PR for evidence of claims. + + The "repo" the adapter sees is the *diff*, not the working tree. + Claims phrased as "adds X" need to match additions; "removes Y" + needs to match deletions. The checker prompt is responsible for + that semantic interpretation — the adapter's job is just to + surface relevant hunk excerpts. + """ + + name = "github_pr" + + def __init__(self, pr: PrSpec) -> None: + self.pr = pr + self._hunks: list[DiffHunk] = list(_iter_hunks(pr.diff)) + + def search(self, query: str, limit: int = 5) -> list[EvidenceSnippet]: + query = (query or "").strip() + if not query: + return [] + pattern = re.compile(re.escape(query), re.IGNORECASE) + return self._search_pattern(pattern, limit) + + def search_any(self, terms: Iterable[str], limit: int = 5) -> list[EvidenceSnippet]: + cleaned = [re.escape(t.strip()) for t in terms if t and t.strip()] + if not cleaned: + return [] + pattern = re.compile("|".join(cleaned), re.IGNORECASE) + return self._search_pattern(pattern, limit) + + def _search_pattern(self, pattern: re.Pattern[str], limit: int) -> list[EvidenceSnippet]: + hits: list[EvidenceSnippet] = [] + for hunk in self._hunks: + if len(hits) >= limit: + break + # Drop the leading +/-/space marker before matching so a + # hint that happens to be ``+`` (rare) does not spuriously + # match every added line. + for offset, line in enumerate(hunk.text.splitlines()): + payload = line[1:] if line[:1] in "+- " else line + if pattern.search(payload): + hits.append( + EvidenceSnippet( + path=hunk.path, + line_start=hunk.new_start + offset, + line_end=hunk.new_start + offset, + text=hunk.text, + source=self.name, + ) + ) + # One hit per hunk: the whole hunk is already in + # ``text``, so multiple line matches in the same + # hunk would just duplicate the snippet. + break + return hits + + def fetch(self, path: str, line_start: int = 0, line_end: int = 0) -> EvidenceSnippet: + for hunk in self._hunks: + if hunk.path != path: + continue + if line_start and not ( + hunk.new_start <= line_start <= hunk.new_start + len(hunk.text.splitlines()) + ): + continue + return EvidenceSnippet( + path=hunk.path, + line_start=hunk.new_start, + line_end=hunk.new_start + len(hunk.text.splitlines()) - 1, + text=hunk.text, + source=self.name, + ) + return EvidenceSnippet(path=path, line_start=0, line_end=0, text="", source=self.name) + + +def _iter_hunks(diff_text: str) -> Iterable[DiffHunk]: + """Yield ``DiffHunk`` from a unified diff. + + The parser is deliberately small and accepts the GitHub-flavoured + output (``diff --git`` headers, ``@@`` hunk headers, ``+``/``-`` + prefixed lines). It tolerates unfamiliar lines by skipping them — + binary-file markers, mode change lines, etc. + """ + current_path = "" + current_hunk: list[str] | None = None + current_new_start = 0 + for raw_line in diff_text.splitlines(): + file_match = _DIFF_FILE_HEADER_RE.match(raw_line) + if file_match: + if current_hunk is not None and current_path: + yield DiffHunk( + path=current_path, + new_start=current_new_start, + text="\n".join(current_hunk), + ) + current_path = file_match["new"] + current_hunk = None + continue + hunk_match = _HUNK_HEADER_RE.match(raw_line) + if hunk_match: + if current_hunk is not None and current_path: + yield DiffHunk( + path=current_path, + new_start=current_new_start, + text="\n".join(current_hunk), + ) + current_new_start = int(hunk_match["new_start"]) + current_hunk = [] + continue + if current_hunk is not None and raw_line[:1] in {"+", "-", " "}: + current_hunk.append(raw_line) + if current_hunk is not None and current_path: + yield DiffHunk( + path=current_path, + new_start=current_new_start, + text="\n".join(current_hunk), + ) + + +def fetch_pr_from_github( + url: str, + token: str | None = None, + client: httpx.Client | None = None, + timeout: float = 15.0, +) -> PrSpec: + """Pull PR metadata + unified diff from the GitHub REST API. + + Token resolution order: explicit ``token`` arg → ``GITHUB_TOKEN`` + env → ``GH_TOKEN`` env. Anonymous requests work for public repos + but bump into the 60/hour rate limit fast, so we always surface + the token miss in the logs rather than hiding it. + """ + owner, repo, number = parse_pr_url(url) + token = token or os.environ.get("GITHUB_TOKEN") or os.environ.get("GH_TOKEN") or "" + headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + "User-Agent": "openworkers-pr-auditor", + } + if token: + headers["Authorization"] = f"Bearer {token}" + + owns_client = client is None + http = client or httpx.Client(timeout=timeout, headers=headers) + try: + api_root = f"https://api.github.com/repos/{owner}/{repo}/pulls/{number}" + pr_resp = http.get(api_root, headers=headers if owns_client else None) + pr_resp.raise_for_status() + pr_data = pr_resp.json() + + diff_resp = http.get( + api_root, + headers={ + **(headers if owns_client else {}), + "Accept": "application/vnd.github.v3.diff", + }, + ) + diff_resp.raise_for_status() + diff_text = diff_resp.text[:_MAX_DIFF_BYTES] + + files_resp = http.get( + f"{api_root}/files", + headers=headers if owns_client else None, + params={"per_page": 100}, + ) + files_resp.raise_for_status() + files_payload = files_resp.json() + finally: + if owns_client: + http.close() + + changed_files = [f.get("filename", "") for f in files_payload if f.get("filename")] + return PrSpec( + owner=owner, + repo=repo, + number=number, + title=str(pr_data.get("title", "")), + body=str(pr_data.get("body") or ""), + base_sha=str(pr_data.get("base", {}).get("sha", "")), + head_sha=str(pr_data.get("head", {}).get("sha", "")), + diff=diff_text, + changed_files=changed_files, + url=url, + ) + + +def load_pr_fixture(directory: str) -> PrSpec: + """Build a ``PrSpec`` from a canned fixture directory. + + Layout: + /pr.json # subset of the GitHub PR API response + /diff.patch # unified diff + + Useful for tests and offline demos; mirrors the shape returned by + ``fetch_pr_from_github`` without the network. + """ + base = directory.rstrip("/") + with open(f"{base}/pr.json", encoding="utf-8") as f: + pr_data = json.load(f) + try: + with open(f"{base}/diff.patch", encoding="utf-8") as f: + diff_text = f.read() + except FileNotFoundError: + diff_text = "" + return PrSpec( + owner=str(pr_data.get("owner", "fixture")), + repo=str(pr_data.get("repo", "fixture")), + number=int(pr_data.get("number", 0)), + title=str(pr_data.get("title", "")), + body=str(pr_data.get("body", "")), + base_sha=str(pr_data.get("base_sha", "")), + head_sha=str(pr_data.get("head_sha", "")), + diff=diff_text, + changed_files=list(pr_data.get("changed_files", [])), + url=str(pr_data.get("url", "")), + ) + + +__all__ = [ + "DiffHunk", + "GitHubAdapter", + "PrSpec", + "fetch_pr_from_github", + "load_pr_fixture", + "parse_pr_url", +] diff --git a/prompts/code_audit/pr_checker.md b/prompts/code_audit/pr_checker.md new file mode 100644 index 0000000..a7dab7e --- /dev/null +++ b/prompts/code_audit/pr_checker.md @@ -0,0 +1,53 @@ +# SPECIALIST: PR CHECKER + +You are the PR CHECKER agent for the openworkers code-audit pipeline. + +## Your Role +For each PR claim, decide whether the retrieved **diff evidence** supports, drifts from, contradicts, or fails to support it. You are the trust gate: if there is no evidence, the verdict is `unsupported` — never `verified`. + +## Input +The user message contains: +- `CLAIMS`: JSON list of `{claim_id, claim_text, claim_type, search_hints}`. +- `DIFF EVIDENCE`: JSON list of `{claim_id, snippets: [{path, line_start, line_end, text, source}]}`. + +Each snippet is a diff hunk from the PR. Lines beginning with `+` are added, `-` removed, ` ` unchanged context. + +Each claim has exactly one evidence entry (possibly with an empty `snippets` list). + +## Output: ClaimVerdictList (JSON) +Return one JSON object with this exact schema. No prose, no markdown fences. + +```json +{ + "verdicts": [ + { + "claim_id": "claim-01", + "claim_text": "", + "verdict": "{{ verdict_values }}", + "confidence": 0.0, + "evidence_paths": [""], + "notes": "" + } + ] +} +``` + +## Verdict Rules (diff-aware) +- **`verified`**: the diff shows exactly what the claim says. + - `claim_type=add` → snippets contain `+`-prefixed lines introducing the named thing. + - `claim_type=remove` → snippets contain `-`-prefixed lines removing it. + - `claim_type=refactor` → snippets contain both `+` and `-` lines consistent with the rename / signature change. + - `claim_type=test` → snippets are in test files. + - `claim_type=fix` → snippets touch the area implicated by the bug. +- **`drifted`**: the diff is related but **does not match the claim's specifics**. README says "adds `--port` flag" but diff adds `--bind`. Note must state PR-text vs. diff divergence. +- **`contradicted`**: the diff directly disproves the claim. PR says "no telemetry added" but diff adds a telemetry emitter; PR says "removes X" but X is untouched. +- **`unsupported`**: snippet list empty, or snippets are irrelevant boilerplate (whitespace, lockfile noise, unrelated files). **You must emit `unsupported` if the snippet list is empty.** Confidence 0.0. + +## Trustworthiness Gate +You **never** fabricate evidence. You **never** mark a claim `verified` without at least one snippet that materially supports it. If unsure, prefer `unsupported` over `verified`. + +## Format +- One verdict per input claim, same `claim_id`. +- `confidence` in [0.0, 1.0]. +- `evidence_paths` lists `path:line_start-line_end` strings drawn only from the provided snippets. +- No commentary outside the JSON object. diff --git a/prompts/code_audit/pr_planner.md b/prompts/code_audit/pr_planner.md new file mode 100644 index 0000000..6415e32 --- /dev/null +++ b/prompts/code_audit/pr_planner.md @@ -0,0 +1,52 @@ +# SPECIALIST: PR PLANNER + +You are the PR PLANNER agent for the openworkers code-audit pipeline. + +## Your Role +Read a pull request title + body and extract every **atomic factual claim** the author makes about the diff. Each claim must be independently verifiable against the actual changes. Quote verbatim — do not paraphrase. + +## Input +The user message contains the PR's URL, the list of changed files, and the title + body between `---BEGIN PR DESCRIPTION---` / `---END PR DESCRIPTION---`. + +PR URL: `{{ pr_url }}` +Files changed: `{{ files_changed }}` + +## Output: ReadmeClaimList (JSON) +Return one JSON object with this exact schema. No prose, no markdown fences. + +```json +{ + "readme_path": "{{ pr_url }}", + "claims": [ + { + "claim_id": "claim-01", + "claim_text": "", + "claim_type": "add | remove | fix | refactor | test | behavior | doc | other", + "search_hints": ["", "", ""] + } + ] +} +``` + +## What counts as a claim +- **add**: "Adds X", "introduces Y", "new module Z" +- **remove**: "Removes X", "deletes Y", "drops support for Z" +- **fix**: "Fixes bug in X", "resolves Y", "fixes #123" +- **refactor**: "Refactors X to use Y", "renames Z to W", "extracts X" +- **test**: "Adds tests for X", "covers Y in tests" +- **behavior**: "Default changes from A to B", "X now returns Y" +- **doc**: "Documents X", "updates README for Y" + +## Rules +- **Atomic**: split compound sentences. "Adds X and removes Y" → two claims. +- **Verbatim**: `claim_text` must quote the PR text directly (you may trim leading bullets / numbering). +- **Skip**: marketing prose, motivation paragraphs ("we should care because…"), thank-yous, screenshots, reviewer call-outs. +- **Include**: every concrete add/remove/refactor/fix/test/behavior/doc statement that the diff is expected to demonstrate. +- **Hints**: 2–6 grep-friendly tokens — module names, function names, file paths, flag names, env var names. Skip generic English words. +- **Stable IDs**: sequential `claim-01`, `claim-02`, … so downstream agents can reference them. +- If the PR description contains zero verifiable claims (e.g., it's empty or pure motivation), return `"claims": []`. + +## Forbidden +- Inventing claims not present in the description. +- Producing prose or summary outside the JSON. +- Wrapping JSON in markdown code fences. diff --git a/providers/code_audit_agents.py b/providers/code_audit_agents.py index f84f801..f537106 100644 --- a/providers/code_audit_agents.py +++ b/providers/code_audit_agents.py @@ -38,6 +38,8 @@ __all__ = [ "AuditCriticAgent", + "PrCheckerAgent", + "PrPlannerAgent", "ReadmeCheckerAgent", "ReadmePlannerAgent", "_schema_for", @@ -377,3 +379,120 @@ async def execute( def new_claim_id() -> str: return f"claim-{uuid.uuid4().hex[:8]}" + + +# ────────────────────────────────────────────────────────────────────── +# PR-audit agents +# +# Parallel to the README ones rather than a shared base class: per +# AGENTS.md, two examples is not yet a pattern. When the third auditor +# (compliance) lands, ``ClaimPlannerAgent`` / ``ClaimCheckerAgent`` can +# absorb both — but premature extraction here would commit us to an +# interface we'd inevitably regret. +# ────────────────────────────────────────────────────────────────────── + + +class PrPlannerAgent: + """Extracts atomic factual claims from a PR title + body.""" + + def __init__(self, unified: UnifiedLLM, prompt_renderer: Callable[[str, dict[str, Any]], str]): + self.unified = unified + self.render = prompt_renderer + + async def execute( + self, + pr_description: str, + pr_url: str = "", + changed_files: list[str] | None = None, + ) -> dict[str, Any]: + changed_files = changed_files or [] + system_prompt = self.render( + "pr_planner", + {"pr_url": pr_url, "files_changed": ", ".join(changed_files[:25])}, + ) + user_prompt = ( + "Extract every atomic factual claim from the PR description below. " + "Return a JSON object matching the ReadmeClaimList schema (same shape, " + "claim_type drawn from add | remove | fix | refactor | test | behavior | " + "doc | other).\n\n" + f"PR URL: {pr_url}\n" + f"Changed files ({len(changed_files)}): " + f"{', '.join(changed_files[:25]) if changed_files else '(none)'}\n\n" + "---BEGIN PR DESCRIPTION---\n" + f"{pr_description}\n" + "---END PR DESCRIPTION---" + ) + response = await self.unified.generate( + prompt=user_prompt, + system_prompt=system_prompt, + mode="quality", + response_schema=_schema_for(ReadmeClaimList), + ) + parsed = _parse_structured(response.content, ReadmeClaimList) + parsed = _normalise_claim_list(parsed, pr_url) + return { + "agent": "pr_planner", + "tier": "head", + "status": "success", + "output": parsed, + "provider": response.provider_used, + "model": response.model, + "latency_ms": response.latency_ms, + "cost_estimate_usd": response.cost_estimate_usd, + "dry_run": response.dry_run, + "fallback_used": response.fallback_used, + } + + +class PrCheckerAgent: + """Renders verdicts for PR claims against diff evidence. + + Identical trust-gate semantics as ``ReadmeCheckerAgent``: any claim + with no retrieved hunk evidence is forced to ``unsupported`` after + the LLM responds. + """ + + def __init__(self, unified: UnifiedLLM, prompt_renderer: Callable[[str, dict[str, Any]], str]): + self.unified = unified + self.render = prompt_renderer + + async def execute( + self, + claims: list[ReadmeClaim], + evidence: list[ClaimEvidence], + ) -> dict[str, Any]: + evidence_by_claim = {e.claim_id: e for e in evidence} + evidence_payload = [e.model_dump() for e in evidence] + claims_payload = [c.model_dump() for c in claims] + + system_prompt = self.render( + "pr_checker", + {"verdict_values": " | ".join(ALL_VERDICTS)}, + ) + user_prompt = ( + "Judge each PR claim against its retrieved diff evidence and return " + "a ClaimVerdictList JSON object.\n\n" + f"CLAIMS:\n{json.dumps(claims_payload, indent=2)}\n\n" + f"DIFF EVIDENCE:\n{json.dumps(evidence_payload, indent=2)}" + ) + response = await self.unified.generate( + prompt=user_prompt, + system_prompt=system_prompt, + mode="balanced", + response_schema=_schema_for(ClaimVerdictList), + ) + parsed = _parse_structured(response.content, ClaimVerdictList) + verdicts = _enforce_trust_gate(parsed.verdicts, claims, evidence_by_claim) + parsed.verdicts = verdicts + return { + "agent": "pr_checker", + "tier": "middle", + "status": "success", + "output": parsed, + "provider": response.provider_used, + "model": response.model, + "latency_ms": response.latency_ms, + "cost_estimate_usd": response.cost_estimate_usd, + "dry_run": response.dry_run, + "fallback_used": response.fallback_used, + } diff --git a/tests/code_audit/test_pr_flow.py b/tests/code_audit/test_pr_flow.py new file mode 100644 index 0000000..5869e1e --- /dev/null +++ b/tests/code_audit/test_pr_flow.py @@ -0,0 +1,279 @@ +"""End-to-end test for the PR auditor. + +Same stub-LLM pattern as ``test_readme_flow.py``: we drive the full +flow (planner → diff-grep → checker + trust gate → critic) against a +canned PR fixture so we can assert verdict distribution deterministically. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pytest + +from core.orchestrator.pr_flow import PrAuditOrchestrator +from core.schemas_audit import ( + VERDICT_CONTRADICTED, + VERDICT_DRIFTED, + VERDICT_UNSUPPORTED, + VERDICT_VERIFIED, +) +from core.sources.github import ( + GitHubAdapter, + PrSpec, + load_pr_fixture, + parse_pr_url, +) +from providers.unified import UnifiedLLM + +FIXTURE_PR = Path(__file__).resolve().parent.parent / "fixtures" / "sample_pr" + + +_PLANNER_CLAIMS = { + "readme_path": "https://github.com/example/widgetlib/pull/42", + "claims": [ + { + "claim_id": "claim-01", + "claim_text": "adds a `widgetctl --port 9000` CLI for launching the dashboard", + "claim_type": "add", + "search_hints": ["widgetctl", "--port", "9000", "argparse"], + }, + { + "claim_id": "claim-02", + "claim_text": "adds a new `Widget.render_html` method", + "claim_type": "add", + "search_hints": ["render_html", "Widget"], + }, + { + "claim_id": "claim-03", + "claim_text": "adds tests covering the new render path", + "claim_type": "test", + "search_hints": ["test_render_html", "render_html"], + }, + { + "claim_id": "claim-04", + "claim_text": "removes the deprecated `legacy_render` helper", + "claim_type": "remove", + "search_hints": ["legacy_render"], + }, + { + "claim_id": "claim-05", + "claim_text": "introduces no telemetry", + "claim_type": "behavior", + "search_hints": ["telemetry", "emit_telemetry"], + }, + { + "claim_id": "claim-06", + "claim_text": "Adds WIDGETLIB_DEBUG=1 flag for verbose logging", + "claim_type": "add", + "search_hints": ["WIDGETLIB_DEBUG"], + }, + { + "claim_id": "claim-07", + "claim_text": "Performance: render is now 3x faster on the bench", + "claim_type": "behavior", + "search_hints": ["bench", "benchmark", "perf"], + }, + ], +} + + +_CHECKER_VERDICTS = { + "verdicts": [ + { + "claim_id": "claim-01", + "claim_text": _PLANNER_CLAIMS["claims"][0]["claim_text"], + "verdict": VERDICT_DRIFTED, + "confidence": 0.8, + "evidence_paths": ["widgetlib/cli.py"], + "notes": "PR claims --port 9000; diff adds --bind defaulting to 127.0.0.1:8000.", + }, + { + "claim_id": "claim-02", + "claim_text": _PLANNER_CLAIMS["claims"][1]["claim_text"], + "verdict": VERDICT_VERIFIED, + "confidence": 0.95, + "evidence_paths": ["widgetlib/widget.py"], + "notes": "render_html method added on Widget.", + }, + { + "claim_id": "claim-03", + "claim_text": _PLANNER_CLAIMS["claims"][2]["claim_text"], + "verdict": VERDICT_VERIFIED, + "confidence": 0.9, + "evidence_paths": ["tests/test_render.py"], + "notes": "Test file added asserting render_html output.", + }, + { + "claim_id": "claim-04", + "claim_text": _PLANNER_CLAIMS["claims"][3]["claim_text"], + "verdict": VERDICT_DRIFTED, + "confidence": 0.7, + "evidence_paths": ["widgetlib/cli.py"], + "notes": "Diff touches cli.py near _legacy() but the helper is not removed.", + }, + { + "claim_id": "claim-05", + "claim_text": _PLANNER_CLAIMS["claims"][4]["claim_text"], + "verdict": VERDICT_CONTRADICTED, + "confidence": 0.95, + "evidence_paths": ["widgetlib/telemetry.py"], + "notes": "Diff adds a telemetry emitter that posts to telemetry.example.com.", + }, + # claim-06: LLM hallucinates verified — trust gate must override + { + "claim_id": "claim-06", + "claim_text": _PLANNER_CLAIMS["claims"][5]["claim_text"], + "verdict": VERDICT_VERIFIED, + "confidence": 0.85, + "evidence_paths": ["widgetlib/cli.py"], + "notes": "Hallucinated by checker — trust gate must overwrite.", + }, + { + "claim_id": "claim-07", + "claim_text": _PLANNER_CLAIMS["claims"][6]["claim_text"], + "verdict": VERDICT_UNSUPPORTED, + "confidence": 0.0, + "evidence_paths": [], + "notes": "No benchmark or measurement in the diff.", + }, + ] +} + + +_CRITIC_RESPONSE = { + "weak_verdicts": [], + "missed_claims": [], + "suggestions": ["Require benchmarks for performance claims in CI."], + "overall_assessment": "Caught one drift in CLI flag, one contradicted no-telemetry, and two unsupported claims.", +} + + +def _make_stub_unified() -> UnifiedLLM: + llm = UnifiedLLM() + llm.dry_run = False + llm.set_available_providers(["anthropic"]) + + async def fake_generate( + provider: str, + model: str, + prompt: str, + system_prompt: str, + response_schema: Any, + ) -> str: + if "PR PLANNER" in system_prompt: + return json.dumps(_PLANNER_CLAIMS) + if "PR CHECKER" in system_prompt: + return json.dumps(_CHECKER_VERDICTS) + if "AUDIT CRITIC" in system_prompt: + return json.dumps(_CRITIC_RESPONSE) + return "{}" + + llm.set_generate_fn(fake_generate) + return llm + + +@pytest.fixture +def stubbed_unified(monkeypatch) -> UnifiedLLM: + monkeypatch.setenv("DRY_RUN", "false") + for tier in ("QUALITY", "BALANCED", "CHEAP"): + monkeypatch.setenv(f"THESIS_{tier}_PROVIDER", "anthropic") + monkeypatch.setenv(f"THESIS_{tier}_MODEL", "claude-sonnet-4-20250514") + return _make_stub_unified() + + +def test_parse_pr_url_happy_path(): + owner, repo, number = parse_pr_url("https://github.com/example/widgetlib/pull/42") + assert (owner, repo, number) == ("example", "widgetlib", 42) + + +def test_parse_pr_url_rejects_non_pr(): + with pytest.raises(ValueError): + parse_pr_url("https://github.com/example/widgetlib/issues/42") + + +def test_load_pr_fixture_round_trips(): + pr = load_pr_fixture(str(FIXTURE_PR)) + assert pr.number == 42 + assert pr.title.startswith("feat:") + assert "widgetctl" in pr.body + assert pr.diff, "fixture diff should be non-empty" + assert "telemetry.py" in pr.diff + + +def test_github_adapter_finds_hits_in_diff_only(): + pr = load_pr_fixture(str(FIXTURE_PR)) + adapter = GitHubAdapter(pr) + # Real addition in the diff + hits = adapter.search_any(["render_html"], limit=10) + assert any(h.path == "widgetlib/widget.py" for h in hits) + # Hallucinated env var: nowhere in the diff + assert adapter.search_any(["WIDGETLIB_DEBUG"], limit=10) == [] + # Telemetry contradicts the "no telemetry" claim — should surface + telemetry_hits = adapter.search_any(["telemetry", "emit_telemetry"], limit=10) + assert any(h.path == "widgetlib/telemetry.py" for h in telemetry_hits) + + +@pytest.mark.asyncio +async def test_pr_audit_end_to_end(stubbed_unified): + pr = load_pr_fixture(str(FIXTURE_PR)) + orch = PrAuditOrchestrator(unified=stubbed_unified) + report, critique = await orch.audit(pr) + + by_id = {v.claim_id: v for v in report.verdicts} + assert len(report.verdicts) == 7 + + # Verified claims have evidence in the diff + assert by_id["claim-02"].verdict == VERDICT_VERIFIED + assert by_id["claim-02"].evidence_paths + assert by_id["claim-03"].verdict == VERDICT_VERIFIED + + # Drifted: PR says --port, diff adds --bind + assert by_id["claim-01"].verdict == VERDICT_DRIFTED + + # Contradicted: PR says "no telemetry" but diff adds it + assert by_id["claim-05"].verdict == VERDICT_CONTRADICTED + + # Trust gate: hallucinated WIDGETLIB_DEBUG verdict overwritten + assert by_id["claim-06"].verdict == VERDICT_UNSUPPORTED, ( + "Trust gate failed: stub hallucinated 'verified' for WIDGETLIB_DEBUG; " + "trust gate should have overridden because the diff has zero hits." + ) + assert by_id["claim-06"].evidence_paths == [] + assert by_id["claim-06"].confidence == 0.0 + + # Unsupported: performance claim with no benchmark evidence + assert by_id["claim-07"].verdict == VERDICT_UNSUPPORTED + + # Summary tallies + assert sum(report.summary.values()) == len(report.verdicts) + assert report.summary[VERDICT_VERIFIED] >= 2 + assert report.summary[VERDICT_UNSUPPORTED] >= 2 + + # Target field carries the PR URL + assert "pull/42" in report.target + + # Critic ran + assert critique.suggestions + + +@pytest.mark.asyncio +async def test_pr_audit_empty_description(stubbed_unified): + """An empty PR description must produce a structured no-op report.""" + pr = PrSpec( + owner="example", + repo="empty", + number=1, + title="", + body="", + diff="", + changed_files=[], + url="https://github.com/example/empty/pull/1", + ) + orch = PrAuditOrchestrator(unified=stubbed_unified) + report, critique = await orch.audit(pr) + assert report.verdicts == [] + assert report.errors and "empty" in report.errors[0].lower() + assert critique.weak_verdicts == [] diff --git a/tests/fixtures/sample_pr/diff.patch b/tests/fixtures/sample_pr/diff.patch new file mode 100644 index 0000000..e4b67e7 --- /dev/null +++ b/tests/fixtures/sample_pr/diff.patch @@ -0,0 +1,43 @@ +diff --git a/widgetlib/cli.py b/widgetlib/cli.py +@@ -1,5 +1,12 @@ + import argparse + ++def main(): ++ parser = argparse.ArgumentParser(prog="widgetctl") ++ parser.add_argument("--bind", default="127.0.0.1:8000") ++ args = parser.parse_args() ++ print(f"Starting widgetctl on {args.bind}") ++ ++ + def _legacy(): + return None +diff --git a/widgetlib/widget.py b/widgetlib/widget.py +@@ -1,7 +1,12 @@ + class Widget: + def __init__(self, payload): + self.payload = payload + + def render(self): + return f"
{self.payload}
" ++ ++ def render_html(self): ++ return f"
{self.payload}
" +diff --git a/widgetlib/telemetry.py b/widgetlib/telemetry.py +@@ -0,0 +1,6 @@ ++import urllib.request ++ ++ ++def emit_telemetry(event): ++ urllib.request.urlopen("https://telemetry.example.com", data=event.encode()) ++ +diff --git a/tests/test_render.py b/tests/test_render.py +@@ -0,0 +1,9 @@ ++from widgetlib import Widget ++ ++ ++def test_render_html_returns_section_tag(): ++ w = Widget("hi") ++ assert "
" in w.render_html() ++ ++ ++ diff --git a/tests/fixtures/sample_pr/pr.json b/tests/fixtures/sample_pr/pr.json new file mode 100644 index 0000000..d832a61 --- /dev/null +++ b/tests/fixtures/sample_pr/pr.json @@ -0,0 +1,16 @@ +{ + "owner": "example", + "repo": "widgetlib", + "number": 42, + "title": "feat: dashboard CLI + render performance", + "body": "## Summary\n\nThis PR adds a `widgetctl --port 9000` CLI for launching the dashboard. It also adds a new `Widget.render_html` method and adds tests covering the new render path. The PR removes the deprecated `legacy_render` helper. Importantly, this PR introduces **no telemetry**.\n\n## Notes\n\n- Performance: render is now 3x faster on the bench.\n- Adds WIDGETLIB_DEBUG=1 flag for verbose logging.", + "base_sha": "aaaaaaa", + "head_sha": "bbbbbbb", + "changed_files": [ + "widgetlib/cli.py", + "widgetlib/widget.py", + "widgetlib/telemetry.py", + "tests/test_render.py" + ], + "url": "https://github.com/example/widgetlib/pull/42" +}