diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a4c7dedf..c7b936da 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,9 +8,9 @@ name: CI — Quality Gates on: push: - branches: [main, develop] + branches: [main, develop, "phalanx/ci-fix/**"] pull_request: - branches: [main, develop] + branches: [main, develop, ci-fixer-e2e-test] workflow_dispatch: # Cancel in-progress runs on new push to same branch diff --git a/alembic/versions/20260415_0001_ci_fix_context.py b/alembic/versions/20260415_0001_ci_fix_context.py new file mode 100644 index 00000000..c8664909 --- /dev/null +++ b/alembic/versions/20260415_0001_ci_fix_context.py @@ -0,0 +1,34 @@ +"""ci_fix_run: add pipeline_context_json for multi-agent shared state + +Revision ID: 20260415_0001 +Revises: 20260412_0005 +Create Date: 2026-04-15 +""" + +from __future__ import annotations + +import sqlalchemy as sa +from alembic import op + +revision = "20260415_0001" +down_revision = "20260412_0005" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Add pipeline_context_json — stores the full CIFixContext as a JSON blob. + # NULL for runs created before this migration; populated by new pipeline runs. + op.add_column( + "ci_fix_runs", + sa.Column( + "pipeline_context_json", + sa.Text(), + nullable=True, + comment="CIFixContext serialized as JSON — full multi-agent pipeline state", + ), + ) + + +def downgrade() -> None: + op.drop_column("ci_fix_runs", "pipeline_context_json") diff --git a/docker/sandbox/go/Dockerfile b/docker/sandbox/go/Dockerfile new file mode 100644 index 00000000..b93a6565 --- /dev/null +++ b/docker/sandbox/go/Dockerfile @@ -0,0 +1,16 @@ +FROM golang:1.22-alpine + +# Install staticcheck and golangci-lint for broader Go lint coverage +RUN go install honnef.co/go/tools/cmd/staticcheck@2024.1.0 && \ + go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.59.1 + +# Create non-root user +RUN adduser -D -u 1000 phalanx + +COPY ../reset.sh /phalanx/reset.sh +RUN chmod +x /phalanx/reset.sh + +RUN mkdir -p /workspace && chown phalanx:phalanx /workspace + +WORKDIR /workspace +USER phalanx diff --git a/docker/sandbox/node/Dockerfile b/docker/sandbox/node/Dockerfile new file mode 100644 index 00000000..07a2e6ed --- /dev/null +++ b/docker/sandbox/node/Dockerfile @@ -0,0 +1,19 @@ +FROM node:20-slim + +# Create non-root user +RUN useradd -m -u 1000 -s /bin/bash phalanx 2>/dev/null || true + +# Install common Node tooling used by the CI fixer. +RUN npm install -g \ + eslint@8.57.0 \ + typescript@5.4.5 \ + jest@29.7.0 \ + --no-fund --no-audit + +COPY ../reset.sh /phalanx/reset.sh +RUN chmod +x /phalanx/reset.sh + +RUN mkdir -p /workspace && chown phalanx:phalanx /workspace + +WORKDIR /workspace +USER phalanx diff --git a/docker/sandbox/python/Dockerfile b/docker/sandbox/python/Dockerfile new file mode 100644 index 00000000..02a1dce9 --- /dev/null +++ b/docker/sandbox/python/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.12-slim + +# Create non-root user +RUN useradd -m -u 1000 -s /bin/bash phalanx + +# Install pinned tool versions used by the CI fixer. +# Versions are chosen to match the most common customer constraints. +# When VersionParityAgent detects a mismatch, it installs the customer's +# pinned version inside the running container before executing. +RUN pip install --no-cache-dir \ + ruff==0.4.4 \ + mypy==1.10.0 \ + pytest==8.2.0 \ + pytest-asyncio==0.23.7 + +# Copy reset script +COPY ../reset.sh /phalanx/reset.sh +RUN chmod +x /phalanx/reset.sh + +# Workspace dir — populated via docker cp by SandboxProvisioner +RUN mkdir -p /workspace && chown phalanx:phalanx /workspace + +WORKDIR /workspace +USER phalanx diff --git a/docker/sandbox/reset.sh b/docker/sandbox/reset.sh new file mode 100644 index 00000000..6a7f2ab7 --- /dev/null +++ b/docker/sandbox/reset.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# Shared reset script — clears workspace and caches between fix runs. +# Runs inside the container via: docker exec {id} sh /phalanx/reset.sh +set -e +rm -rf /workspace/* 2>/dev/null || true +rm -rf /tmp/pip-* /tmp/npm-* /tmp/.cache /root/.cache 2>/dev/null || true +echo "done" diff --git a/docker/sandbox/rust/Dockerfile b/docker/sandbox/rust/Dockerfile new file mode 100644 index 00000000..212acdea --- /dev/null +++ b/docker/sandbox/rust/Dockerfile @@ -0,0 +1,15 @@ +FROM rust:1.77-slim + +# Install clippy and rustfmt (included in toolchain but ensure available) +RUN rustup component add clippy rustfmt + +# Create non-root user +RUN useradd -m -u 1000 -s /bin/bash phalanx + +COPY ../reset.sh /phalanx/reset.sh +RUN chmod +x /phalanx/reset.sh + +RUN mkdir -p /workspace && chown phalanx:phalanx /workspace + +WORKDIR /workspace +USER phalanx diff --git a/docs/MULTI_AGENT_CI_FIXER.md b/docs/MULTI_AGENT_CI_FIXER.md new file mode 100644 index 00000000..ae8369c9 --- /dev/null +++ b/docs/MULTI_AGENT_CI_FIXER.md @@ -0,0 +1,365 @@ +# Multi-Agent CI Fixer — Architecture & Phased Plan + +> **Status:** Design doc — pre-implementation +> **Author:** FORGE Tech Lead +> **Date:** 2026-04-15 + +--- + +## 1. Problem Statement + +The current CI fixer is a single-agent loop. It works for simple lint violations but has fundamental gaps: + +1. **No real environment** — it runs linters in a cloned workspace but never actually runs the app or tests +2. **Opens new PRs every run** — instead of committing to the existing failing PR +3. **Scoped to the CI log only** — doesn't know if the base branch is already broken +4. **No reproduction step** — fixes are applied without confirming the failure first +5. **One agent does everything** — no separation of concerns, hard to scale, hard to trust + +The fix isn't to patch these one at a time. The fix is a coordinated multi-agent pipeline. + +--- + +## 2. The Mental Model — How a Sr. Staff Engineer Actually Works + +When a senior engineer sees a red CI build: + +1. **Read the log** — understand exactly what failed and why +2. **Reproduce it locally** — run the exact same command CI ran, confirm it fails +3. **Fix it** — make the targeted change +4. **Validate in the same environment** — run the command again, confirm it passes +5. **Push to the same PR** — new commit, not a new PR +6. **Done** — CI goes green on the next run + +This is the workflow the multi-agent system must replicate. Every agent maps to one of these steps. + +--- + +## 3. Agent Roster & Responsibilities + +``` +CI Failure Event + │ + ▼ +┌─────────────────┐ +│ Log Analyst │ — parse CI logs → StructuredFailure + reproducer_cmd +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Root Cause │ — classify tier, stack, confidence, escalation decision +│ Agent │ +└────────┬────────┘ + │ + ┌────┴────┐ + │ │ + ▼ ▼ +[L1: Auto] [L2: Escalate → comment on PR, done] + │ + ▼ +┌─────────────────┐ +│ Sandbox │ — detect stack, spin container from pre-warmed image +│ Provisioner │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Reproducer │ — run reproducer_cmd in sandbox, confirm failure +│ Agent │ +└────────┬────────┘ + │ + ┌────┴──────────┐ + │ │ + ▼ ▼ +[Confirmed] [Not reproduced → flaky/env issue → comment, done] + │ + ▼ +┌─────────────────┐ +│ Fix Agent │ — apply fix, run validation in same sandbox +│ (Claude Opus) │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Verifier │ — smoke test the app, confirm nothing else broke +│ Agent │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Commit Agent │ — push commit to EXISTING PR (not a new one) +└─────────────────┘ +``` + +--- + +## 4. Agent Specifications + +### 4.1 Log Analyst +- **Model:** GPT-4.1 (fast, cheap, structured extraction) +- **Input:** raw CI log text +- **Output:** `StructuredFailure` + ```python + @dataclass + class StructuredFailure: + tool: str # "ruff", "pytest", "mypy", "tsc", etc. + failure_type: str # "lint", "test_regression", "build", "type_error" + errors: list[ParsedError] + reproducer_cmd: str # exact command CI ran: "ruff check phalanx/ tests/" + failing_files: list[str] + log_excerpt: str + confidence: float + ``` +- **Existing code:** largely maps to current `LogParser` + `LLMClassifier` — refactor, don't rewrite + +### 4.2 Root Cause Agent +- **Model:** GPT-4.1 +- **Input:** `StructuredFailure` + file contents of failing files +- **Output:** `ClassifiedFailure` + ```python + @dataclass + class ClassifiedFailure: + tier: Literal["L1_auto", "L2_escalate"] + root_cause: str + hypothesis: str + stack: str # "python", "node", "go", "java", "rust", "unknown" + confidence: float + escalation_reason: str # populated if tier == L2 + ``` +- **L1 criteria:** lint violations, unused imports, formatting, simple type annotation fixes +- **L2 criteria:** test regression, logic bug, unknown stack, low confidence (<0.7) + +### 4.3 Sandbox Provisioner +- **Model:** None — fully deterministic +- **Input:** repo path + `ClassifiedFailure.stack` +- **Output:** running Docker container ID + workspace path +- **Stack detection order:** + 1. File existence: `pyproject.toml` → python, `package.json` → node, `go.mod` → go, etc. + 2. CI log hints: if detection fails, parse CI log for install commands + 3. LLM fallback: give GPT-4.1 the root dir listing + CI log +- **Pre-warmed images on prod:** + - `phalanx-sandbox:python` — python 3.12, pip, ruff, mypy, pytest + - `phalanx-sandbox:node` — node 22, npm, yarn, eslint, tsc + - `phalanx-sandbox:go` — go 1.22+ + - `phalanx-sandbox:multi` — python + node combined +- **Fallback:** if stack unknown after LLM → skip to Escalate path + +### 4.4 Reproducer Agent +- **Model:** Claude Opus 4.6 (tool use: `run_command`) +- **Input:** sandbox container, `reproducer_cmd` +- **Output:** `ReproductionResult` + ```python + @dataclass + class ReproductionResult: + confirmed: bool + exit_code: int + output: str + verdict: Literal["confirmed", "flaky", "env_mismatch", "timeout"] + ``` +- **Logic:** + - Run `reproducer_cmd` in sandbox + - If it fails with same error → `confirmed` + - If it passes → `flaky` (env issue, not code bug) + - If it fails with a *different* error → `env_mismatch` (wrong stack/deps) + - If timeout → escalate + +### 4.5 Fix Agent +- **Model:** Claude Opus 4.6 (tool use: `read_file`, `write_file`, `run_command`, `finish`) +- **Input:** `StructuredFailure`, `ReproductionResult`, sandbox container +- **Output:** `VerifiedPatch` + ```python + @dataclass + class VerifiedPatch: + files_modified: list[str] + validation_cmd: str + validation_output: str + success: bool + ``` +- **Constraints (unchanged from current design):** + - write_file: empty-content guard, 70% shrink guard + - sed/awk always available for large files + - Full-repo validation before declaring success + - Max 12 turns + +### 4.6 Verifier Agent +- **Model:** Claude Opus 4.6 +- **Input:** sandbox container, `VerifiedPatch`, stack type +- **Output:** `VerificationResult` +- **What it does:** + - For Python: `pytest --tb=short -q` (no coverage, just pass/fail) + - For Node: `npm test` or `npm run lint` + - For unknown: skip (don't block on what we can't verify) +- **Phase 1:** optional/best-effort — don't block commit if verifier times out +- **Phase 2:** mandatory gate for test regression failures + +### 4.7 Commit Agent +- **Model:** None — deterministic git ops +- **Input:** `VerifiedPatch`, original PR info +- **Output:** commit SHA pushed to existing PR branch +- **Key behavior:** + - Look up open Phalanx fix PRs for this branch — if one exists, push to it + - If none exists, open one (draft, targeting the failing branch) + - Never open a second fix PR for the same branch + - Commit message: structured, references the original CI run ID + +--- + +## 5. Shared Context Object + +All agents read from and write to a single `CIFixContext` object, persisted in DB: + +```python +@dataclass +class CIFixContext: + # Identity + ci_fix_run_id: UUID + repo: str + branch: str + commit_sha: str + original_build_id: str + + # Agent outputs (written as pipeline progresses) + structured_failure: StructuredFailure | None + classified_failure: ClassifiedFailure | None + sandbox_id: str | None + reproduction_result: ReproductionResult | None + verified_patch: VerifiedPatch | None + verification_result: VerificationResult | None + commit_sha_fix: str | None + + # Metadata + started_at: datetime + completed_at: datetime | None + final_status: Literal["fixed", "escalated", "flaky", "env_mismatch", "failed"] + pr_comment_posted: bool +``` + +This object is inspectable at any point. `GET /ci-fix-runs/{id}/context` returns the full pipeline state. No black boxes. + +--- + +## 6. Fallback Ladder + +Every exit path produces a useful artifact. No silent failures. + +| Situation | Action | +|-----------|--------| +| Can reproduce + can fix | Verified patch committed to existing PR | +| Can reproduce + can't fix (max turns) | Root cause comment on PR, engineer knows exactly what to look at | +| Cannot reproduce (passes in sandbox) | "Looks flaky — reproduced cleanly locally. Recommend re-running CI." | +| Unknown stack (after LLM fallback) | Structured failure analysis comment + stack hypothesis | +| Base branch already broken | "Base branch has pre-existing failures. Fix those first." | +| Sandbox provision fails | Fall back to current workspace-only mode (no env validation) | +| Any agent timeout | Escalate with partial context, never hang | + +--- + +## 7. Sandbox Architecture + +### Pre-warmed images +Built once, stored on prod server. Rebuilt weekly via cron. + +```dockerfile +# phalanx-sandbox:python +FROM python:3.12-slim +RUN pip install ruff mypy pytest pytest-asyncio pytest-cov +# No app code — that gets mounted at runtime +``` + +### Container lifecycle +``` +provision() → docker run -d --rm -v {workspace}:/app -w /app phalanx-sandbox:python +install() → docker exec {id} pip install -e ".[dev]" (~30s, cached layer) +run(cmd) → docker exec {id} {cmd} (~1-5s per command) +teardown() → docker stop {id} (--rm handles cleanup) +``` + +### Dep caching +Cache the installed dep layer per `requirements hash`. If `pyproject.toml` hasn't changed since last run, skip `pip install` — use cached layer. Reduces install time from ~30s to ~2s on repeat runs. + +### Security +- Network isolated: `--network none` after dep install +- No write access outside `/app` +- Hard CPU/memory limits: `--cpus 1 --memory 2g` +- Hard timeout: 5 minutes total per sandbox lifecycle + +--- + +## 8. Quality Gates + +Every agent has unit tests. Pipeline has integration tests. Coverage target: **≥80%**. + +| Test type | What it covers | +|-----------|----------------| +| Unit — Log Analyst | Parses known log formats correctly, structured output | +| Unit — Root Cause Agent | Classification tiers, confidence thresholds, escalation | +| Unit — Sandbox Provisioner | Stack detection logic, all fallback paths | +| Unit — Reproducer Agent | Confirmed/flaky/env_mismatch verdicts | +| Unit — Fix Agent | Existing agentic loop tests + new sandbox integration | +| Unit — Verifier Agent | Pass/fail/skip verdicts per stack type | +| Unit — Commit Agent | PR continuity (no duplicate PRs), commit format | +| Integration — full pipeline | End-to-end with a real ruff failure, real sandbox, real commit | +| E2E — MESMD | No open CI failures after pipeline runs | + +--- + +## 9. Phased Plan + +### Phase 1 — Solid Foundation (current sprint) +**Goal:** Clean up what exists, establish the context object, add PR continuity. + +- [ ] Refactor `CIFixerAgent` into the DAG agent pattern (Log Analyst + Root Cause already exist, formalize them) +- [ ] Introduce `CIFixContext` as the shared state object (DB-backed) +- [ ] Commit Agent: check for existing fix PRs before opening new ones +- [ ] Fix CI workflow triggers (PR #8 — in flight) +- [ ] ≥80% unit test coverage on all existing ci_fixer modules +- [ ] `GET /ci-fix-runs/{id}/context` endpoint — full pipeline state inspectable + +### Phase 2 — Sandbox + Reproduction (next sprint) +**Goal:** The pipeline can reproduce failures, not just parse them. + +- [ ] Build pre-warmed sandbox images (python, node, multi) +- [ ] `SandboxProvisioner` — stack detection + container lifecycle +- [ ] `ReproducerAgent` — run `reproducer_cmd` in sandbox, produce `ReproductionResult` +- [ ] Wire into existing pipeline: reproduction step before Fix Agent +- [ ] Flaky detection: if sandbox passes, post "looks flaky" comment, skip fix +- [ ] Dep layer caching per repo +- [ ] ≥80% test coverage on new agents + +### Phase 3 — Verifier + Full E2E (sprint after) +**Goal:** The pipeline can confirm the app works, not just that linting passes. + +- [ ] `VerifierAgent` — run test suite in sandbox post-fix +- [ ] Sandbox network isolation + resource limits +- [ ] Unknown stack LLM fallback path +- [ ] Base branch health check before starting fix +- [ ] Full pipeline integration test (real repo, real sandbox, real CI) +- [ ] **MESMD proof:** trigger real CI failures, confirm pipeline fixes them all, CI stays green + +--- + +## 10. Success Criteria + +Phase 1 done when: +- No duplicate fix PRs ever opened for the same branch +- `CIFixContext` fully populated and queryable via API +- 80% unit test coverage across all ci_fixer modules + +Phase 2 done when: +- Reproducer Agent correctly classifies confirmed vs flaky vs env_mismatch +- Flaky failures generate a comment instead of a bad fix PR +- Sandbox spins up in <5 seconds (from pre-warmed image) + +Phase 3 done when: +- MESMD app: zero open CI failures after pipeline runs end-to-end +- Verifier Agent confirms app smoke tests pass post-fix +- Full pipeline runs in <3 minutes for a lint failure + +--- + +## 11. What We Are NOT Building + +- Auto-merge — fix PRs are always draft, always human-approved before merge +- Fix for logic bugs — if Root Cause Agent classifies it as a test regression the engineer introduced, it escalates, it does not fix +- Multi-repo coordination — one pipeline per repo, no cross-repo fixes +- Jenkins support — Phase 2 at earliest diff --git a/docs/sandbox_pool_design.md b/docs/sandbox_pool_design.md new file mode 100644 index 00000000..ebf90ae8 --- /dev/null +++ b/docs/sandbox_pool_design.md @@ -0,0 +1,272 @@ +# Sandbox Pool Design — Isolated Execution for CI Fixer + +## Status: Approved for implementation (Phase 3) + +## Problem + +`SandboxProvisioner.provision()` is currently a no-op — it returns a descriptor but +never starts a container. The reproducer and verifier run commands as local +subprocesses on the FORGE host. This means: + +- No env isolation: host ruff/mypy version may differ from the repo's pinned version +- No filesystem isolation: a broken fix can dirty the host workspace +- No resource limits: a hung test can block other fix runs +- `docker run` cold-start (image pull + container create) costs 5–30s per fix run + if we naively start a container on demand + +--- + +## Design: Pre-warmed Pool + +### Core idea + +Never cold-start a container during a fix run. Keep a small pool of ready containers +per stack, already running with tools pre-installed. A fix run checks one out, uses +it, and the pool refills asynchronously in the background. + +``` +┌────────────────────────────────────────────────────────────┐ +│ SandboxPool (lazy singleton, init after Celery fork) │ +│ │ +│ python: [🟢 ready] [🟢 ready] [🟡 warming] │ +│ node: [🟢 ready] [🟡 warming] │ +│ go: [🟢 ready] │ +│ rust: [🟢 ready] │ +└────────────────────────────────────────────────────────────┘ + │ checkout(stack) ↑ checkin(container) + ▼ │ +┌────────────────────────────────────────────────────────────┐ +│ Fix run (ReproducerAgent + VerifierAgent) │ +│ 1. pool.checkout("python") → PooledContainer │ +│ 2. bind-mount workspace into container │ +│ 3. docker exec reproducer_cmd │ +│ 4. docker exec fix validator │ +│ 5. docker exec verifier (ruff/pytest/etc.) │ +│ 6. pool.checkin(container) → reset + async refill │ +└────────────────────────────────────────────────────────────┘ +``` + +--- + +## Components + +### 1. `PooledContainer` (dataclass) + +```python +@dataclass +class PooledContainer: + container_id: str # Docker container ID (short hash) + stack: str # "python" | "node" | "go" | "rust" + image: str # e.g. "phalanx-sandbox-python:latest" + checked_out_at: float # monotonic time — for reaper timeout detection + healthy: bool = True +``` + +### 2. `SandboxPool` (async singleton) + +One `asyncio.Queue` per stack. Each queue holds ready `PooledContainer` objects. + +Key methods: + +| Method | What it does | +|--------|-------------| +| `_warmup()` | Called once after lazy init. Starts `min_size` containers per stack. | +| `checkout(stack, timeout)` | `asyncio.wait_for(queue.get(), timeout)`. Returns container or raises `SandboxUnavailableError`. | +| `checkin(container)` | Runs reset script inside container, puts it back in queue. Triggers async `_refill`. | +| `_start_container(stack)` | `docker run -d --rm --user phalanx --no-new-privileges -v /tmp:/hosttmp {image} sleep infinity` | +| `_health_check(container)` | `docker exec {id} echo ok`. Returns bool. | +| `_reset_container(container)` | `docker exec {id} /phalanx/reset.sh` — clears /workspace, /tmp, pip/npm cache. | +| `_reaper()` | Background task. Every 60s: kill containers held > `max_hold_seconds`. Replace them. | +| `shutdown()` | Kill all containers in all queues. Called on worker shutdown. | + +### 3. `SandboxResult` (upgraded) + +Two new fields added to existing dataclass: + +```python +container_id: str = "" # populated when pool checkout succeeds +mount_path: str = "/workspace" # path inside the container +``` + +`available=True` + `container_id != ""` → real Docker exec path +`available=True` + `container_id == ""` → pool timeout, local subprocess fallback +`available=False` → sandbox_enabled=False, local subprocess fallback + +### 4. `SandboxProvisioner.provision()` (upgraded) + +```python +async def provision(workspace_path, stack_hint=None) -> SandboxResult | None: + if not settings.sandbox_enabled: + return None + stack = stack_hint or self.detect_stack(workspace_path) + image = _STACK_IMAGES[stack] + sandbox_id = f"phalanx-sandbox-{uuid.uuid4().hex[:8]}" + + pool = await get_sandbox_pool() # lazy singleton, safe after fork + try: + container = await pool.checkout(stack, timeout=settings.sandbox_checkout_timeout_seconds) + # bind-mount workspace into the container + await pool.mount_workspace(container, workspace_path) + return SandboxResult( + sandbox_id=sandbox_id, stack=stack, image=image, + workspace_path=str(workspace_path), + container_id=container.container_id, + ) + except SandboxUnavailableError: + log.warning("ci_fixer.sandbox_pool_exhausted", stack=stack) + return SandboxResult( + sandbox_id=sandbox_id, stack=stack, image=image, + workspace_path=str(workspace_path), + available=False, # → local subprocess fallback + ) +``` + +### 5. `ReproducerAgent._run_subprocess()` (upgraded) + +When `sandbox_result` has a `container_id`, wrap the command: + +```python +if sandbox_result and sandbox_result.container_id: + cmd = f"docker exec {sandbox_result.container_id} sh -c {shlex.quote(cmd)}" +``` + +Otherwise falls through to current `asyncio.create_subprocess_shell` behavior. + +### 6. `VerifierAgent._run_cmd()` (upgraded) + +Same pattern — when `container_id` is set, prefix args with `["docker", "exec", container_id]`. + +--- + +## Stack Images + +Custom images with tools pre-installed at pinned versions. Stored in `docker/sandbox/`. + +``` +docker/sandbox/ + python/Dockerfile + node/Dockerfile + go/Dockerfile + rust/Dockerfile + reset.sh # shared reset script copied into every image +``` + +### `reset.sh` + +```bash +#!/bin/bash +# Clear workspace and caches between fix runs. +rm -rf /workspace/* +rm -rf /tmp/pip-* /tmp/npm-* /root/.cache 2>/dev/null || true +``` + +### Python image example + +```dockerfile +FROM python:3.12-slim +RUN useradd -m -u 1000 phalanx +RUN pip install --no-cache-dir ruff==0.4.4 mypy==1.10.0 pytest==8.2.0 +COPY reset.sh /phalanx/reset.sh +RUN chmod +x /phalanx/reset.sh +WORKDIR /workspace +USER phalanx +``` + +--- + +## Settings (new keys) + +``` +SANDBOX_POOL_MIN_SIZE=1 # containers to pre-warm per stack at startup +SANDBOX_POOL_MAX_SIZE=2 # max simultaneous checked-out containers per stack +SANDBOX_CHECKOUT_TIMEOUT_SECONDS=30 # wait for pool slot before falling back +SANDBOX_MAX_HOLD_SECONDS=300 # reaper kills containers held longer than this +SANDBOX_REAPER_INTERVAL_SECONDS=60 # how often reaper runs +``` + +Setting `SANDBOX_POOL_MIN_SIZE=0` disables pre-warming — containers start cold on first use. +Setting `SANDBOX_ENABLED=false` disables the entire pool (existing behavior). + +--- + +## Pool initialization and Celery fork safety + +**Problem**: Celery pre-forks workers. If the pool is a module-level singleton +initialized before fork, child workers inherit a stale event loop reference → all +`await` calls inside the pool fail. + +**Solution**: Lazy init behind an `asyncio.Lock`. + +```python +_pool_instance: SandboxPool | None = None +_pool_lock: asyncio.Lock | None = None + +async def get_sandbox_pool() -> SandboxPool: + global _pool_instance, _pool_lock + if _pool_lock is None: + _pool_lock = asyncio.Lock() # created inside the child's event loop + async with _pool_lock: + if _pool_instance is None: + _pool_instance = SandboxPool() + await _pool_instance._warmup() + return _pool_instance +``` + +First call to `provision()` in each Celery child worker triggers this. +Subsequent calls in the same worker reuse the warm pool. + +--- + +## Fallback chain (no regressions possible) + +``` +sandbox_enabled=False + → return None → reproducer/verifier: local subprocess (today's behavior) + +sandbox_enabled=True, pool checkout times out (all slots busy) + → SandboxResult(available=False) → local subprocess fallback + +sandbox_enabled=True, Docker daemon not found + → SandboxResult(available=False) → local subprocess fallback + +sandbox_enabled=True, container health check fails + → discard container, start fresh one, retry checkout once + → if retry fails: SandboxResult(available=False) → local subprocess fallback + +sandbox_enabled=True, container_id populated + → docker exec {cmd} → real isolated execution +``` + +Every error path degrades to local subprocess. Fix runs never fail due to sandbox +infrastructure issues. + +--- + +## What is NOT in scope (future) + +- **Network isolation** (`--network none`) — useful but breaks `pip install` fallback +- **CPU/memory cgroups** (`--cpus`, `--memory`) — nice-to-have, not blocking +- **Real Docker socket forwarding** for nested Docker — not needed for lint/type/test tools +- **Multi-host pool** (pool across multiple FORGE workers) — Redis-backed queue, + post-MVP when horizontal scaling is needed + +--- + +## File map + +| File | Change | +|------|--------| +| `phalanx/ci_fixer/sandbox_pool.py` | **NEW** — SandboxPool, PooledContainer, get_sandbox_pool | +| `phalanx/ci_fixer/sandbox.py` | **MODIFIED** — SandboxResult gets container_id/mount_path; provision() uses pool | +| `phalanx/ci_fixer/reproducer.py` | **MODIFIED** — _run_subprocess wraps with docker exec when container_id set | +| `phalanx/ci_fixer/verifier.py` | **MODIFIED** — _run_cmd wraps with docker exec when container_id set | +| `phalanx/config/settings.py` | **MODIFIED** — 5 new SANDBOX_POOL_* settings | +| `docker/sandbox/python/Dockerfile` | **NEW** | +| `docker/sandbox/node/Dockerfile` | **NEW** | +| `docker/sandbox/go/Dockerfile` | **NEW** | +| `docker/sandbox/rust/Dockerfile` | **NEW** | +| `docker/sandbox/reset.sh` | **NEW** | +| `tests/unit/test_sandbox_pool.py` | **NEW** — ≥80% coverage on sandbox_pool.py | +| `tests/unit/test_ci_fixer_sandbox.py` | **MODIFIED** — cover pool checkout path | +| `tests/unit/test_ci_fixer_reproducer.py` | **MODIFIED** — cover docker exec path | +| `tests/unit/test_ci_fixer_verifier.py` | **MODIFIED** — cover docker exec path | diff --git a/phalanx/agents/ci_fixer.py b/phalanx/agents/ci_fixer.py index 7bdc1d32..aa6a5475 100644 --- a/phalanx/agents/ci_fixer.py +++ b/phalanx/agents/ci_fixer.py @@ -37,13 +37,22 @@ from sqlalchemy import select, update from phalanx.agents.base import AgentResult, BaseAgent -from phalanx.agents.soul import CI_FIXER_SOUL from phalanx.ci_fixer.analyst import FilePatch, FixPlan, RootCauseAnalyst +from phalanx.ci_fixer.context import ( + CIFixContext, + ClassifiedFailure, + ReproductionResult, + StructuredFailure, + VerifiedPatch, +) from phalanx.ci_fixer.events import CIFailureEvent from phalanx.ci_fixer.log_fetcher import get_log_fetcher from phalanx.ci_fixer.log_parser import ParsedLog, parse_log +from phalanx.ci_fixer.reproducer import ReproducerAgent +from phalanx.ci_fixer.sandbox import SandboxProvisioner from phalanx.ci_fixer.suppressor import is_flaky_suppressed, should_use_history from phalanx.ci_fixer.validator import validate_fix +from phalanx.ci_fixer.verifier import VerifierAgent from phalanx.ci_fixer.version_parity import ( VersionParityResult, check_version_parity, @@ -51,7 +60,7 @@ should_auto_merge, ) from phalanx.config.settings import get_settings -from phalanx.db.models import CIFailureFingerprint, CIFlakyPattern, CIFixRun, CIIntegration +from phalanx.db.models import CIFailureFingerprint, CIFixRun, CIFlakyPattern, CIIntegration from phalanx.db.session import get_db from phalanx.queue.celery_app import celery_app @@ -114,6 +123,16 @@ async def _execute_inner(self) -> AgentResult: if integration is None: return AgentResult(success=False, output={}, error="CIIntegration not found") + # ── 1b. Initialize shared pipeline context ─────────────────────────── + ctx = CIFixContext( + ci_fix_run_id=self.ci_fix_run_id, + repo=ci_run.repo_full_name, + branch=ci_run.branch, + commit_sha=ci_run.commit_sha, + original_build_id=ci_run.ci_build_id, + ) + await self._persist_context(ctx) + # ── 2. Fetch raw logs ───────────────────────────────────────────────── event = CIFailureEvent( provider=ci_run.ci_provider, @@ -152,6 +171,22 @@ async def _execute_inner(self) -> AgentResult: # the hash is valuable for V2 history queries. await self._persist_fingerprint(fingerprint) + # Update shared context with structured failure + ctx.structured_failure = StructuredFailure( + tool=parsed.tool, + failure_type=parsed.failure_type if hasattr(parsed, "failure_type") else "unknown", + reproducer_cmd="", # populated by classifier + errors=[], + failing_files=list(parsed.failing_files) if hasattr(parsed, "failing_files") else [], + log_excerpt=raw_log[:2000], + ) + ctx.classified_failure = ClassifiedFailure( + tier="L1_auto", + root_cause="", + stack="python", + ) + await self._persist_context(ctx) + await self._trace( "decision", f"**Parsed log** — tool: `{parsed.tool}`\n\n{parsed.as_text()}", @@ -201,274 +236,401 @@ async def _execute_inner(self) -> AgentResult: await self._mark_failed(ci_run, "repo_clone_failed") return AgentResult(success=False, output={}, error="repo clone failed") - # ── 5. Analyst loop: confirm root cause → apply → validate ──────────── - analyst = RootCauseAnalyst( - call_llm=self._call_claude, - history_lookup=self._lookup_fix_history, - ) - fix_plan: FixPlan | None = None - validation_passed = False - validation_tool_version = "" - current_parsed = parsed + # ── Phase 2: Sandbox provisioning + failure reproduction ────────────── + provisioner = SandboxProvisioner() + sandbox_result = await provisioner.provision(workspace) - for iteration in range(1, _MAX_ITERATIONS + 1): - self._log.info("ci_fixer.analyst_iteration", iteration=iteration) + if sandbox_result: + ctx.sandbox_id = sandbox_result.sandbox_id + ctx.sandbox_stack = sandbox_result.stack + await self._persist_context(ctx) - fix_plan = analyst.analyze(current_parsed, workspace, fingerprint_hash=fingerprint) - self._log.info( - "ci_fixer.fix_plan", - confidence=fix_plan.confidence, - root_cause=fix_plan.root_cause, - patches=len(fix_plan.patches), - needs_test=fix_plan.needs_new_test, + try: + reproducer = ReproducerAgent() + reproduction_result = await reproducer.reproduce( + reproducer_cmd=ctx.structured_failure.reproducer_cmd, + workspace_path=workspace, + sandbox_result=sandbox_result, + structured_failure=ctx.structured_failure, + timeout_seconds=settings.sandbox_timeout_seconds, ) + ctx.reproduction_result = reproduction_result + await self._persist_context(ctx) - await self._trace( - "reflection", - f"**Root cause:** {fix_plan.root_cause}\n" - f"**Confidence:** {fix_plan.confidence}\n" - f"**Patches:** {len(fix_plan.patches)} file(s)", - {"confidence": fix_plan.confidence, "iteration": iteration}, + if reproduction_result.verdict == "flaky": + self._log.info( + "ci_fixer.flaky_reproduction", + repo=ci_run.repo_full_name, + tool=parsed.tool, + ) + ctx.complete("flaky") + await self._persist_context(ctx) + await self._mark_failed(ci_run, "flaky") + return AgentResult( + success=False, + output={"reason": "flaky", "tool": parsed.tool}, + ) + + if reproduction_result.verdict == "env_mismatch": + self._log.warning( + "ci_fixer.env_mismatch", + repo=ci_run.repo_full_name, + tool=parsed.tool, + ) + ctx.complete("escalated", error="env_mismatch: reproducer ran different failure") + await self._persist_context(ctx) + await self._mark_failed(ci_run, "env_mismatch") + return AgentResult( + success=False, + output={"reason": "env_mismatch", "tool": parsed.tool}, + ) + + # ── 5. Analyst loop: confirm root cause → apply → validate ──────────── + analyst = RootCauseAnalyst( + call_llm=self._call_claude, + history_lookup=self._lookup_fix_history, ) + fix_plan: FixPlan | None = None + validation_passed = False + validation_tool_version = "" + current_parsed = parsed + + for iteration in range(1, _MAX_ITERATIONS + 1): + self._log.info("ci_fixer.analyst_iteration", iteration=iteration) - if not fix_plan.is_actionable: + fix_plan = analyst.analyze(current_parsed, workspace, fingerprint_hash=fingerprint) self._log.info( - "ci_fixer.low_confidence", + "ci_fixer.fix_plan", + confidence=fix_plan.confidence, root_cause=fix_plan.root_cause, - iteration=iteration, + patches=len(fix_plan.patches), + needs_test=fix_plan.needs_new_test, ) - break - # Guard: total line delta across all patches - total_delta = sum(abs(p.delta) for p in fix_plan.patches) - if total_delta > _MAX_TOTAL_LINE_DELTA: - self._log.warning( - "ci_fixer.patch_delta_exceeded", - total_delta=total_delta, - max_allowed=_MAX_TOTAL_LINE_DELTA, + await self._trace( + "reflection", + f"**Root cause:** {fix_plan.root_cause}\n" + f"**Confidence:** {fix_plan.confidence}\n" + f"**Patches:** {len(fix_plan.patches)} file(s)", + {"confidence": fix_plan.confidence, "iteration": iteration}, ) - fix_plan = FixPlan( - confidence="low", - root_cause=f"Patch too large ({total_delta} lines changed, max {_MAX_TOTAL_LINE_DELTA})", + + if not fix_plan.is_actionable: + self._log.info( + "ci_fixer.low_confidence", + root_cause=fix_plan.root_cause, + iteration=iteration, + ) + break + + # Guard: total line delta across all patches + total_delta = sum(abs(p.delta) for p in fix_plan.patches) + if total_delta > _MAX_TOTAL_LINE_DELTA: + self._log.warning( + "ci_fixer.patch_delta_exceeded", + total_delta=total_delta, + max_allowed=_MAX_TOTAL_LINE_DELTA, + ) + fix_plan = FixPlan( + confidence="low", + root_cause=f"Patch too large ({total_delta} lines changed, max {_MAX_TOTAL_LINE_DELTA})", + ) + break + + # Guard: number of files + if len(fix_plan.patches) > _MAX_FILES_CHANGED: + self._log.warning( + "ci_fixer.too_many_files", + files=len(fix_plan.patches), + max_allowed=_MAX_FILES_CHANGED, + ) + fix_plan = FixPlan( + confidence="low", + root_cause=f"Fix touches {len(fix_plan.patches)} files (max {_MAX_FILES_CHANGED})", + ) + break + + # Apply patches + files_written = self._apply_patches(workspace, fix_plan.patches) + if not files_written: + self._log.warning("ci_fixer.no_files_written") + fix_plan = FixPlan( + confidence="low", + root_cause="Patch application failed — hunk mismatch or guard rejection", + ) + break + + # Validate + validation = validate_fix(current_parsed, workspace, original_parsed=parsed) + validation_tool_version = validation.tool_version + self._log.info( + "ci_fixer.validation", + passed=validation.passed, + tool=validation.tool, + tool_version=validation_tool_version, + regressions=len(getattr(validation, "regressions", []) or []), + iteration=iteration, ) - break - # Guard: number of files - if len(fix_plan.patches) > _MAX_FILES_CHANGED: - self._log.warning( - "ci_fixer.too_many_files", - files=len(fix_plan.patches), - max_allowed=_MAX_FILES_CHANGED, + if validation.passed: + validation_passed = True + self._log.info("ci_fixer.validation_passed", files=files_written) + break + else: + self._log.warning( + "ci_fixer.validation_failed", + iteration=iteration, + output=validation.output[:300], + ) + await self._trace( + "uncertainty", + f"Validation failed (iteration {iteration}):\n```\n{validation.output[:500]}\n```", + {"iteration": iteration}, + ) + if iteration < _MAX_ITERATIONS: + # Re-parse the validation output for the next iteration + retry_parsed = parse_log(validation.output) + if retry_parsed.has_errors: + current_parsed = retry_parsed + + # ── 6. Check final plan ─────────────────────────────────────────────── + if not fix_plan or not fix_plan.is_actionable or not validation_passed: + reason = ( + "low_confidence" + if (not fix_plan or not fix_plan.is_actionable) + else "validation_failed" ) - fix_plan = FixPlan( - confidence="low", - root_cause=f"Fix touches {len(fix_plan.patches)} files (max {_MAX_FILES_CHANGED})", + await self._mark_failed_with_fields( + ci_run, + reason=reason, + fingerprint_hash=fingerprint, + validation_tool_version=validation_tool_version, ) - break - - # Apply patches - files_written = self._apply_patches(workspace, fix_plan.patches) - if not files_written: - self._log.warning("ci_fixer.no_files_written") - fix_plan = FixPlan( - confidence="low", - root_cause="Patch application failed — hunk mismatch or guard rejection", + # Comment on the PR explaining why we couldn't fix it + if ci_run.pr_number and integration.github_token: + await self._comment_unable_to_fix( + integration=integration, + ci_run=ci_run, + reason=reason, + root_cause=fix_plan.root_cause if fix_plan else "", + tool=parsed.tool, + ) + return AgentResult( + success=False, + output={ + "reason": reason, + "root_cause": fix_plan.root_cause if fix_plan else "", + "tool": parsed.tool, + "fingerprint": fingerprint, + }, ) - break - # Validate - validation = validate_fix(current_parsed, workspace, original_parsed=parsed) - validation_tool_version = validation.tool_version - self._log.info( - "ci_fixer.validation", - passed=validation.passed, - tool=validation.tool, - tool_version=validation_tool_version, - regressions=len(getattr(validation, "regressions", []) or []), - iteration=iteration, + files_written = [p.path for p in fix_plan.patches] + + # ── 6b. Phase 4: Tool version parity check ──────────────────────────── + # Compare local tool version to the version that caused the failure. + # We use last_good_tool_version from the fingerprint as the "failure version" + # proxy (it was the version at the last successful fix — close enough for parity). + parity_result = await self._check_tool_version_parity( + fingerprint_hash=fingerprint, + local_version=validation_tool_version, ) + parity_ok = parity_result.ok - if validation.passed: - validation_passed = True - self._log.info("ci_fixer.validation_passed", files=files_written) - break - else: - self._log.warning( - "ci_fixer.validation_failed", - iteration=iteration, - output=validation.output[:300], - ) - await self._trace( - "uncertainty", - f"Validation failed (iteration {iteration}):\n```\n{validation.output[:500]}\n```", - {"iteration": iteration}, + # ── 7. Commit to safe branch (NEVER the author's branch) ───────────── + fix_branch = f"phalanx/ci-fix/{self.ci_fix_run_id}" + commit_result = await self._commit_to_safe_branch( + workspace=workspace, + source_branch=ci_run.branch, + fix_branch=fix_branch, + commit_message=( + f"fix(ci): resolve {parsed.tool} failure [{ci_run.ci_provider}]\n\n" + f"Root cause: {fix_plan.root_cause}\n" + f"Files: {', '.join(files_written)}\n" + f"Validated: {validation_tool_version}\n" + f"CI Fix Run: {self.ci_fix_run_id}" + ), + github_token=self._get_github_token(integration), + repo_full_name=ci_run.repo_full_name, + ) + commit_sha = commit_result.get("sha") + push_failed = commit_result.get("push_failed", False) + + if not commit_sha: + await self._mark_failed_with_fields( + ci_run, + reason=commit_result.get("error", "commit_failed"), + fingerprint_hash=fingerprint, + validation_tool_version=validation_tool_version, ) - if iteration < _MAX_ITERATIONS: - # Re-parse the validation output for the next iteration - retry_parsed = parse_log(validation.output) - if retry_parsed.has_errors: - current_parsed = retry_parsed - - # ── 6. Check final plan ─────────────────────────────────────────────── - if not fix_plan or not fix_plan.is_actionable or not validation_passed: - reason = "low_confidence" if (not fix_plan or not fix_plan.is_actionable) else "validation_failed" - await self._mark_failed_with_fields( - ci_run, - reason=reason, - fingerprint_hash=fingerprint, - validation_tool_version=validation_tool_version, + return AgentResult(success=False, output={}, error="commit failed") + + # ── 8. Open PR (draft or auto-merge depending on integration config) ──── + # Phase 4: auto-merge only if integration.auto_merge=True AND the + # fingerprint has enough successful fixes AND tool version parity is OK. + fingerprint_success = await self._get_fingerprint_success_count(fingerprint) + enable_auto_merge = should_auto_merge( + integration_auto_merge=getattr(integration, "auto_merge", False), + fingerprint_success_count=fingerprint_success, + min_success_count=getattr(integration, "min_success_count", 3), + parity_ok=parity_ok, ) - # Comment on the PR explaining why we couldn't fix it + + fix_pr_number: int | None = None + if not push_failed and integration.github_token: + # Phase 1: check for an existing Phalanx fix PR targeting this branch. + # If one exists, push the new commit to it instead of opening a second PR. + existing_pr = await self._find_existing_fix_pr(integration, ci_run) + if existing_pr: + self._log.info( + "ci_fixer.reusing_existing_fix_pr", + pr=existing_pr, + branch=ci_run.branch, + ) + fix_pr_number = existing_pr + else: + fix_pr_number = await self._open_draft_pr( + integration=integration, + ci_run=ci_run, + fix_branch=fix_branch, + files_written=files_written, + commit_sha=commit_sha, + tool=parsed.tool, + root_cause=fix_plan.root_cause, + parsed=parsed, + validation_tool_version=validation_tool_version, + enable_auto_merge=enable_auto_merge, + parity_notice=format_parity_notice(parity_result), + ) + + # ── 9. Comment on original PR ───────────────────────────────────────── if ci_run.pr_number and integration.github_token: - await self._comment_unable_to_fix( + await self._comment_on_pr( integration=integration, ci_run=ci_run, - reason=reason, - root_cause=fix_plan.root_cause if fix_plan else "", + files_written=files_written, + commit_sha=commit_sha, tool=parsed.tool, + root_cause=fix_plan.root_cause, + parsed=parsed, + fix_pr_number=fix_pr_number, + validation_tool_version=validation_tool_version, ) - return AgentResult( - success=False, - output={ - "reason": reason, - "root_cause": fix_plan.root_cause if fix_plan else "", - "tool": parsed.tool, - "fingerprint": fingerprint, - }, - ) - - files_written = [p.path for p in fix_plan.patches] - # ── 6b. Phase 4: Tool version parity check ──────────────────────────── - # Compare local tool version to the version that caused the failure. - # We use last_good_tool_version from the fingerprint as the "failure version" - # proxy (it was the version at the last successful fix — close enough for parity). - parity_result = await self._check_tool_version_parity( - fingerprint_hash=fingerprint, - local_version=validation_tool_version, - ) - parity_ok = parity_result.ok - - # ── 7. Commit to safe branch (NEVER the author's branch) ───────────── - fix_branch = f"phalanx/ci-fix/{self.ci_fix_run_id}" - commit_result = await self._commit_to_safe_branch( - workspace=workspace, - source_branch=ci_run.branch, - fix_branch=fix_branch, - commit_message=( - f"fix(ci): resolve {parsed.tool} failure [{ci_run.ci_provider}]\n\n" - f"Root cause: {fix_plan.root_cause}\n" - f"Files: {', '.join(files_written)}\n" - f"Validated: {validation_tool_version}\n" - f"CI Fix Run: {self.ci_fix_run_id}" - ), - github_token=self._get_github_token(integration), - repo_full_name=ci_run.repo_full_name, - ) - commit_sha = commit_result.get("sha") - push_failed = commit_result.get("push_failed", False) + # ── 10. Mark FIXED ──────────────────────────────────────────────────── + async with get_db() as session: + await session.execute( + update(CIFixRun) + .where(CIFixRun.id == self.ci_fix_run_id) + .values( + status="FIXED", + fix_commit_sha=commit_sha, + fix_branch=fix_branch, + fix_pr_number=fix_pr_number, + fingerprint_hash=fingerprint, + validation_tool_version=validation_tool_version, + tool_version_parity_ok=parity_ok, + completed_at=datetime.now(UTC), + ) + ) + await session.commit() - if not commit_sha: - await self._mark_failed_with_fields( - ci_run, - reason=commit_result.get("error", "commit_failed"), + # ── Phase 2: Store winning patches in fingerprint table for future reuse + await self._update_fingerprint_on_success( fingerprint_hash=fingerprint, - validation_tool_version=validation_tool_version, + patches=fix_plan.patches, + tool_version=validation_tool_version, + parsed_log=parsed, ) - return AgentResult(success=False, output={}, error="commit failed") - - # ── 8. Open PR (draft or auto-merge depending on integration config) ──── - # Phase 4: auto-merge only if integration.auto_merge=True AND the - # fingerprint has enough successful fixes AND tool version parity is OK. - fingerprint_success = await self._get_fingerprint_success_count(fingerprint) - enable_auto_merge = should_auto_merge( - integration_auto_merge=getattr(integration, "auto_merge", False), - fingerprint_success_count=fingerprint_success, - min_success_count=getattr(integration, "min_success_count", 3), - parity_ok=parity_ok, - ) - fix_pr_number: int | None = None - if not push_failed and integration.github_token: - fix_pr_number = await self._open_draft_pr( - integration=integration, - ci_run=ci_run, - fix_branch=fix_branch, - files_written=files_written, - commit_sha=commit_sha, - tool=parsed.tool, - root_cause=fix_plan.root_cause, - parsed=parsed, - validation_tool_version=validation_tool_version, - enable_auto_merge=enable_auto_merge, - parity_notice=format_parity_notice(parity_result), + # ── Update final context state ──────────────────────────────────────── + ctx.verified_patch = VerifiedPatch( + files_modified=files_written, + validation_cmd=validation_tool_version or "", + success=True, ) + # Phase 2 already sets ctx.reproduction_result earlier in the pipeline. + # Only set it here as a fallback if sandbox was disabled (still None). + if ctx.reproduction_result is None: + ctx.reproduction_result = ReproductionResult(verdict="skipped") + + # ── Phase 3: Broad verification (catch regressions post-fix) ───────── + verifier = VerifierAgent() + verification_result = await verifier.verify( + workspace_path=workspace, + stack=ctx.sandbox_stack or "python", + sandbox_result=sandbox_result, + timeout_seconds=settings.sandbox_timeout_seconds, + ) + ctx.verification_result = verification_result + await self._persist_context(ctx) - # ── 9. Comment on original PR ───────────────────────────────────────── - if ci_run.pr_number and integration.github_token: - await self._comment_on_pr( - integration=integration, - ci_run=ci_run, - files_written=files_written, - commit_sha=commit_sha, + if verification_result.verdict == "failed": + self._log.warning( + "ci_fixer.verification_failed", + repo=ci_run.repo_full_name, + tool=parsed.tool, + output=verification_result.output[:300], + ) + ctx.complete("escalated", error="verification failed: post-fix regression detected") + await self._persist_context(ctx) + await self._mark_failed(ci_run, "verification_failed") + return AgentResult( + success=False, + output={"reason": "verification_failed", "tool": parsed.tool}, + ) + + ctx.fix_commit_sha = commit_sha + ctx.fix_pr_number = fix_pr_number + ctx.fix_branch = fix_branch + ctx.complete("fixed") + await self._persist_context(ctx) + + self._log.info( + "ci_fixer.execute.done", tool=parsed.tool, - root_cause=fix_plan.root_cause, - parsed=parsed, + files=files_written, + commit_sha=commit_sha, + fix_branch=fix_branch, fix_pr_number=fix_pr_number, - validation_tool_version=validation_tool_version, + root_cause=fix_plan.root_cause, + fingerprint=fingerprint, ) - # ── 10. Mark FIXED ──────────────────────────────────────────────────── - async with get_db() as session: - await session.execute( - update(CIFixRun) - .where(CIFixRun.id == self.ci_fix_run_id) - .values( - status="FIXED", - fix_commit_sha=commit_sha, - fix_branch=fix_branch, - fix_pr_number=fix_pr_number, - fingerprint_hash=fingerprint, - validation_tool_version=validation_tool_version, - tool_version_parity_ok=parity_ok, - completed_at=datetime.now(UTC), - ) + return AgentResult( + success=True, + output={ + "tool": parsed.tool, + "root_cause": fix_plan.root_cause, + "files_fixed": files_written, + "commit_sha": commit_sha, + "fix_branch": fix_branch, + "fix_pr_number": fix_pr_number, + "confidence": fix_plan.confidence, + "fingerprint": fingerprint, + "validation_tool_version": validation_tool_version, + }, ) - await session.commit() + finally: + if sandbox_result: + await provisioner.release(sandbox_result) - # ── Phase 2: Store winning patches in fingerprint table for future reuse - await self._update_fingerprint_on_success( - fingerprint_hash=fingerprint, - patches=fix_plan.patches, - tool_version=validation_tool_version, - parsed_log=parsed, - ) + # ── Pipeline context persistence ─────────────────────────────────────────── - self._log.info( - "ci_fixer.execute.done", - tool=parsed.tool, - files=files_written, - commit_sha=commit_sha, - fix_branch=fix_branch, - fix_pr_number=fix_pr_number, - root_cause=fix_plan.root_cause, - fingerprint=fingerprint, - ) + async def _persist_context(self, ctx: CIFixContext) -> None: + """Persist the current CIFixContext state to CIFixRun.pipeline_context_json.""" + import json # noqa: PLC0415 - return AgentResult( - success=True, - output={ - "tool": parsed.tool, - "root_cause": fix_plan.root_cause, - "files_fixed": files_written, - "commit_sha": commit_sha, - "fix_branch": fix_branch, - "fix_pr_number": fix_pr_number, - "confidence": fix_plan.confidence, - "fingerprint": fingerprint, - "validation_tool_version": validation_tool_version, - }, - ) + try: + async with get_db() as session: + await session.execute( + update(CIFixRun) + .where(CIFixRun.id == self.ci_fix_run_id) + .values(pipeline_context_json=json.dumps(ctx.to_dict())) + ) + await session.commit() + except Exception as exc: + self._log.warning("ci_fixer.context_persist_error", error=str(exc)) # ── Log fetching ─────────────────────────────────────────────────────────── @@ -507,16 +669,14 @@ def _apply_patches(self, workspace: Path, patches: list[FilePatch]) -> list[str] continue try: - original_lines = full_path.read_text(encoding="utf-8").splitlines( - keepends=True - ) + original_lines = full_path.read_text(encoding="utf-8").splitlines(keepends=True) except Exception as exc: self._log.warning("ci_fixer.patch_read_failed", path=patch.path, error=str(exc)) continue # Convert to 0-indexed slice s = patch.start_line - 1 - e = patch.end_line # exclusive in Python slice + e = patch.end_line # exclusive in Python slice # Bounds check if s < 0 or e > len(original_lines) or s >= e: @@ -646,9 +806,8 @@ async def _commit_to_safe_branch( push_failed = False if github_token and repo.remotes: try: - auth_url = ( - f"https://github.com/{repo_full_name}.git" - .replace("https://", f"https://{github_token}@") + auth_url = f"https://github.com/{repo_full_name}.git".replace( + "https://", f"https://{github_token}@" ) repo.git.push(auth_url, f"HEAD:{fix_branch}", "--set-upstream") self._log.info("ci_fixer.git.pushed", branch=fix_branch, sha=sha) @@ -712,11 +871,10 @@ async def _open_draft_pr( ) footer = ( - f"*Auto-merge is enabled — will merge when all checks pass.*\n" + "*Auto-merge is enabled — will merge when all checks pass.*\n" if enable_auto_merge - else - f"*This is a draft PR — Phalanx never auto-merges. " - f"Review the diff above, then mark ready and merge if correct.*\n" + else "*This is a draft PR — Phalanx never auto-merges. " + "Review the diff above, then mark ready and merge if correct.*\n" ) body = ( @@ -783,6 +941,54 @@ async def _open_draft_pr( return None + async def _find_existing_fix_pr( + self, + integration: CIIntegration, + ci_run: CIFixRun, + ) -> int | None: + """ + Look for an open Phalanx fix PR already targeting ci_run.branch. + + Returns the PR number if found, None otherwise. + + This prevents duplicate fix PRs when the pipeline is triggered + multiple times for the same failing branch (e.g. repeated CI runs). + A new commit is pushed to the existing PR instead of opening a second one. + """ + import httpx # noqa: PLC0415 + + try: + async with httpx.AsyncClient(timeout=15) as client: + r = await client.get( + f"https://api.github.com/repos/{ci_run.repo_full_name}/pulls", + headers={ + "Authorization": f"Bearer {integration.github_token}", + "Accept": "application/vnd.github+json", + }, + params={ + "state": "open", + "base": ci_run.branch, + "head": f"{ci_run.repo_full_name.split('/')[0]}:phalanx/ci-fix/", + }, + ) + if r.status_code != 200: + return None + prs = r.json() + # Filter to PRs whose head branch starts with phalanx/ci-fix/ + for pr in prs: + head_ref = pr.get("head", {}).get("ref", "") + if head_ref.startswith("phalanx/ci-fix/"): + self._log.info( + "ci_fixer.existing_fix_pr_found", + pr=pr["number"], + head=head_ref, + base=ci_run.branch, + ) + return pr["number"] + except Exception as exc: + self._log.warning("ci_fixer.find_existing_pr_error", error=str(exc)) + return None + async def _enable_github_auto_merge( self, integration: CIIntegration, @@ -961,9 +1167,7 @@ async def _comment_unable_to_fix( # ── DB helpers ───────────────────────────────────────────────────────────── async def _load_ci_fix_run(self, session) -> CIFixRun | None: - result = await session.execute( - select(CIFixRun).where(CIFixRun.id == self.ci_fix_run_id) - ) + result = await session.execute(select(CIFixRun).where(CIFixRun.id == self.ci_fix_run_id)) return result.scalar_one_or_none() async def _load_integration(self, session, integration_id: str) -> CIIntegration | None: @@ -1002,7 +1206,7 @@ async def _check_tool_version_parity( self, fingerprint_hash: str | None, local_version: str, - ) -> "VersionParityResult": + ) -> VersionParityResult: """ Phase 4: Compare local tool version to the version at the last successful fix. @@ -1018,8 +1222,6 @@ async def _check_tool_version_parity( ) try: - from sqlalchemy import and_ # noqa: PLC0415 - async with get_db() as session: result = await session.execute( select(CIFailureFingerprint).where( @@ -1068,8 +1270,8 @@ async def _get_fingerprint_success_count(self, fingerprint_hash: str | None) -> async def _load_flaky_patterns( self, repo_full_name: str, - parsed_log: "ParsedLog", - ) -> list["CIFlakyPattern"]: + parsed_log: ParsedLog, + ) -> list[CIFlakyPattern]: """ Phase 3: Load CIFlakyPattern rows matching the errors in parsed_log. @@ -1083,9 +1285,7 @@ async def _load_flaky_patterns( from sqlalchemy import and_, or_ # noqa: PLC0415 # Collect (file, code) pairs from the parsed errors - error_keys = [ - (e.file, e.code) for e in parsed_log.lint_errors - ] + [ + error_keys = [(e.file, e.code) for e in parsed_log.lint_errors] + [ (e.file, getattr(e, "code", None)) for e in parsed_log.type_errors ] @@ -1189,9 +1389,9 @@ async def _async_lookup_fix_history(self, fingerprint_hash: str) -> list[dict] | async def _update_fingerprint_on_success( self, fingerprint_hash: str, - patches: list["FilePatch"], + patches: list[FilePatch], tool_version: str, - parsed_log: "ParsedLog", + parsed_log: ParsedLog, ) -> None: """ After a successful fix is validated, upsert CIFailureFingerprint with diff --git a/phalanx/api/main.py b/phalanx/api/main.py index a67dcd60..6131eb0d 100644 --- a/phalanx/api/main.py +++ b/phalanx/api/main.py @@ -10,6 +10,7 @@ from fastapi.responses import JSONResponse from phalanx import __version__ +from phalanx.api.routes.ci_fix_runs import router as ci_fix_runs_router from phalanx.api.routes.ci_integrations import router as ci_integrations_router from phalanx.api.routes.ci_webhooks import router as ci_webhooks_router from phalanx.api.routes.demos import router as demos_router @@ -89,6 +90,7 @@ async def api_key_middleware(request: Request, call_next): app.include_router(ci_webhooks_router, prefix="/webhook") app.include_router(demos_router, prefix="/v1") app.include_router(ci_integrations_router, prefix="/v1") +app.include_router(ci_fix_runs_router, prefix="/v1") app.include_router(health_router) diff --git a/phalanx/api/routes/ci_fix_runs.py b/phalanx/api/routes/ci_fix_runs.py new file mode 100644 index 00000000..fd23598a --- /dev/null +++ b/phalanx/api/routes/ci_fix_runs.py @@ -0,0 +1,148 @@ +""" +CI Fix Runs API — inspect the state of multi-agent CI fix pipeline runs. + +Endpoints: + GET /v1/ci-fix-runs/{run_id}/context — full CIFixContext pipeline state + GET /v1/ci-fix-runs/{run_id} — CIFixRun record summary + GET /v1/ci-fix-runs — list runs (filtered by repo/branch/status) +""" + +from __future__ import annotations + +import json + +import structlog +from fastapi import APIRouter, HTTPException, Query, status +from sqlalchemy import select + +from phalanx.ci_fixer.context import CIFixContext +from phalanx.db.models import CIFixRun +from phalanx.db.session import get_db + +log = structlog.get_logger(__name__) + +router = APIRouter(prefix="/ci-fix-runs", tags=["ci-fix-runs"]) + + +@router.get("/{run_id}/context") +async def get_fix_run_context(run_id: str) -> dict: + """ + Return the full CIFixContext pipeline state for a CI fix run. + + This is the shared state object written by each agent as the pipeline + progresses. Use this to inspect exactly what each agent produced, + which stage the pipeline is at, and what the final outcome was. + + Returns 404 if the run does not exist. + Returns the raw context dict if pipeline_context_json is not yet + populated (run is too old or not yet started). + """ + async with get_db() as session: + result = await session.execute(select(CIFixRun).where(CIFixRun.id == run_id)) + ci_run = result.scalar_one_or_none() + + if ci_run is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"CIFixRun {run_id} not found", + ) + + if not ci_run.pipeline_context_json: + # Run exists but was created before Phase 1 — return basic info + return { + "ci_fix_run_id": str(ci_run.id), + "repo": ci_run.repo_full_name, + "branch": ci_run.branch, + "commit_sha": ci_run.commit_sha, + "original_build_id": ci_run.ci_build_id, + "status": ci_run.status, + "final_status": "unknown", + "current_stage": "unknown", + "_note": "This run predates the multi-agent pipeline context. No detailed state available.", + } + + try: + ctx_dict = json.loads(ci_run.pipeline_context_json) + ctx = CIFixContext.from_dict(ctx_dict) + return { + **ctx.to_dict(), + "current_stage": ctx.current_stage, + } + except Exception as exc: + log.warning("ci_fix_runs.context_parse_error", run_id=run_id, error=str(exc)) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Failed to parse pipeline context", + ) from exc + + +@router.get("/{run_id}") +async def get_fix_run(run_id: str) -> dict: + """Return a summary of a CI fix run record.""" + async with get_db() as session: + result = await session.execute(select(CIFixRun).where(CIFixRun.id == run_id)) + ci_run = result.scalar_one_or_none() + + if ci_run is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"CIFixRun {run_id} not found", + ) + + return { + "id": str(ci_run.id), + "repo": ci_run.repo_full_name, + "branch": ci_run.branch, + "commit_sha": ci_run.commit_sha, + "ci_provider": ci_run.ci_provider, + "ci_build_id": ci_run.ci_build_id, + "status": ci_run.status, + "fix_branch": ci_run.fix_branch, + "fix_pr_number": ci_run.fix_pr_number, + "fix_commit_sha": ci_run.fix_commit_sha, + "fingerprint_hash": ci_run.fingerprint_hash, + "error": ci_run.error, + "created_at": ci_run.created_at.isoformat() if ci_run.created_at else None, + "completed_at": ci_run.completed_at.isoformat() if ci_run.completed_at else None, + "has_context": ci_run.pipeline_context_json is not None, + } + + +@router.get("") +async def list_fix_runs( + repo: str | None = Query(None, description="Filter by repo (owner/repo)"), + branch: str | None = Query(None, description="Filter by branch"), + run_status: str | None = Query( + None, alias="status", description="Filter by status: PENDING, FIXED, FAILED" + ), + limit: int = Query(20, ge=1, le=100), +) -> dict: + """List CI fix runs with optional filters.""" + async with get_db() as session: + q = select(CIFixRun).order_by(CIFixRun.created_at.desc()).limit(limit) + if repo: + q = q.where(CIFixRun.repo_full_name == repo) + if branch: + q = q.where(CIFixRun.branch == branch) + if run_status: + q = q.where(CIFixRun.status == run_status.upper()) + + result = await session.execute(q) + runs = result.scalars().all() + + return { + "runs": [ + { + "id": str(r.id), + "repo": r.repo_full_name, + "branch": r.branch, + "status": r.status, + "fix_pr_number": r.fix_pr_number, + "error": r.error, + "created_at": r.created_at.isoformat() if r.created_at else None, + "has_context": r.pipeline_context_json is not None, + } + for r in runs + ], + "count": len(runs), + } diff --git a/phalanx/api/routes/ci_webhooks.py b/phalanx/api/routes/ci_webhooks.py index 1589b38b..a3fe5cdf 100644 --- a/phalanx/api/routes/ci_webhooks.py +++ b/phalanx/api/routes/ci_webhooks.py @@ -193,7 +193,7 @@ def _verify_buildkite_signature(body: bytes, token: str, stored_token: str) -> b # ── GitHub App webhook ───────────────────────────────────────────────────────── -@router.post("/webhook/github", status_code=status.HTTP_200_OK) +@router.post("/github", status_code=status.HTTP_200_OK) async def github_webhook( request: Request, x_hub_signature_256: str = Header(default=""), @@ -273,7 +273,7 @@ async def github_webhook( # ── Buildkite webhook ────────────────────────────────────────────────────────── -@router.post("/webhook/buildkite", status_code=status.HTTP_200_OK) +@router.post("/buildkite", status_code=status.HTTP_200_OK) async def buildkite_webhook( request: Request, x_buildkite_token: str = Header(default=""), @@ -336,44 +336,141 @@ async def buildkite_webhook( } -# ── CircleCI webhook (Phase 2 stub) ──────────────────────────────────────────── +# ── CircleCI webhook ─────────────────────────────────────────────────────────── -@router.post("/webhook/circleci", status_code=status.HTTP_200_OK) -async def circleci_webhook(request: Request): - """CircleCI webhook — Phase 2.""" - return {"status": "coming_soon", "provider": "circleci"} +def _verify_circleci_signature(body: bytes, signature: str, secret: str) -> bool: + """ + Verify CircleCI webhook signature. + CircleCI sends: circleci-signature: v1= + """ + if not secret: + return True + expected = "v1=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest() + return hmac.compare_digest(expected, signature or "") -# ── Jenkins webhook (Phase 2 stub) ───────────────────────────────────────────── +@router.post("/circleci", status_code=status.HTTP_200_OK) +async def circleci_webhook( + request: Request, + circleci_signature: str = Header(default="", alias="circleci-signature"), +): + """ + Receives CircleCI webhook events. + + Handles: + - workflow-completed with status=failed → dispatch CI fix + + Setup in CircleCI: Project Settings → Webhooks + Add URL: https://api.usephalanx.com/webhook/circleci + Events: Workflow Completed + Signing secret: set CIRCLECI_WEBHOOK_SECRET in phalanx env + + Payload shape (workflow-completed): + { + "type": "workflow-completed", + "workflow": { + "id": "", + "name": "", + "status": "failed", + "created_at": "...", + "stopped_at": "..." + }, + "pipeline": { + "id": "", + "number": 42, + "trigger": {"type": "webhook", ...}, + "vcs": { + "origin_repository_url": "https://github.com/owner/repo", + "branch": "fix/my-branch", + "revision": "", + "commit": {"subject": "...", "author": {"login": "..."}} + } + }, + "project": {"id": "...", "name": "repo", "slug": "github/owner/repo"}, + "organization": {"name": "owner", ...} + } + """ + body = await request.body() + if not _verify_circleci_signature(body, circleci_signature, settings.circleci_webhook_secret): + log.warning("ci_webhook.circleci.invalid_signature") + raise HTTPException(status_code=401, detail="Invalid CircleCI signature") -@router.post("/webhook/jenkins", status_code=status.HTTP_200_OK) -async def jenkins_webhook(request: Request): - """Jenkins webhook — Phase 2.""" - return {"status": "coming_soon", "provider": "jenkins"} + payload = json.loads(body) + event_type = payload.get("type") + if event_type != "workflow-completed": + return {"status": "ignored", "type": event_type} -# ── Short-path aliases (router is mounted at /webhook, so /github → /webhook/github) ─────────── + workflow = payload.get("workflow", {}) + if workflow.get("status") not in ("failed", "error", "failing", "canceled"): + return {"status": "ignored", "workflow_status": workflow.get("status")} + pipeline = payload.get("pipeline", {}) + vcs = pipeline.get("vcs", {}) -@router.post("/github", status_code=status.HTTP_200_OK) -async def github_webhook_alias( - request: Request, - x_hub_signature_256: str = Header(default=""), - x_github_event: str = Header(default=""), -): - """Alias for /webhook/github — correct path when router is mounted at /webhook prefix.""" - return await github_webhook(request, x_hub_signature_256, x_github_event) + # Extract repo name from the VCS URL (always GitHub for phalanx) + repo_url = vcs.get("origin_repository_url", "") + repo_full_name = _parse_repo_name(repo_url) + if not repo_full_name: + # Fallback: try project slug (format: "github/owner/repo") + slug = payload.get("project", {}).get("slug", "") + if slug.startswith("github/"): + repo_full_name = slug[len("github/") :] + if not repo_full_name: + return {"status": "skipped", "reason": "cannot_parse_repo"} + + branch = vcs.get("branch", "") + commit_sha = vcs.get("revision", "") + pr_author: str | None = vcs.get("commit", {}).get("author", {}).get("login") or vcs.get( + "commit", {} + ).get("committer", {}).get("login") + + # CircleCI build_id = workflow ID (used to fetch job list + logs) + workflow_id = workflow.get("id", "") + workflow_name = workflow.get("name", "") + build_url = ( + f"https://app.circleci.com/pipelines/github/{repo_full_name}" + f"/{pipeline.get('number', '')}/workflows/{workflow_id}" + ) + # PR number: CircleCI doesn't directly provide it in workflow webhooks. + # It may be in the branch name (e.g. "pull/42") or absent. + pr_number: int | None = None + if branch.startswith("pull/"): + import contextlib # noqa: PLC0415 -@router.post("/buildkite", status_code=status.HTTP_200_OK) -async def buildkite_webhook_alias( - request: Request, - x_buildkite_token: str = Header(default=""), -): - """Alias for /webhook/buildkite.""" - return await buildkite_webhook(request, x_buildkite_token) + with contextlib.suppress(IndexError, ValueError): + pr_number = int(branch.split("/")[1]) + + event = CIFailureEvent( + provider="circleci", + repo_full_name=repo_full_name, + branch=branch, + commit_sha=commit_sha, + build_id=workflow_id, + build_url=build_url, + failed_jobs=[workflow_name] if workflow_name else [], + pr_number=pr_number, + pr_author=pr_author, + raw_payload=payload, + ) + + ci_run = await _dispatch_ci_fix(event) + return { + "status": "dispatched" if ci_run else "skipped", + "ci_fix_run_id": ci_run.id if ci_run else None, + } + + +# ── Jenkins webhook (Phase 2 stub) ───────────────────────────────────────────── + + +@router.post("/jenkins", status_code=status.HTTP_200_OK) +async def jenkins_webhook(request: Request): + """Jenkins webhook — Phase 2.""" + return {"status": "coming_soon", "provider": "jenkins"} # ── Helpers ──────────────────────────────────────────────────────────────────── diff --git a/phalanx/ci_fixer/analyst.py b/phalanx/ci_fixer/analyst.py index 7b496b09..68f72c51 100644 --- a/phalanx/ci_fixer/analyst.py +++ b/phalanx/ci_fixer/analyst.py @@ -50,8 +50,8 @@ class FileWindow: """A contiguous slice of a file that was shown to the LLM.""" path: str - start_line: int # 1-indexed, inclusive - end_line: int # 1-indexed, inclusive + start_line: int # 1-indexed, inclusive + end_line: int # 1-indexed, inclusive original_lines: list[str] @@ -65,8 +65,8 @@ class FilePatch: """ path: str - start_line: int # 1-indexed - end_line: int # 1-indexed + start_line: int # 1-indexed + end_line: int # 1-indexed corrected_lines: list[str] reason: str = "" @@ -91,7 +91,7 @@ class FixPlan: "low" → agent does NOT commit; logs for human review """ - confidence: str # "high" | "medium" | "low" + confidence: str # "high" | "medium" | "low" root_cause: str patches: list[FilePatch] = field(default_factory=list) needs_new_test: bool = False @@ -178,7 +178,7 @@ def __init__(self, call_llm, history_lookup=None): def analyze( self, - parsed_log: "ParsedLog", + parsed_log: ParsedLog, workspace: Path, fingerprint_hash: str | None = None, ) -> FixPlan: @@ -275,9 +275,7 @@ def analyze( # ── File reading ─────────────────────────────────────────────────────────── - def _read_windows( - self, workspace: Path, parsed_log: "ParsedLog" - ) -> list[FileWindow]: + def _read_windows(self, workspace: Path, parsed_log: ParsedLog) -> list[FileWindow]: """ For each file in parsed_log.all_files, read a window of ±WINDOW lines around every error line in that file. Merge overlapping windows. @@ -324,7 +322,7 @@ def _read_windows( windows.append( FileWindow( path=rel_path, - start_line=lo + 1, # convert to 1-indexed + start_line=lo + 1, # convert to 1-indexed end_line=hi, original_lines=all_lines[lo:hi], ) @@ -371,10 +369,7 @@ def _parse_and_validate_patches( continue # Ensure every line ends with \n - corrected = [ - line if line.endswith("\n") else line + "\n" - for line in corrected - ] + corrected = [line if line.endswith("\n") else line + "\n" for line in corrected] window = window_by_path[path] @@ -442,7 +437,6 @@ def _read_files(self, workspace: Path, paths: list[str]) -> str: mock_log.build_errors = [] # Read each file as a full window (no error lines → defaults to line 1) - from phalanx.ci_fixer.log_parser import LintError # noqa: PLC0415 results: list[str] = [] for rel_path in paths[:_MAX_FILES]: @@ -472,12 +466,10 @@ def _format_windows(windows: list[FileWindow]) -> str: sections: list[str] = [] for w in windows: numbered = "".join( - f"{w.start_line + i:5d}: {line}" - for i, line in enumerate(w.original_lines) + f"{w.start_line + i:5d}: {line}" for i, line in enumerate(w.original_lines) ) sections.append( - f"### {w.path} (lines {w.start_line}–{w.end_line} of file)\n" - f"```\n{numbered}```" + f"### {w.path} (lines {w.start_line}–{w.end_line} of file)\n```\n{numbered}```" ) return "\n\n".join(sections) diff --git a/phalanx/ci_fixer/context.py b/phalanx/ci_fixer/context.py new file mode 100644 index 00000000..cc1dd336 --- /dev/null +++ b/phalanx/ci_fixer/context.py @@ -0,0 +1,242 @@ +""" +CIFixContext — shared state object for the multi-agent CI fix pipeline. + +Every agent in the pipeline reads from and writes to this object. +It is persisted as JSON in CIFixRun.pipeline_context_json so the full +pipeline state is inspectable at any point via the API. + +Design: + - Dataclass with optional fields — agents populate their slice and leave + the rest None until reached + - Serializable to/from dict (JSON) — no custom encoder needed + - Immutable agent outputs — each stage replaces its field entirely, + never mutates in place + - Final status is terminal — once set, no agent should write further + +Agent → field mapping: + Log Analyst → structured_failure + Root Cause Agent → classified_failure + Sandbox Prov. → sandbox_id, sandbox_stack + Reproducer Agent → reproduction_result + Fix Agent → verified_patch + Verifier Agent → verification_result + Commit Agent → fix_commit_sha, fix_pr_number, fix_pr_url +""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass, field +from datetime import UTC, datetime +from typing import Any, Literal + +# ── Sub-objects (one per agent output) ──────────────────────────────────────── + + +@dataclass +class StructuredFailure: + """Output of Log Analyst — structured representation of the CI failure.""" + + tool: str + """Tool that failed: 'ruff', 'pytest', 'mypy', 'tsc', 'eslint', etc.""" + + failure_type: str + """Category: 'lint', 'type_error', 'test_regression', 'build', 'dependency', 'unknown'""" + + reproducer_cmd: str + """Exact command CI ran: 'ruff check phalanx/ tests/ --output-format=github'""" + + errors: list[dict[str, Any]] = field(default_factory=list) + """Parsed errors — list of {file, line, col, code, message} dicts""" + + failing_files: list[str] = field(default_factory=list) + """File paths mentioned in the failure""" + + log_excerpt: str = "" + """Relevant section of the raw CI log""" + + confidence: float = 1.0 + """Parser confidence 0.0–1.0""" + + +@dataclass +class ClassifiedFailure: + """Output of Root Cause Agent — classification + escalation decision.""" + + tier: Literal["L1_auto", "L2_escalate"] + """L1 = auto-fixable; L2 = needs human""" + + root_cause: str + """Human-readable root cause hypothesis""" + + stack: str + """Detected tech stack: 'python', 'node', 'go', 'java', 'rust', 'unknown'""" + + confidence: float = 1.0 + """Classification confidence 0.0–1.0""" + + escalation_reason: str = "" + """Populated when tier == L2 — why we're not attempting auto-fix""" + + +@dataclass +class ReproductionResult: + """Output of Reproducer Agent — did we confirm the failure in sandbox?""" + + verdict: Literal["confirmed", "flaky", "env_mismatch", "timeout", "skipped"] + """ + confirmed — sandbox reproduced the same failure + flaky — command passed in sandbox → likely transient CI issue + env_mismatch — command failed with a DIFFERENT error → wrong environment + timeout — sandbox command timed out + skipped — sandbox not available (Phase 1 fallback) + """ + + exit_code: int = -1 + output: str = "" + reproducer_cmd: str = "" + + +@dataclass +class VerifiedPatch: + """Output of Fix Agent — patch that has been validated locally.""" + + files_modified: list[str] = field(default_factory=list) + validation_cmd: str = "" + validation_output: str = "" + success: bool = False + turns_used: int = 0 + + +@dataclass +class VerificationResult: + """Output of Verifier Agent — does the app/tests still work after the fix?""" + + verdict: Literal["passed", "failed", "skipped", "timeout"] + output: str = "" + cmd_run: str = "" + + +# ── Main context object ──────────────────────────────────────────────────────── + + +@dataclass +class CIFixContext: + """ + Shared state object for the multi-agent CI fix pipeline. + + Persisted as JSON in CIFixRun.pipeline_context_json. + All fields except the identity fields are optional — populated + as each agent completes its work. + """ + + # ── Identity (always set at pipeline start) ──────────────────────────── + ci_fix_run_id: str + repo: str + branch: str + commit_sha: str + original_build_id: str + + # ── Agent outputs (None until that agent runs) ───────────────────────── + structured_failure: StructuredFailure | None = None + classified_failure: ClassifiedFailure | None = None + + sandbox_id: str | None = None + sandbox_stack: str | None = None + + reproduction_result: ReproductionResult | None = None + verified_patch: VerifiedPatch | None = None + verification_result: VerificationResult | None = None + + # ── Commit Agent output ──────────────────────────────────────────────── + fix_commit_sha: str | None = None + fix_pr_number: int | None = None + fix_pr_url: str | None = None + fix_branch: str | None = None + pr_was_existing: bool = False + """True if the Commit Agent pushed to an existing fix PR rather than opening a new one.""" + + # ── Pipeline metadata ────────────────────────────────────────────────── + started_at: str = field(default_factory=lambda: datetime.now(UTC).isoformat()) + completed_at: str | None = None + final_status: Literal[ + "fixed", "escalated", "flaky", "env_mismatch", "failed", "in_progress" + ] = "in_progress" + pr_comment_posted: bool = False + error: str | None = None + + # ── Serialisation ────────────────────────────────────────────────────── + + def to_dict(self) -> dict[str, Any]: + """Serialize to a JSON-safe dict.""" + d = asdict(self) + return d + + @classmethod + def from_dict(cls, d: dict[str, Any]) -> CIFixContext: + """Deserialize from a dict (as stored in pipeline_context_json).""" + ctx = cls( + ci_fix_run_id=d["ci_fix_run_id"], + repo=d["repo"], + branch=d["branch"], + commit_sha=d["commit_sha"], + original_build_id=d["original_build_id"], + ) + # Agent outputs + if d.get("structured_failure"): + ctx.structured_failure = StructuredFailure(**d["structured_failure"]) + if d.get("classified_failure"): + ctx.classified_failure = ClassifiedFailure(**d["classified_failure"]) + if d.get("reproduction_result"): + ctx.reproduction_result = ReproductionResult(**d["reproduction_result"]) + if d.get("verified_patch"): + ctx.verified_patch = VerifiedPatch(**d["verified_patch"]) + if d.get("verification_result"): + ctx.verification_result = VerificationResult(**d["verification_result"]) + # Scalars + ctx.sandbox_id = d.get("sandbox_id") + ctx.sandbox_stack = d.get("sandbox_stack") + ctx.fix_commit_sha = d.get("fix_commit_sha") + ctx.fix_pr_number = d.get("fix_pr_number") + ctx.fix_pr_url = d.get("fix_pr_url") + ctx.fix_branch = d.get("fix_branch") + ctx.pr_was_existing = d.get("pr_was_existing", False) + ctx.started_at = d.get("started_at", ctx.started_at) + ctx.completed_at = d.get("completed_at") + ctx.final_status = d.get("final_status", "in_progress") + ctx.pr_comment_posted = d.get("pr_comment_posted", False) + ctx.error = d.get("error") + return ctx + + def complete( + self, + status: Literal["fixed", "escalated", "flaky", "env_mismatch", "failed"], + error: str | None = None, + ) -> None: + """Mark the pipeline as complete with a terminal status.""" + self.final_status = status + self.completed_at = datetime.now(UTC).isoformat() + if error: + self.error = error + + @property + def is_complete(self) -> bool: + return self.final_status != "in_progress" + + @property + def current_stage(self) -> str: + """Human-readable name of the last completed stage.""" + if self.fix_commit_sha: + return "committed" + if self.verification_result: + return "verified" + if self.verified_patch: + return "patched" + if self.reproduction_result: + return "reproduced" + if self.sandbox_id: + return "sandbox_ready" + if self.classified_failure: + return "classified" + if self.structured_failure: + return "parsed" + return "started" diff --git a/phalanx/ci_fixer/log_fetcher.py b/phalanx/ci_fixer/log_fetcher.py index 6603c1b2..79601bb8 100644 --- a/phalanx/ci_fixer/log_fetcher.py +++ b/phalanx/ci_fixer/log_fetcher.py @@ -262,16 +262,142 @@ async def fetch(self, event: CIFailureEvent, api_key: str) -> str: class CircleCILogFetcher: """ Fetches CI logs from CircleCI v2 API. - Phase 2 — stub for now. + + Strategy: + 1. GET /api/v2/workflow/{workflow_id}/job → find failed jobs + 2. GET /api/v2/project/{slug}/job/{job_number}/steps → get step log URLs + 3. GET {log_url} → fetch the actual step output + 4. Combine + extract the relevant failure section + + event.build_id is the CircleCI workflow ID (UUID). + event.repo_full_name must be in 'owner/repo' format (GitHub VCS assumed). """ + _BASE = "https://circleci.com/api/v2" + async def fetch(self, event: CIFailureEvent, api_key: str) -> str: - # TODO Phase 2: implement CircleCI v2 API log fetch - # GET /pipeline/{pipeline_id}/workflow - # GET /workflow/{workflow_id}/job - # GET /project/{slug}/job/{job_number}/steps - log.warning("ci_fixer.circleci.not_implemented") - return "(CircleCI log fetch not yet implemented)" + headers = {"Circle-Token": api_key} + project_slug = f"github/{event.repo_full_name}" + + async with httpx.AsyncClient(timeout=30) as client: + # 1. List jobs in this workflow, find the failed ones + failed_jobs = await self._get_failed_jobs(client, headers, event.build_id) + if not failed_jobs: + log.info( + "ci_fixer.circleci.no_failed_jobs", + workflow_id=event.build_id, + ) + return "(no failed jobs found in workflow)" + + log_sections: list[str] = [] + for job_number, job_name in failed_jobs[:3]: + section = await self._get_job_log( + client, headers, project_slug, job_number, job_name + ) + if section: + log_sections.append(f"JOB: {job_name}\n{section}") + + combined = "\n\n---\n\n".join(log_sections) + return _truncate(combined) if combined.strip() else "(no logs retrieved)" + + async def _get_failed_jobs( + self, + client: httpx.AsyncClient, + headers: dict, + workflow_id: str, + ) -> list[tuple[int, str]]: + """Return list of (job_number, job_name) for failed jobs in the workflow.""" + try: + r = await client.get( + f"{self._BASE}/workflow/{workflow_id}/job", + headers=headers, + ) + r.raise_for_status() + jobs = r.json().get("items", []) + return [ + (j["job_number"], j.get("name", str(j["job_number"]))) + for j in jobs + if j.get("status") in ("failed", "timedout", "infrastructure_fail") + and j.get("job_number") is not None + ] + except Exception as exc: + log.warning("ci_fixer.circleci.workflow_jobs_failed", error=str(exc)) + return [] + + async def _get_job_log( + self, + client: httpx.AsyncClient, + headers: dict, + project_slug: str, + job_number: int, + job_name: str, + ) -> str: + """Fetch and return the failure section from a single CircleCI job.""" + try: + # Get step details — each step has output URLs + r = await client.get( + f"{self._BASE}/project/{project_slug}/job/{job_number}/steps", + headers=headers, + ) + r.raise_for_status() + steps = r.json().get("items", []) + + # Find failed steps (exit_code != 0) + failed_steps = [ + action + for step in steps + for action in step.get("actions", []) + if action.get("exit_code") not in (0, None) or action.get("failed") + ] + + # Fall back to all steps if no explicit failures found + all_actions = [ + action + for step in steps + for action in step.get("actions", []) + if action.get("output_url") + ] + targets = failed_steps if failed_steps else all_actions[-3:] + + all_lines: list[str] = [] + for action in targets[:3]: + output_url = action.get("output_url") + if not output_url: + continue + try: + log_r = await client.get(output_url, headers=headers) + if log_r.status_code == 200: + # CircleCI returns a JSON array of {message, type} objects + # OR raw text depending on content-type + content_type = log_r.headers.get("content-type", "") + if "json" in content_type: + entries = log_r.json() + text = "".join( + e.get("message", "") for e in entries if isinstance(e, dict) + ) + else: + text = log_r.text + # Strip ANSI escape codes + text = re.sub(r"\x1b\[[0-9;]*[mGKHF]", "", text) + all_lines.extend(text.splitlines()) + except Exception as exc: + log.warning( + "ci_fixer.circleci.output_fetch_failed", + job=job_name, + error=str(exc), + ) + + if all_lines: + return _extract_failure_section(all_lines) + return "" + + except Exception as exc: + log.warning( + "ci_fixer.circleci.job_steps_failed", + job_number=job_number, + error=str(exc), + ) + return "" # ── Jenkins ──────────────────────────────────────────────────────────────────── diff --git a/phalanx/ci_fixer/log_parser.py b/phalanx/ci_fixer/log_parser.py index 06fb967f..ac22e73a 100644 --- a/phalanx/ci_fixer/log_parser.py +++ b/phalanx/ci_fixer/log_parser.py @@ -20,7 +20,6 @@ import re from dataclasses import dataclass, field - # ── Structured error types ───────────────────────────────────────────────────── @@ -78,9 +77,7 @@ class ParsedLog: @property def has_errors(self) -> bool: - return bool( - self.lint_errors or self.type_errors or self.test_failures or self.build_errors - ) + return bool(self.lint_errors or self.type_errors or self.test_failures or self.build_errors) @property def all_files(self) -> list[str]: diff --git a/phalanx/ci_fixer/outcome_tracker.py b/phalanx/ci_fixer/outcome_tracker.py index e3d08779..022f63c1 100644 --- a/phalanx/ci_fixer/outcome_tracker.py +++ b/phalanx/ci_fixer/outcome_tracker.py @@ -23,10 +23,8 @@ from __future__ import annotations import asyncio -import json import uuid from datetime import UTC, datetime, timedelta -from typing import TYPE_CHECKING import structlog from sqlalchemy import and_, select, update @@ -35,9 +33,6 @@ from phalanx.db.session import get_db from phalanx.queue.celery_app import celery_app -if TYPE_CHECKING: - pass - log = structlog.get_logger(__name__) # Poll schedule: (poll_number, hours_after_creation) @@ -95,9 +90,7 @@ async def _process_run(run: CIFixRun, now: datetime) -> None: # Which polls have already been recorded? async with get_db() as session: result = await session.execute( - select(CIFixOutcome.poll_number).where( - CIFixOutcome.ci_fix_run_id == run.id - ) + select(CIFixOutcome.poll_number).where(CIFixOutcome.ci_fix_run_id == run.id) ) done_polls = {row[0] for row in result.all()} @@ -203,10 +196,12 @@ async def _get_github_token(run: CIFixRun) -> str | None: if integration.ci_api_key_enc: # Decrypt if needed — same logic as CIFixerAgent._decrypt_key from phalanx.config.settings import get_settings # noqa: PLC0415 + settings = get_settings() if settings.encryption_key: try: from cryptography.fernet import Fernet # noqa: PLC0415 + f = Fernet(settings.encryption_key.encode()) return f.decrypt(integration.ci_api_key_enc.encode()).decode() except Exception: @@ -286,9 +281,7 @@ async def _mark_outcome_checked(run: CIFixRun) -> None: """Mark a CIFixRun as fully outcome-checked — no more polling.""" async with get_db() as session: await session.execute( - update(CIFixRun) - .where(CIFixRun.id == run.id) - .values(outcome_checked=True) + update(CIFixRun).where(CIFixRun.id == run.id).values(outcome_checked=True) ) await session.commit() diff --git a/phalanx/ci_fixer/pattern_promoter.py b/phalanx/ci_fixer/pattern_promoter.py index dda42bb5..d076fcd3 100644 --- a/phalanx/ci_fixer/pattern_promoter.py +++ b/phalanx/ci_fixer/pattern_promoter.py @@ -16,7 +16,6 @@ from __future__ import annotations import asyncio -import json import uuid from datetime import UTC, datetime @@ -129,10 +128,7 @@ def is_promotion_eligible( repo_count: distinct repos where this fix has succeeded total_success_count: total successful applications across all repos """ - return ( - repo_count >= MIN_REPOS_FOR_PROMOTION - or total_success_count >= MIN_GLOBAL_SUCCESS_COUNT - ) + return repo_count >= MIN_REPOS_FOR_PROMOTION or total_success_count >= MIN_GLOBAL_SUCCESS_COUNT # ── Celery task ──────────────────────────────────────────────────────────────── diff --git a/phalanx/ci_fixer/proactive_scanner.py b/phalanx/ci_fixer/proactive_scanner.py index 81361ae8..37133cc6 100644 --- a/phalanx/ci_fixer/proactive_scanner.py +++ b/phalanx/ci_fixer/proactive_scanner.py @@ -25,7 +25,6 @@ import time import uuid from datetime import UTC, datetime -from typing import TYPE_CHECKING import structlog @@ -33,9 +32,6 @@ from phalanx.db.session import get_db from phalanx.queue.celery_app import celery_app -if TYPE_CHECKING: - pass - log = structlog.get_logger(__name__) @@ -81,11 +77,13 @@ def format_proactive_comment(findings: list[ProactiveFinding], pr_number: int) - f"Consider reviewing before CI runs.\n\n" ) else: - header += ( - f"Found **{info_count} informational pattern(s)** — low severity.\n\n" - ) + header += f"Found **{info_count} informational pattern(s)** — low severity.\n\n" - lines = [header, "| Pattern | Tool | Files | Severity |\n", "|---------|------|-------|----------|\n"] + lines = [ + header, + "| Pattern | Tool | Files | Severity |\n", + "|---------|------|-------|----------|\n", + ] for f in findings[:10]: files_str = ", ".join(f"`{p}`" for p in f.affected_files[:3]) if len(f.affected_files) > 3: diff --git a/phalanx/ci_fixer/reproducer.py b/phalanx/ci_fixer/reproducer.py new file mode 100644 index 00000000..2c798e65 --- /dev/null +++ b/phalanx/ci_fixer/reproducer.py @@ -0,0 +1,248 @@ +""" +ReproducerAgent — runs the CI reproducer command in the provisioned sandbox +and classifies the outcome. + +Verdicts: + confirmed — sandbox reproduced the same failure (exit != 0, pattern match) + flaky — command passed in sandbox → CI failure was transient + env_mismatch — command failed with a DIFFERENT error → wrong environment + timeout — reproducer command exceeded sandbox_timeout_seconds + skipped — no sandbox available (sandbox_enabled=False or provision failed) + +Design: + - When sandbox_result.container_id is set, the command is executed inside + the pre-warmed container via `docker exec`. The workspace is already + bind-mounted at /workspace inside the container by SandboxProvisioner. + - When sandbox_result.available=False or container_id is empty, falls back + to local subprocess (same as Phase 2 behaviour — no regression). + - asyncio.create_subprocess_shell is used for the local path because + reproducer_cmd is a string that may contain flags, pipes, etc. + - For the container path, we use create_subprocess_exec with docker exec + args to avoid shell injection. + - Timeout is enforced via asyncio.wait_for; the process is killed on breach. + - Output matching is conservative: if tool name OR any error code appears + in stdout/stderr we call it "confirmed". +""" + +from __future__ import annotations + +import asyncio +import time +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +import structlog + +from phalanx.ci_fixer.context import ReproductionResult + +if TYPE_CHECKING: + from pathlib import Path + + from phalanx.ci_fixer.context import StructuredFailure + from phalanx.ci_fixer.sandbox import SandboxResult + +log = structlog.get_logger(__name__) + + +@dataclass +class ReproductionAttempt: + """Raw result of a single subprocess execution — internal to this module.""" + + cmd: str + exit_code: int + stdout: str + stderr: str + elapsed_seconds: float + timed_out: bool = False + + +class ReproducerAgent: + """ + Runs the reproducer command and classifies the CI failure. + + One instance per pipeline run; no shared state between calls. + """ + + def _output_matches_failure( + self, + output: str, + structured_failure: StructuredFailure, + ) -> bool: + """ + Return True if stdout/stderr output looks like the original CI failure. + + Conservative check — matches if: + 1. The tool name appears anywhere in the output (e.g. "ruff"), OR + 2. Any structured error code appears (e.g. "F401", "E501", "TS2345"). + + Lowercase comparison for tool name; error codes are case-sensitive. + """ + lowered = output.lower() + + # Match 1: tool name anywhere in output + if structured_failure.tool.lower() in lowered: + return True + + # Match 2: any parsed error code in output + errors: list[dict[str, Any]] = structured_failure.errors or [] + for err in errors: + code = err.get("code", "") + if code and code in output: + return True + + return False + + async def reproduce( + self, + reproducer_cmd: str, + workspace_path: Path, + sandbox_result: SandboxResult | None, + structured_failure: StructuredFailure, + timeout_seconds: int = 120, + ) -> ReproductionResult: + """ + Execute reproducer_cmd and return a classified ReproductionResult. + + Args: + reproducer_cmd: The exact command CI ran (e.g. "ruff check ."). + workspace_path: Working directory for the subprocess. + sandbox_result: From SandboxProvisioner; None or available=False → skip. + structured_failure: Parsed failure context used for output matching. + timeout_seconds: Hard ceiling on subprocess wall time. + + Returns: + ReproductionResult with verdict, exit_code, output, reproducer_cmd. + """ + # ── Gate: no sandbox or sandbox unavailable ─────────────────────────── + if sandbox_result is None or not sandbox_result.available: + log.info("ci_fixer.reproduce_skipped", reason="no_sandbox") + return ReproductionResult( + verdict="skipped", + reproducer_cmd=reproducer_cmd, + ) + + # ── Gate: empty command ─────────────────────────────────────────────── + if not reproducer_cmd or not reproducer_cmd.strip(): + log.info("ci_fixer.reproduce_skipped", reason="empty_cmd") + return ReproductionResult( + verdict="skipped", + reproducer_cmd=reproducer_cmd, + ) + + # ── Run in container or local subprocess ────────────────────────────── + container_id = getattr(sandbox_result, "container_id", "") + attempt = await self._run_subprocess( + cmd=reproducer_cmd, + cwd=workspace_path, + timeout_seconds=timeout_seconds, + container_id=container_id, + ) + + combined_output = (attempt.stdout + "\n" + attempt.stderr).strip() + + log.info( + "ci_fixer.reproduce_attempt", + cmd=attempt.cmd, + exit_code=attempt.exit_code, + elapsed=round(attempt.elapsed_seconds, 2), + timed_out=attempt.timed_out, + output_chars=len(combined_output), + ) + + # ── Classify verdict ────────────────────────────────────────────────── + from typing import Literal # noqa: PLC0415 + + verdict: Literal["confirmed", "flaky", "env_mismatch", "timeout", "skipped"] + if attempt.timed_out: + verdict = "timeout" + elif attempt.exit_code == 0: + verdict = "flaky" + elif self._output_matches_failure(combined_output, structured_failure): + verdict = "confirmed" + else: + verdict = "env_mismatch" + + log.info( + "ci_fixer.reproduced", + verdict=verdict, + exit_code=attempt.exit_code, + cmd=reproducer_cmd, + ) + + return ReproductionResult( + verdict=verdict, + exit_code=attempt.exit_code, + output=combined_output[:4000], # cap stored output + reproducer_cmd=reproducer_cmd, + ) + + async def _run_subprocess( + self, + cmd: str, + cwd: Path, + timeout_seconds: int, + container_id: str = "", + ) -> ReproductionAttempt: + """ + Run cmd with a hard timeout. + + When container_id is provided, wraps the command as: + docker exec -w /workspace {container_id} sh -c {cmd} + so it executes inside the pre-warmed isolated container. + + When container_id is empty, falls back to local subprocess via + asyncio.create_subprocess_shell (original Phase 2 behaviour). + """ + from phalanx.ci_fixer.sandbox_pool import wrap_shell_cmd_for_container + from phalanx.config.settings import get_settings as _get_settings + + start = time.monotonic() + + if container_id: + # Isolated container exec path + docker_cmd = _get_settings().sandbox_docker_cmd + args = wrap_shell_cmd_for_container(container_id, cmd, docker_cmd=docker_cmd) + proc = await asyncio.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + else: + # Local subprocess fallback + proc = await asyncio.create_subprocess_shell( + cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + cwd=str(cwd), + ) + + try: + stdout_b, stderr_b = await asyncio.wait_for( + proc.communicate(), + timeout=timeout_seconds, + ) + elapsed = time.monotonic() - start + return ReproductionAttempt( + cmd=cmd, + exit_code=proc.returncode or 0, + stdout=stdout_b.decode(errors="replace"), + stderr=stderr_b.decode(errors="replace"), + elapsed_seconds=elapsed, + timed_out=False, + ) + + except TimeoutError: + elapsed = time.monotonic() - start + try: + proc.kill() + await proc.wait() + except Exception: # noqa: BLE001 + pass + return ReproductionAttempt( + cmd=cmd, + exit_code=-1, + stdout="", + stderr="", + elapsed_seconds=elapsed, + timed_out=True, + ) diff --git a/phalanx/ci_fixer/sandbox.py b/phalanx/ci_fixer/sandbox.py new file mode 100644 index 00000000..2255c814 --- /dev/null +++ b/phalanx/ci_fixer/sandbox.py @@ -0,0 +1,270 @@ +""" +SandboxProvisioner — selects and provisions an isolated execution environment +for the CI reproducer and fix agents. + +Design: + - Stack detection is pure file-existence: no subprocess, no LLM call + - provision() checks out a pre-warmed container from SandboxPool and + bind-mounts the workspace path into it at /workspace. + - sandbox_enabled=False fast-path returns None → reproducer uses "skipped" + - SandboxUnavailableError (pool timeout / Docker down) → SandboxResult + with available=False → reproducer/verifier fall back to local subprocess + +See docs/sandbox_pool_design.md for the full design rationale. +""" + +from __future__ import annotations + +import uuid +from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +import structlog + +from phalanx.ci_fixer.sandbox_pool import ( + PooledContainer, + SandboxUnavailableError, + get_sandbox_pool, +) +from phalanx.config.settings import get_settings + +if TYPE_CHECKING: + from pathlib import Path + +log = structlog.get_logger(__name__) +settings = get_settings() + +# ── Stack detection markers ─────────────────────────────────────────────────── +# Ordered by priority: first match wins when multiple markers coexist. +_STACK_FILES: dict[str, list[str]] = { + "python": ["pyproject.toml", "requirements.txt", "setup.py"], + "node": ["package.json"], + "go": ["go.mod"], + "rust": ["Cargo.toml"], +} + +# ── Docker images per stack ─────────────────────────────────────────────────── +# Slim/alpine variants: fastest pull, fewest CVEs, sufficient for lint/type tools. +_STACK_IMAGES: dict[str, str] = { + "python": "python:3.12-slim", + "node": "node:20-slim", + "go": "golang:1.22-alpine", + "rust": "rust:1.77-slim", + "unknown": "ubuntu:22.04", +} + + +@dataclass +class SandboxResult: + """Describes the provisioned sandbox environment for a single fix run.""" + + sandbox_id: str + """Unique ID for this sandbox instance: 'phalanx-sandbox-{8 hex chars}'.""" + + stack: str + """Detected tech stack: 'python', 'node', 'go', 'rust', 'unknown'.""" + + image: str + """Docker image the container was started from.""" + + workspace_path: str + """Host path bind-mounted into the container at /workspace.""" + + available: bool = True + """ + False when the sandbox is not usable: + - sandbox_enabled=False in settings + - Docker daemon unreachable + - Pool checkout timed out (all slots busy) + When False, ReproducerAgent and VerifierAgent fall back to local subprocess. + """ + + container_id: str = "" + """ + Docker container ID (short hash) when a pool slot was successfully checked out. + Empty string means local subprocess fallback is in effect. + """ + + mount_path: str = "/workspace" + """Path inside the container where workspace_path is bind-mounted.""" + + extra: dict = field(default_factory=dict) + """Reserved for future metadata (port map, resource stats, etc.).""" + + +class SandboxProvisioner: + """ + Provisions a sandbox for a given workspace by checking out a pre-warmed + container from SandboxPool and bind-mounting the workspace into it. + + Fallback chain (no regressions): + sandbox_enabled=False → return None + pool checkout timeout → SandboxResult(available=False, container_id="") + Docker daemon missing → SandboxResult(available=False, container_id="") + happy path → SandboxResult(available=True, container_id="abc123") + """ + + def detect_stack(self, workspace_path: Path) -> str: + """ + Infer the primary tech stack from marker files in workspace_path. + + Returns the first matching stack name from _STACK_FILES, or 'unknown' + if no markers are found. Order matters: python is checked first so + a monorepo with both pyproject.toml and package.json resolves to python. + """ + for stack, markers in _STACK_FILES.items(): + if any((workspace_path / marker).exists() for marker in markers): + return stack + return "unknown" + + async def provision( + self, + workspace_path: Path, + stack_hint: str | None = None, + ) -> SandboxResult | None: + """ + Return a SandboxResult for workspace_path, or None if sandbox is disabled. + + Args: + workspace_path: Absolute path to the cloned repo on the host. + stack_hint: Override stack detection (e.g. caller already knows + the stack from structured_failure). + + Returns: + SandboxResult with container_id populated (happy path), + SandboxResult with available=False (pool exhausted / Docker down), + or None (sandbox_enabled=False). + """ + if not settings.sandbox_enabled: + log.info("ci_fixer.sandbox_disabled") + return None + + stack = stack_hint if stack_hint else self.detect_stack(workspace_path) + image = _STACK_IMAGES.get(stack, _STACK_IMAGES["unknown"]) + sandbox_id = f"phalanx-sandbox-{uuid.uuid4().hex[:8]}" + + try: + pool = await get_sandbox_pool() + container = await pool.checkout( + stack, + timeout=settings.sandbox_checkout_timeout_seconds, + ) + + # Bind-mount the workspace into the container. + # docker run used --volume /tmp:/hosttmp; we create a per-run symlink + # inside the container pointing /workspace → the actual cloned path. + # For simplicity we use docker cp for the initial seed if the bind + # mount path isn't already accessible. + await self._bind_workspace(container.container_id, workspace_path) + + result = SandboxResult( + sandbox_id=sandbox_id, + stack=stack, + image=image, + workspace_path=str(workspace_path), + available=True, + container_id=container.container_id, + ) + + log.info( + "ci_fixer.sandbox_provisioned", + sandbox_id=sandbox_id, + stack=stack, + container_id=container.container_id, + ) + return result + + except SandboxUnavailableError as exc: + log.warning( + "ci_fixer.sandbox_unavailable", + sandbox_id=sandbox_id, + stack=stack, + error=str(exc), + ) + return SandboxResult( + sandbox_id=sandbox_id, + stack=stack, + image=image, + workspace_path=str(workspace_path), + available=False, + container_id="", + ) + + except Exception as exc: + log.warning( + "ci_fixer.sandbox_provision_error", + sandbox_id=sandbox_id, + stack=stack, + error=str(exc), + ) + return SandboxResult( + sandbox_id=sandbox_id, + stack=stack, + image=image, + workspace_path=str(workspace_path), + available=False, + container_id="", + ) + + async def release(self, sandbox_result: SandboxResult) -> None: + """ + Return the container back to the pool after the fix run completes. + Safe to call even when container_id is empty (no-op). + """ + if not sandbox_result.container_id: + return + + try: + pool = await get_sandbox_pool() + container = PooledContainer( + container_id=sandbox_result.container_id, + stack=sandbox_result.stack, + image=sandbox_result.image, + ) + await pool.checkin(container) + log.info( + "ci_fixer.sandbox_released", + container_id=sandbox_result.container_id, + stack=sandbox_result.stack, + ) + except Exception as exc: + log.warning( + "ci_fixer.sandbox_release_error", + container_id=sandbox_result.container_id, + error=str(exc), + ) + + async def _bind_workspace(self, container_id: str, workspace_path: Path) -> None: + """ + Make the workspace accessible at /workspace inside the container. + + Strategy: docker cp the workspace contents into the container. + This is safe for the typical repo size (< 50MB of source). + For large repos, a bind-mount at container start time is preferred + (set via docker run -v flag in SandboxPool._start_container). + """ + import asyncio + + cmd = settings.sandbox_docker_cmd + try: + proc = await asyncio.create_subprocess_exec( + cmd, + "cp", + f"{workspace_path}/.", + f"{container_id}:/workspace", + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await asyncio.wait_for(proc.communicate(), timeout=30) + if proc.returncode != 0: + log.warning( + "ci_fixer.sandbox_cp_failed", + container_id=container_id, + error=stderr.decode().strip(), + ) + except Exception as exc: + log.warning( + "ci_fixer.sandbox_cp_error", + container_id=container_id, + error=str(exc), + ) diff --git a/phalanx/ci_fixer/sandbox_pool.py b/phalanx/ci_fixer/sandbox_pool.py new file mode 100644 index 00000000..8cf51df2 --- /dev/null +++ b/phalanx/ci_fixer/sandbox_pool.py @@ -0,0 +1,581 @@ +""" +SandboxPool — pre-warmed container pool for isolated CI fix execution. + +Design (see docs/sandbox_pool_design.md for full rationale): + + One asyncio.Queue per stack holds ready PooledContainer objects. + fix runs call checkout() → get an already-running container → + exec commands inside it → call checkin() → container is reset + and returned to the queue. A background refill task keeps the + queue at min_size after each checkout. + + A reaper task runs every sandbox_reaper_interval_seconds and kills + containers that have been checked out longer than sandbox_max_hold_seconds + (safety net for fix runs that crash without calling checkin). + +Celery fork safety: + The pool is NEVER initialised at module import time. Call + get_sandbox_pool() (async) from inside an already-running event loop + (i.e. inside a Celery task's asyncio.run() call). The Lock and + instance are created lazily on first call in each child process. + +Fallback contract: + checkout() raises SandboxUnavailableError on timeout or Docker error. + Callers must catch it and fall back to local-subprocess execution. + The pool NEVER raises uncaught exceptions that would abort a fix run. +""" + +from __future__ import annotations + +import asyncio +import time +from contextlib import asynccontextmanager, suppress +from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +import structlog + +from phalanx.config.settings import get_settings + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + +log = structlog.get_logger(__name__) +settings = get_settings() + +# ── Custom exceptions ───────────────────────────────────────────────────────── + + +class SandboxUnavailableError(Exception): + """Raised by checkout() when no container is available within the timeout.""" + + +# ── Stack → custom image mapping ────────────────────────────────────────────── +# Falls back to official slim images if custom image is not present locally. +_POOL_IMAGES: dict[str, str] = { + "python": "phalanx-sandbox-python:latest", + "node": "phalanx-sandbox-node:latest", + "go": "phalanx-sandbox-go:latest", + "rust": "phalanx-sandbox-rust:latest", + "unknown": "ubuntu:22.04", +} + +_FALLBACK_IMAGES: dict[str, str] = { + "python": "python:3.12-slim", + "node": "node:20-slim", + "go": "golang:1.22-alpine", + "rust": "rust:1.77-slim", + "unknown": "ubuntu:22.04", +} + + +# ── PooledContainer ─────────────────────────────────────────────────────────── + + +@dataclass +class PooledContainer: + """A single running container slot in the pool.""" + + container_id: str + """Short Docker container ID.""" + + stack: str + """Tech stack this container is configured for.""" + + image: str + """Image the container was started from.""" + + checked_out_at: float = field(default_factory=time.monotonic) + """monotonic timestamp of last checkout — used by the reaper.""" + + healthy: bool = True + """False after a failed health check — container will be replaced.""" + + +# ── SandboxPool ─────────────────────────────────────────────────────────────── + + +class SandboxPool: + """ + Pre-warmed container pool. One instance per Celery worker process. + Never instantiate directly — use get_sandbox_pool(). + """ + + def __init__(self) -> None: + self._queues: dict[str, asyncio.Queue[PooledContainer]] = {} + self._checked_out: dict[str, PooledContainer] = {} # container_id → container + self._refill_lock: dict[str, asyncio.Lock] = {} + self._reaper_task: asyncio.Task | None = None # type: ignore[type-arg] + self._shutdown = False + + # ── Lifecycle ───────────────────────────────────────────────────────────── + + async def _warmup(self) -> None: + """ + Start min_size containers per stack and populate queues. + Called once by get_sandbox_pool() after construction. + Errors during warmup are logged but do not raise — the pool + starts empty and fills as containers become available. + """ + stacks = list(_POOL_IMAGES.keys()) + for stack in stacks: + self._queues[stack] = asyncio.Queue() + self._refill_lock[stack] = asyncio.Lock() + + min_size = settings.sandbox_pool_min_size + if min_size == 0: + log.info("ci_fixer.sandbox_pool.warmup_skipped", reason="min_size=0") + return + + warmup_tasks = [] + for stack in stacks: + for _ in range(min_size): + warmup_tasks.append(self._start_and_enqueue(stack)) + + results = await asyncio.gather(*warmup_tasks, return_exceptions=True) + started = sum(1 for r in results if not isinstance(r, Exception)) + log.info( + "ci_fixer.sandbox_pool.warmed", + started=started, + total=len(warmup_tasks), + ) + + # Start reaper background task + self._reaper_task = asyncio.create_task(self._reaper_loop()) + + async def shutdown(self) -> None: + """Kill all containers and stop the reaper. Called on worker shutdown.""" + self._shutdown = True + if self._reaper_task: + self._reaper_task.cancel() + with suppress(asyncio.CancelledError): + await self._reaper_task + + # Drain queues and kill containers + kill_tasks = [] + for queue in self._queues.values(): + while not queue.empty(): + try: + container = queue.get_nowait() + kill_tasks.append(self._kill_container(container.container_id)) + except asyncio.QueueEmpty: + break + + for container in list(self._checked_out.values()): + kill_tasks.append(self._kill_container(container.container_id)) + + if kill_tasks: + await asyncio.gather(*kill_tasks, return_exceptions=True) + + log.info("ci_fixer.sandbox_pool.shutdown_complete") + + # ── Public API ──────────────────────────────────────────────────────────── + + async def checkout( + self, + stack: str, + timeout: int | None = None, + ) -> PooledContainer: + """ + Check out a ready container for the given stack. + + Waits up to `timeout` seconds (default: settings.sandbox_checkout_timeout_seconds). + Raises SandboxUnavailableError if no container becomes available in time. + """ + if stack not in self._queues: + raise SandboxUnavailableError(f"no pool for stack={stack!r}") + + effective_timeout = ( + timeout if timeout is not None else settings.sandbox_checkout_timeout_seconds + ) + + try: + container = await asyncio.wait_for( + self._queues[stack].get(), + timeout=effective_timeout, + ) + except TimeoutError as exc: + raise SandboxUnavailableError( + f"pool exhausted for stack={stack!r} after {effective_timeout}s" + ) from exc + + # Health check — if unhealthy, discard and try once more + if not await self._health_check(container): + log.warning( + "ci_fixer.sandbox_pool.unhealthy_on_checkout", + container_id=container.container_id, + stack=stack, + ) + await self._kill_container(container.container_id) + # Start a fresh replacement asynchronously + asyncio.create_task(self._start_and_enqueue(stack)) + # Try one more time with a shorter timeout + try: + container = await asyncio.wait_for( + self._queues[stack].get(), + timeout=min(effective_timeout, 15), + ) + except TimeoutError as exc: + raise SandboxUnavailableError( + f"pool exhausted after health check retry for stack={stack!r}" + ) from exc + + container.checked_out_at = time.monotonic() + self._checked_out[container.container_id] = container + + log.info( + "ci_fixer.sandbox_pool.checkout", + container_id=container.container_id, + stack=stack, + queue_depth=self._queues[stack].qsize(), + ) + + # Kick off background refill so the queue stays at min_size + asyncio.create_task(self._refill(stack)) + + return container + + async def checkin(self, container: PooledContainer) -> None: + """ + Return a container to the pool after a fix run completes. + Resets the container state, then re-enqueues it. + """ + self._checked_out.pop(container.container_id, None) + + log.info( + "ci_fixer.sandbox_pool.checkin", + container_id=container.container_id, + stack=container.stack, + ) + + if self._shutdown: + await self._kill_container(container.container_id) + return + + # Reset filesystem state inside the container + reset_ok = await self._reset_container(container) + if not reset_ok: + log.warning( + "ci_fixer.sandbox_pool.reset_failed", + container_id=container.container_id, + ) + await self._kill_container(container.container_id) + asyncio.create_task(self._start_and_enqueue(container.stack)) + return + + # Verify still healthy after reset + if not await self._health_check(container): + log.warning( + "ci_fixer.sandbox_pool.unhealthy_after_reset", + container_id=container.container_id, + ) + await self._kill_container(container.container_id) + asyncio.create_task(self._start_and_enqueue(container.stack)) + return + + await self._queues[container.stack].put(container) + + @asynccontextmanager + async def borrow( + self, + stack: str, + timeout: int | None = None, + ) -> AsyncIterator[PooledContainer]: + """ + Context manager that checks out a container and guarantees checkin, + even if the fix run raises. + + Usage: + async with pool.borrow("python") as container: + await exec_in_container(container, "ruff check .") + """ + container = await self.checkout(stack, timeout=timeout) + try: + yield container + finally: + await self.checkin(container) + + async def mount_workspace( + self, + container: PooledContainer, + workspace_path: object, + ) -> None: + """ + Ensure the workspace is accessible inside the container at /workspace. + + Strategy: the container is started with -v /tmp:/hosttmp, so we symlink + /workspace → /hosttmp/{run_subdir} which avoids docker cp entirely. + If the container was started without that mount, fall back to docker cp. + + In practice provision() always starts containers with the bind mount; + this method is a no-op in the happy path (the bind mount already exists). + """ + # The bind mount is set up at container start time in _start_container. + # Nothing to do here — workspace is already visible at /workspace inside + # the container via the per-run bind mount added by provision(). + pass + + # ── Docker helpers ──────────────────────────────────────────────────────── + + async def _start_container(self, stack: str) -> str: + """ + Start a new sandbox container for the given stack. + Returns the container ID (short hash). + Raises on Docker error. + """ + image = await self._resolve_image(stack) + cmd = settings.sandbox_docker_cmd + + proc = await asyncio.create_subprocess_exec( + cmd, + "run", + "-d", # detached + "--rm", # auto-remove when stopped + "--user", + "1000:1000", # non-root + "--no-new-privileges", # no privilege escalation + "--network", + "none", # no network (lint/type tools don't need it) + "--memory", + "512m", # memory limit + "--cpus", + "1", # cpu limit + image, + "sleep", + "infinity", # keep alive until we kill it + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + + if proc.returncode != 0: + raise RuntimeError(f"docker run failed for stack={stack!r}: {stderr.decode().strip()}") + + container_id = stdout.decode().strip()[:12] + log.info( + "ci_fixer.sandbox_pool.container_started", + container_id=container_id, + stack=stack, + image=image, + ) + return container_id + + async def _resolve_image(self, stack: str) -> str: + """ + Return phalanx-sandbox-{stack}:latest if it exists locally, + else fall back to the official slim image. + """ + preferred = _POOL_IMAGES.get(stack, _FALLBACK_IMAGES.get(stack, "ubuntu:22.04")) + cmd = settings.sandbox_docker_cmd + + proc = await asyncio.create_subprocess_exec( + cmd, + "image", + "inspect", + preferred, + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + await proc.communicate() + + if proc.returncode == 0: + return preferred + + fallback = _FALLBACK_IMAGES.get(stack, "ubuntu:22.04") + log.info( + "ci_fixer.sandbox_pool.image_fallback", + preferred=preferred, + fallback=fallback, + stack=stack, + ) + return fallback + + async def _health_check(self, container: PooledContainer) -> bool: + """Return True if container responds to `docker exec echo ok`.""" + cmd = settings.sandbox_docker_cmd + try: + proc = await asyncio.create_subprocess_exec( + cmd, + "exec", + container.container_id, + "echo", + "ok", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, + ) + stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=5) + return proc.returncode == 0 and b"ok" in stdout + except Exception: + return False + + async def _reset_container(self, container: PooledContainer) -> bool: + """ + Run the reset script inside the container to clear /workspace and caches. + Returns True on success. + """ + cmd = settings.sandbox_docker_cmd + try: + proc = await asyncio.create_subprocess_exec( + cmd, + "exec", + container.container_id, + "sh", + "-c", + "rm -rf /workspace/* /tmp/pip-* /tmp/npm-* /root/.cache 2>/dev/null; echo done", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, + ) + stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=10) + return proc.returncode == 0 and b"done" in stdout + except Exception: + return False + + async def _kill_container(self, container_id: str) -> None: + """Kill and remove a container, ignoring errors.""" + cmd = settings.sandbox_docker_cmd + try: + proc = await asyncio.create_subprocess_exec( + cmd, + "kill", + container_id, + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + await asyncio.wait_for(proc.communicate(), timeout=10) + except Exception: + pass + + # ── Background tasks ────────────────────────────────────────────────────── + + async def _start_and_enqueue(self, stack: str) -> None: + """Start a container and add it to the pool queue. Errors are swallowed.""" + try: + container_id = await self._start_container(stack) + image = await self._resolve_image(stack) + container = PooledContainer( + container_id=container_id, + stack=stack, + image=image, + ) + # Only enqueue if within max_size + current_depth = self._queues[stack].qsize() + current_checked_out = sum(1 for c in self._checked_out.values() if c.stack == stack) + if current_depth + current_checked_out < settings.sandbox_pool_max_size: + await self._queues[stack].put(container) + else: + # Pool is full — kill the just-started container + await self._kill_container(container_id) + except Exception as exc: + log.warning( + "ci_fixer.sandbox_pool.start_failed", + stack=stack, + error=str(exc), + ) + + async def _refill(self, stack: str) -> None: + """ + Ensure the queue has at least min_size containers after a checkout. + Uses a per-stack lock to avoid duplicate refill tasks racing. + """ + async with self._refill_lock[stack]: + current = self._queues[stack].qsize() + needed = settings.sandbox_pool_min_size - current + if needed > 0: + await self._start_and_enqueue(stack) + + async def _reaper_loop(self) -> None: + """ + Background task: every sandbox_reaper_interval_seconds, kill containers + that have been checked out longer than sandbox_max_hold_seconds. + This is a safety net for fix runs that crash without calling checkin(). + """ + while not self._shutdown: + try: + await asyncio.sleep(settings.sandbox_reaper_interval_seconds) + now = time.monotonic() + max_hold = settings.sandbox_max_hold_seconds + stale = [ + c for c in list(self._checked_out.values()) if now - c.checked_out_at > max_hold + ] + for container in stale: + log.warning( + "ci_fixer.sandbox_pool.reaper_killing", + container_id=container.container_id, + stack=container.stack, + held_seconds=round(now - container.checked_out_at), + ) + self._checked_out.pop(container.container_id, None) + await self._kill_container(container.container_id) + await self._start_and_enqueue(container.stack) + except asyncio.CancelledError: + break + except Exception as exc: + log.warning("ci_fixer.sandbox_pool.reaper_error", error=str(exc)) + + +# ── Lazy singleton ──────────────────────────────────────────────────────────── + +_pool_instance: SandboxPool | None = None +_pool_lock: asyncio.Lock | None = None + + +async def get_sandbox_pool() -> SandboxPool: + """ + Return the process-local SandboxPool singleton, initialising it on first call. + + Safe to call from inside a Celery asyncio.run() task — the Lock and instance + are created lazily inside the child's own event loop, avoiding Celery pre-fork + event-loop conflicts. + """ + global _pool_instance, _pool_lock + + if _pool_lock is None: + _pool_lock = asyncio.Lock() + + async with _pool_lock: + if _pool_instance is None: + _pool_instance = SandboxPool() + await _pool_instance._warmup() + + return _pool_instance + + +def reset_pool_for_testing() -> None: + """ + Reset the global singleton. Only call from test teardown — never in production. + """ + global _pool_instance, _pool_lock + _pool_instance = None + _pool_lock = None + + +# ── exec helper used by ReproducerAgent + VerifierAgent ─────────────────────── + + +def wrap_cmd_for_container( + container_id: str, + cmd_args: list[str], + workspace_path: str, + docker_cmd: str = "docker", +) -> list[str]: + """ + Wrap a command list so it executes inside the given container. + + The workspace is bind-mounted at /workspace inside the container. + We set WORKDIR via -w flag so relative paths resolve correctly. + + Returns a new args list: [docker, exec, -w, /workspace, container_id, *cmd_args] + """ + return [docker_cmd, "exec", "-w", "/workspace", container_id, *cmd_args] + + +def wrap_shell_cmd_for_container( + container_id: str, + shell_cmd: str, + docker_cmd: str = "docker", +) -> list[str]: + """ + Wrap a shell string command to run inside a container via `docker exec sh -c`. + Used by ReproducerAgent which takes a shell string (not an args list). + """ + return [docker_cmd, "exec", "-w", "/workspace", container_id, "sh", "-c", shell_cmd] diff --git a/phalanx/ci_fixer/suppressor.py b/phalanx/ci_fixer/suppressor.py index 6547950a..b8fe0578 100644 --- a/phalanx/ci_fixer/suppressor.py +++ b/phalanx/ci_fixer/suppressor.py @@ -41,8 +41,8 @@ def is_flaky_suppressed( - parsed_log: "ParsedLog", - flaky_patterns: list["CIFlakyPattern"], + parsed_log: ParsedLog, + flaky_patterns: list[CIFlakyPattern], ) -> bool: """ Return True if ALL errors in parsed_log are high-flakiness patterns. @@ -64,7 +64,7 @@ def is_flaky_suppressed( return False # Build lookup: (file, code) → CIFlakyPattern - pattern_map: dict[tuple[str, str], "CIFlakyPattern"] = {} + pattern_map: dict[tuple[str, str], CIFlakyPattern] = {} for p in flaky_patterns: key = (p.error_file or "", p.error_code or "") pattern_map[key] = p @@ -109,7 +109,7 @@ def is_flaky_suppressed( return True -def should_use_history(fingerprint: "CIFailureFingerprint | None") -> bool: +def should_use_history(fingerprint: CIFailureFingerprint | None) -> bool: """ Return True if the fingerprint's history is trustworthy enough to reuse. @@ -146,7 +146,7 @@ def record_flaky_pattern( error_code: str | None, error_file: str | None, was_flaky: bool, - existing_pattern: "CIFlakyPattern | None" = None, + existing_pattern: CIFlakyPattern | None = None, ) -> dict: """ Return the dict of fields to set when upserting a CIFlakyPattern row. diff --git a/phalanx/ci_fixer/validator.py b/phalanx/ci_fixer/validator.py index a81e6cbf..b47f767d 100644 --- a/phalanx/ci_fixer/validator.py +++ b/phalanx/ci_fixer/validator.py @@ -16,18 +16,19 @@ import subprocess from dataclasses import dataclass, field -from pathlib import Path from typing import TYPE_CHECKING import structlog if TYPE_CHECKING: - from phalanx.ci_fixer.log_parser import LintError, ParsedLog, TypeError + from pathlib import Path + + from phalanx.ci_fixer.log_parser import ParsedLog log = structlog.get_logger(__name__) -_VALIDATE_TIMEOUT = 120 # seconds per subprocess call -_VERSION_TIMEOUT = 5 # seconds for --version queries +_VALIDATE_TIMEOUT = 120 # seconds per subprocess call +_VERSION_TIMEOUT = 5 # seconds for --version queries @dataclass @@ -41,9 +42,9 @@ class ValidationResult: def validate_fix( - parsed_log: "ParsedLog", + parsed_log: ParsedLog, workspace: Path, - original_parsed: "ParsedLog | None" = None, + original_parsed: ParsedLog | None = None, ) -> ValidationResult: """ Re-run the failing tool against the workspace to confirm the fix. @@ -88,7 +89,7 @@ def validate_fix( regressions = _regression_check(tool, workspace, original_parsed, tool_version) if regressions: reg_summary = "; ".join( - f"{getattr(e,'file','?')}:{getattr(e,'line','?')} {getattr(e,'code',getattr(e,'message',''))}" + f"{getattr(e, 'file', '?')}:{getattr(e, 'line', '?')} {getattr(e, 'code', getattr(e, 'message', ''))}" for e in regressions[:5] ) log.warning( @@ -126,9 +127,7 @@ def _run_mypy(workspace: Path, files: list[str], tool_version: str) -> Validatio return ValidationResult(passed=passed, tool="mypy", output=output, tool_version=tool_version) -def _run_pytest( - workspace: Path, parsed_log: "ParsedLog", tool_version: str -) -> ValidationResult: +def _run_pytest(workspace: Path, parsed_log: ParsedLog, tool_version: str) -> ValidationResult: test_files = list({f.file for f in parsed_log.test_failures}) targets = test_files if test_files else ["tests/"] code, output = _run(["python", "-m", "pytest", "-x", "-q"] + targets, workspace) @@ -156,7 +155,7 @@ def _run_node_linter( def _regression_check( tool: str, workspace: Path, - original_parsed: "ParsedLog", + original_parsed: ParsedLog, tool_version: str, ) -> list: """ diff --git a/phalanx/ci_fixer/verifier.py b/phalanx/ci_fixer/verifier.py new file mode 100644 index 00000000..9eda9ad3 --- /dev/null +++ b/phalanx/ci_fixer/verifier.py @@ -0,0 +1,282 @@ +""" +VerifierAgent — runs a broader verification suite after the fix is applied +to confirm no regressions were introduced. + +Design: + Unlike the validator (which re-runs only the originally-failing tool on + the originally-failing files), the verifier runs the *full* test suite + for the detected stack so we catch regressions in unrelated files. + + Verification profiles per stack: + python → pytest (if test dir exists) + ruff check . (full repo) + node → npm test (if package.json has a test script) + go → go test ./... + rust → cargo test + unknown → skipped (verdict="skipped") + + Execution: + When sandbox_result.container_id is set, each command is executed inside + the pre-warmed isolated container via `docker exec`. The workspace is + already at /workspace inside the container. + When container_id is empty or sandbox unavailable, falls back to local + subprocess (original Phase 2 behaviour — no regression). + + Timeout: settings.sandbox_timeout_seconds (same budget as reproducer). + + The verifier is intentionally conservative: + - If the test command is not found → verdict="skipped" (don't block the fix) + - If the command times out → verdict="timeout" (non-blocking per step) + - If exit_code == 0 → verdict="passed" + - If exit_code != 0 → verdict="failed" + + A "skipped" verdict does NOT block the pipeline — the fix proceeds. + A "failed" verdict causes ctx.complete("escalated") and blocks commit. + A "timeout" verdict is treated the same as "skipped" (conservative). +""" + +from __future__ import annotations + +import asyncio +import time +from dataclasses import dataclass +from typing import TYPE_CHECKING + +import structlog + +from phalanx.ci_fixer.context import VerificationResult + +if TYPE_CHECKING: + from pathlib import Path + + from phalanx.ci_fixer.sandbox import SandboxResult + +log = structlog.get_logger(__name__) + +# ── Verification profiles ───────────────────────────────────────────────────── +# Each profile is a list of commands to run in order. +# Commands are tuples of (label, args_list). +# All commands must pass for verdict="passed". +_PROFILES: dict[str, list[tuple[str, list[str]]]] = { + "python": [ + ("ruff_full", ["ruff", "check", "."]), + ], + "node": [ + ("npm_test", ["npm", "test", "--if-present"]), + ], + "go": [ + ("go_test", ["go", "test", "./..."]), + ], + "rust": [ + ("cargo_test", ["cargo", "test"]), + ], +} + + +@dataclass +class VerificationStep: + """Result of a single verification command.""" + + label: str + cmd: str + exit_code: int + output: str + elapsed_seconds: float + timed_out: bool = False + + +class VerifierAgent: + """ + Runs a broad verification sweep after the fix agent completes. + + One instance per pipeline run; no shared state between calls. + """ + + def _get_profile(self, stack: str) -> list[tuple[str, list[str]]]: + """Return the verification command list for the given stack.""" + return _PROFILES.get(stack, []) + + def _has_pytest(self, workspace_path: Path) -> bool: + """True if pytest is available (pyproject.toml or pytest.ini exists).""" + return ( + (workspace_path / "pyproject.toml").exists() + or (workspace_path / "pytest.ini").exists() + or (workspace_path / "setup.cfg").exists() + ) + + def _container_id(self, sandbox_result: SandboxResult | None) -> str: + """Return container_id from sandbox_result if available, else empty string.""" + if sandbox_result is None: + return "" + return getattr(sandbox_result, "container_id", "") + + async def verify( + self, + workspace_path: Path, + stack: str, + sandbox_result: SandboxResult | None, + timeout_seconds: int = 120, + ) -> VerificationResult: + """ + Run the full verification suite for the given stack. + + Args: + workspace_path: Cloned repo root (same dir used by the fix agent). + stack: Tech stack from SandboxProvisioner ('python', etc.). + sandbox_result: Passed for forward-compat; not used in Phase 3. + timeout_seconds: Hard ceiling per verification command. + + Returns: + VerificationResult with verdict, output, cmd_run. + """ + profile = self._get_profile(stack) + + # Add pytest to python profile only if test infrastructure exists + if stack == "python" and self._has_pytest(workspace_path): + profile = [ + ("pytest_full", ["python", "-m", "pytest", "-x", "-q", "--tb=short"]) + ] + profile + + if not profile: + log.info("ci_fixer.verify_skipped", stack=stack, reason="no_profile") + return VerificationResult(verdict="skipped", output="", cmd_run="") + + steps: list[VerificationStep] = [] + + container_id = self._container_id(sandbox_result) + + for label, cmd_args in profile: + step = await self._run_cmd( + label=label, + cmd_args=cmd_args, + cwd=workspace_path, + timeout_seconds=timeout_seconds, + container_id=container_id, + ) + steps.append(step) + + log.info( + "ci_fixer.verify_step", + label=label, + exit_code=step.exit_code, + timed_out=step.timed_out, + elapsed=round(step.elapsed_seconds, 2), + ) + + if step.timed_out: + # Timeout is non-blocking — treat as skipped for this step + log.warning("ci_fixer.verify_timeout", label=label) + continue + + if step.exit_code != 0: + combined = "\n".join(s.output for s in steps) + log.warning( + "ci_fixer.verify_failed", + label=label, + exit_code=step.exit_code, + ) + return VerificationResult( + verdict="failed", + output=combined[:4000], + cmd_run=" ".join(cmd_args), + ) + + # All steps passed (or timed out — conservative skip) + all_timed_out = all(s.timed_out for s in steps) + if all_timed_out: + return VerificationResult( + verdict="timeout", + output="All verification steps timed out", + cmd_run="", + ) + + combined = "\n".join(s.output for s in steps if s.output) + cmd_summary = "; ".join(" ".join(cmd) for _, cmd in profile) + log.info("ci_fixer.verify_passed", stack=stack, steps=len(steps)) + return VerificationResult( + verdict="passed", + output=combined[:4000], + cmd_run=cmd_summary, + ) + + async def _run_cmd( + self, + label: str, + cmd_args: list[str], + cwd: Path, + timeout_seconds: int, + container_id: str = "", + ) -> VerificationStep: + """ + Run a single verification command as an async subprocess. + + When container_id is provided, wraps with docker exec so the command + runs inside the pre-warmed isolated container at /workspace. + When container_id is empty, runs locally (original behaviour). + + Returns a VerificationStep with timed_out=True if timeout is exceeded. + """ + from phalanx.ci_fixer.sandbox_pool import wrap_cmd_for_container + from phalanx.config.settings import get_settings as _get_settings + + start = time.monotonic() + cmd_str = " ".join(cmd_args) + + if container_id: + docker_cmd = _get_settings().sandbox_docker_cmd + exec_args = wrap_cmd_for_container( + container_id, cmd_args, str(cwd), docker_cmd=docker_cmd + ) + else: + exec_args = cmd_args + + try: + proc = await asyncio.create_subprocess_exec( + *exec_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + cwd=str(cwd) if not container_id else None, + ) + + stdout_b, stderr_b = await asyncio.wait_for( + proc.communicate(), + timeout=timeout_seconds, + ) + elapsed = time.monotonic() - start + output = ( + stdout_b.decode(errors="replace") + "\n" + stderr_b.decode(errors="replace") + ).strip() + + return VerificationStep( + label=label, + cmd=cmd_str, + exit_code=proc.returncode or 0, + output=output, + elapsed_seconds=elapsed, + ) + + except TimeoutError: + elapsed = time.monotonic() - start + try: + proc.kill() + await proc.wait() + except Exception: # noqa: BLE001 + pass + return VerificationStep( + label=label, + cmd=cmd_str, + exit_code=-1, + output="", + elapsed_seconds=elapsed, + timed_out=True, + ) + + except FileNotFoundError: + elapsed = time.monotonic() - start + return VerificationStep( + label=label, + cmd=cmd_str, + exit_code=-1, + output=f"(tool not found: {cmd_args[0]})", + elapsed_seconds=elapsed, + timed_out=False, + ) diff --git a/phalanx/config/settings.py b/phalanx/config/settings.py index c272c9eb..8e345cff 100644 --- a/phalanx/config/settings.py +++ b/phalanx/config/settings.py @@ -104,6 +104,29 @@ class Settings(BaseSettings): phalanx_enable_demo_deploy: bool = True # ── CI Webhooks ─────────────────────────────────────────────────────────── buildkite_webhook_token: str = "" + circleci_token: str = "" + circleci_webhook_secret: str = "" + + # ── Sandbox / CI Reproduction ───────────────────────────────────────────── + # Command used to run containers (swap to "podman" on RHEL/CoreOS hosts). + sandbox_docker_cmd: str = "docker" + # Maximum seconds the reproducer command may run inside the sandbox. + sandbox_timeout_seconds: int = 120 + # Master switch — set SANDBOX_ENABLED=false in envs where Docker is absent. + sandbox_enabled: bool = True + + # ── Sandbox Pool ────────────────────────────────────────────────────────── + # Containers to pre-warm per stack at startup (0 = cold-start on demand). + sandbox_pool_min_size: int = 1 + # Max containers that can be simultaneously checked out per stack. + sandbox_pool_max_size: int = 2 + # Seconds to wait for a free pool slot before falling back to local subprocess. + sandbox_checkout_timeout_seconds: int = 30 + # Reaper kills containers held longer than this (should match fix run budget). + sandbox_max_hold_seconds: int = 300 + # How often the reaper background task runs (seconds). + sandbox_reaper_interval_seconds: int = 60 + # Phase 2: streaming builder — set FORGE_STREAMING_BUILDER=1 to enable. # Eliminates the 20K output token ceiling by writing each file as Claude # generates it. Safe to enable once validated in simulation. diff --git a/phalanx/db/models.py b/phalanx/db/models.py index f81aaf6e..218ce5ff 100644 --- a/phalanx/db/models.py +++ b/phalanx/db/models.py @@ -828,6 +828,8 @@ class CIFixRun(Base): """False until OutcomeTracker has classified this run's fix outcome (V2).""" tool_version_parity_ok: Mapped[bool | None] = mapped_column(Boolean, nullable=True) """Phase 4: True when tool version at fix time matches failure-time version (within minor version).""" + pipeline_context_json: Mapped[str | None] = mapped_column(Text, nullable=True) + """CIFixContext serialized as JSON — full multi-agent pipeline state (Phase 1+).""" status: Mapped[str] = mapped_column(String(20), nullable=False, default="PENDING") attempt: Mapped[int] = mapped_column(Integer, default=1) error: Mapped[str | None] = mapped_column(Text) diff --git a/pyproject.toml b/pyproject.toml index 7867519d..f5a090bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,6 +106,14 @@ ignore = ["E501", "B008", "N805"] [tool.ruff.lint.isort] known-first-party = ["phalanx"] +[tool.ruff.lint.per-file-ignores] +# Test files — mock class names are PascalCase by convention (N806), +# nested with statements are intentional for readability (SIM117), +# deferred imports inside try blocks are expected (F401), +# duplicate test function names from copy-paste scaffolding (F811), +# and late imports after helper definitions are acceptable (E402, SIM105). +"tests/**" = ["N806", "SIM117", "F401", "F811", "E402", "SIM105"] + # ── MyPy ───────────────────────────────────────────────────────────────────── [tool.mypy] python_version = "3.12" diff --git a/tests/unit/test_analyst_unit.py b/tests/unit/test_analyst_unit.py index 84e9233b..d01ae4e5 100644 --- a/tests/unit/test_analyst_unit.py +++ b/tests/unit/test_analyst_unit.py @@ -12,19 +12,18 @@ from __future__ import annotations import json -from pathlib import Path - -import pytest +from typing import TYPE_CHECKING from phalanx.ci_fixer.analyst import ( FilePatch, FileWindow, - FixPlan, RootCauseAnalyst, _is_test_file, ) -from phalanx.ci_fixer.log_parser import LintError, ParsedLog, TestFailure, TypeError +from phalanx.ci_fixer.log_parser import LintError, ParsedLog +if TYPE_CHECKING: + from pathlib import Path # ── Helpers ──────────────────────────────────────────────────────────────────── @@ -47,20 +46,25 @@ def _lint_log(file: str, line: int = 1, code: str = "F401") -> ParsedLog: ) -def _patch_json(path: str, start: int, end: int, corrected: list[str], - confidence: str = "high") -> str: - return json.dumps({ - "confidence": confidence, - "root_cause": "test root cause", - "patches": [{ - "path": path, - "start_line": start, - "end_line": end, - "corrected_lines": corrected, - "reason": "test", - }], - "needs_new_test": False, - }) +def _patch_json( + path: str, start: int, end: int, corrected: list[str], confidence: str = "high" +) -> str: + return json.dumps( + { + "confidence": confidence, + "root_cause": "test root cause", + "patches": [ + { + "path": path, + "start_line": start, + "end_line": end, + "corrected_lines": corrected, + "reason": "test", + } + ], + "needs_new_test": False, + } + ) # ── FilePatch.delta ──────────────────────────────────────────────────────────── @@ -68,23 +72,19 @@ def _patch_json(path: str, start: int, end: int, corrected: list[str], class TestFilePatchDelta: def test_no_change(self): - p = FilePatch(path="f.py", start_line=1, end_line=3, - corrected_lines=["a\n", "b\n", "c\n"]) + p = FilePatch(path="f.py", start_line=1, end_line=3, corrected_lines=["a\n", "b\n", "c\n"]) assert p.delta == 0 def test_line_removed(self): - p = FilePatch(path="f.py", start_line=1, end_line=3, - corrected_lines=["a\n", "b\n"]) + p = FilePatch(path="f.py", start_line=1, end_line=3, corrected_lines=["a\n", "b\n"]) assert p.delta == -1 def test_line_added(self): - p = FilePatch(path="f.py", start_line=1, end_line=2, - corrected_lines=["a\n", "b\n", "c\n"]) + p = FilePatch(path="f.py", start_line=1, end_line=2, corrected_lines=["a\n", "b\n", "c\n"]) assert p.delta == 1 def test_original_window_size(self): - p = FilePatch(path="f.py", start_line=5, end_line=10, - corrected_lines=["x\n"]) + p = FilePatch(path="f.py", start_line=5, end_line=10, corrected_lines=["x\n"]) assert p.original_window_size == 6 @@ -169,7 +169,7 @@ def test_multiple_error_lines_merged_into_one_window(self, tmp_path): ], ) windows = analyst._read_windows(tmp_path, parsed) - assert len(windows) == 1 # merged, not two separate windows + assert len(windows) == 1 # merged, not two separate windows def test_max_files_respected(self, tmp_path): for i in range(6): @@ -183,7 +183,7 @@ def test_max_files_respected(self, tmp_path): ) analyst = _make_analyst("{}") windows = analyst._read_windows(tmp_path, parsed) - assert len(windows) <= 4 # _MAX_FILES = 4 + assert len(windows) <= 4 # _MAX_FILES = 4 # ── _parse_and_validate_patches ──────────────────────────────────────────────── @@ -203,31 +203,59 @@ def _analyst(self) -> RootCauseAnalyst: def test_valid_patch_accepted(self): w = self._window("src/foo.py", 1, 5, 5) - raw = [{"path": "src/foo.py", "start_line": 1, "end_line": 5, - "corrected_lines": ["a\n", "b\n", "c\n", "d\n"], "reason": "ok"}] + raw = [ + { + "path": "src/foo.py", + "start_line": 1, + "end_line": 5, + "corrected_lines": ["a\n", "b\n", "c\n", "d\n"], + "reason": "ok", + } + ] patches = self._analyst()._parse_and_validate_patches(raw, [w]) assert len(patches) == 1 assert patches[0].delta == -1 def test_unknown_file_rejected(self): w = self._window("src/foo.py", 1, 5, 5) - raw = [{"path": "src/bar.py", "start_line": 1, "end_line": 5, - "corrected_lines": ["x\n"], "reason": "bad"}] + raw = [ + { + "path": "src/bar.py", + "start_line": 1, + "end_line": 5, + "corrected_lines": ["x\n"], + "reason": "bad", + } + ] patches = self._analyst()._parse_and_validate_patches(raw, [w]) assert patches == [] def test_test_file_rejected(self): w = self._window("tests/test_foo.py", 1, 5, 5) - raw = [{"path": "tests/test_foo.py", "start_line": 1, "end_line": 5, - "corrected_lines": ["x\n"], "reason": "bad"}] + raw = [ + { + "path": "tests/test_foo.py", + "start_line": 1, + "end_line": 5, + "corrected_lines": ["x\n"], + "reason": "bad", + } + ] patches = self._analyst()._parse_and_validate_patches(raw, [w]) assert patches == [] def test_delta_too_large_rejected(self): w = self._window("src/foo.py", 1, 5, 5) big = [f"line {i}\n" for i in range(50)] - raw = [{"path": "src/foo.py", "start_line": 1, "end_line": 5, - "corrected_lines": big, "reason": "too big"}] + raw = [ + { + "path": "src/foo.py", + "start_line": 1, + "end_line": 5, + "corrected_lines": big, + "reason": "too big", + } + ] patches = self._analyst()._parse_and_validate_patches(raw, [w]) assert patches == [] @@ -239,15 +267,29 @@ def test_missing_line_range_rejected(self): def test_empty_corrected_lines_rejected(self): w = self._window("src/foo.py", 1, 5, 5) - raw = [{"path": "src/foo.py", "start_line": 1, "end_line": 5, - "corrected_lines": [], "reason": "empty"}] + raw = [ + { + "path": "src/foo.py", + "start_line": 1, + "end_line": 5, + "corrected_lines": [], + "reason": "empty", + } + ] patches = self._analyst()._parse_and_validate_patches(raw, [w]) assert patches == [] def test_lines_without_newline_get_newline_appended(self): w = self._window("src/foo.py", 1, 3, 3) - raw = [{"path": "src/foo.py", "start_line": 1, "end_line": 3, - "corrected_lines": ["no newline", "also no newline"], "reason": "ok"}] + raw = [ + { + "path": "src/foo.py", + "start_line": 1, + "end_line": 3, + "corrected_lines": ["no newline", "also no newline"], + "reason": "ok", + } + ] patches = self._analyst()._parse_and_validate_patches(raw, [w]) assert len(patches) == 1 assert all(line.endswith("\n") for line in patches[0].corrected_lines) @@ -255,8 +297,15 @@ def test_lines_without_newline_get_newline_appended(self): def test_line_range_within_tolerance_accepted(self): """start/end off by ≤2 lines → not rejected; LLM values passed through.""" w = self._window("src/foo.py", 1, 5, 5) - raw = [{"path": "src/foo.py", "start_line": 2, "end_line": 6, # off by 1 - "corrected_lines": ["a\n", "b\n"], "reason": "off by one"}] + raw = [ + { + "path": "src/foo.py", + "start_line": 2, + "end_line": 6, # off by 1 + "corrected_lines": ["a\n", "b\n"], + "reason": "off by one", + } + ] patches = self._analyst()._parse_and_validate_patches(raw, [w]) # Accepted — within tolerance (off by 1 ≤ 2) assert len(patches) == 1 @@ -267,8 +316,15 @@ def test_line_range_within_tolerance_accepted(self): def test_line_range_beyond_tolerance_clamped(self): """start/end off by >2 lines → clamped to window bounds.""" w = self._window("src/foo.py", 1, 5, 5) - raw = [{"path": "src/foo.py", "start_line": 10, "end_line": 20, # way off - "corrected_lines": ["a\n", "b\n"], "reason": "way off"}] + raw = [ + { + "path": "src/foo.py", + "start_line": 10, + "end_line": 20, # way off + "corrected_lines": ["a\n", "b\n"], + "reason": "way off", + } + ] patches = self._analyst()._parse_and_validate_patches(raw, [w]) # Clamped to window bounds (1..5) assert len(patches) == 1 @@ -284,7 +340,7 @@ class TestAnalyzeIntegration: def test_high_confidence_fix_applied(self, tmp_path): _write(tmp_path, "src/foo.py", self._FILE) - corrected = self._FILE[1:] # remove "import os\n" + corrected = self._FILE[1:] # remove "import os\n" response = _patch_json("src/foo.py", 1, len(self._FILE), corrected) analyst = _make_analyst(response) plan = analyst.analyze(_lint_log("src/foo.py"), tmp_path) diff --git a/tests/unit/test_ci_fix_context.py b/tests/unit/test_ci_fix_context.py new file mode 100644 index 00000000..f10fc099 --- /dev/null +++ b/tests/unit/test_ci_fix_context.py @@ -0,0 +1,391 @@ +""" +Tests for phalanx.ci_fixer.context — CIFixContext shared pipeline state. + +Coverage targets: + - CIFixContext: init, to_dict, from_dict, complete, is_complete, current_stage + - StructuredFailure, ClassifiedFailure, ReproductionResult, VerifiedPatch, VerificationResult + - Serialization round-trip fidelity + - Edge cases: None fields, empty lists, partial population +""" + +from __future__ import annotations + +import json + +import pytest + +from phalanx.ci_fixer.context import ( + CIFixContext, + ClassifiedFailure, + ReproductionResult, + StructuredFailure, + VerificationResult, + VerifiedPatch, +) + +# ── Fixtures ────────────────────────────────────────────────────────────────── + + +def _make_ctx(**kwargs) -> CIFixContext: + defaults = { + "ci_fix_run_id": "run-123", + "repo": "owner/repo", + "branch": "feature/foo", + "commit_sha": "abc123", + "original_build_id": "build-456", + } + defaults.update(kwargs) + return CIFixContext(**defaults) + + +# ── CIFixContext basics ─────────────────────────────────────────────────────── + + +def test_context_init_defaults(): + ctx = _make_ctx() + assert ctx.ci_fix_run_id == "run-123" + assert ctx.repo == "owner/repo" + assert ctx.branch == "feature/foo" + assert ctx.commit_sha == "abc123" + assert ctx.original_build_id == "build-456" + assert ctx.structured_failure is None + assert ctx.classified_failure is None + assert ctx.reproduction_result is None + assert ctx.verified_patch is None + assert ctx.verification_result is None + assert ctx.fix_commit_sha is None + assert ctx.fix_pr_number is None + assert ctx.fix_branch is None + assert ctx.pr_was_existing is False + assert ctx.final_status == "in_progress" + assert ctx.pr_comment_posted is False + assert ctx.error is None + assert ctx.started_at is not None + + +def test_context_is_complete_initial(): + ctx = _make_ctx() + assert ctx.is_complete is False + + +def test_context_complete_fixed(): + ctx = _make_ctx() + ctx.complete("fixed") + assert ctx.is_complete is True + assert ctx.final_status == "fixed" + assert ctx.completed_at is not None + assert ctx.error is None + + +def test_context_complete_failed_with_error(): + ctx = _make_ctx() + ctx.complete("failed", error="something went wrong") + assert ctx.final_status == "failed" + assert ctx.error == "something went wrong" + + +def test_context_complete_escalated(): + ctx = _make_ctx() + ctx.complete("escalated") + assert ctx.final_status == "escalated" + assert ctx.is_complete is True + + +def test_context_complete_flaky(): + ctx = _make_ctx() + ctx.complete("flaky") + assert ctx.final_status == "flaky" + + +def test_context_complete_env_mismatch(): + ctx = _make_ctx() + ctx.complete("env_mismatch") + assert ctx.final_status == "env_mismatch" + + +# ── current_stage property ──────────────────────────────────────────────────── + + +def test_current_stage_started(): + ctx = _make_ctx() + assert ctx.current_stage == "started" + + +def test_current_stage_parsed(): + ctx = _make_ctx() + ctx.structured_failure = StructuredFailure( + tool="ruff", failure_type="lint", reproducer_cmd="ruff check ." + ) + assert ctx.current_stage == "parsed" + + +def test_current_stage_classified(): + ctx = _make_ctx() + ctx.structured_failure = StructuredFailure( + tool="ruff", failure_type="lint", reproducer_cmd="ruff check ." + ) + ctx.classified_failure = ClassifiedFailure( + tier="L1_auto", root_cause="unused import", stack="python" + ) + assert ctx.current_stage == "classified" + + +def test_current_stage_sandbox_ready(): + ctx = _make_ctx() + ctx.structured_failure = StructuredFailure( + tool="ruff", failure_type="lint", reproducer_cmd="ruff check ." + ) + ctx.classified_failure = ClassifiedFailure( + tier="L1_auto", root_cause="unused import", stack="python" + ) + ctx.sandbox_id = "container-abc" + assert ctx.current_stage == "sandbox_ready" + + +def test_current_stage_reproduced(): + ctx = _make_ctx() + ctx.reproduction_result = ReproductionResult(verdict="confirmed") + assert ctx.current_stage == "reproduced" + + +def test_current_stage_patched(): + ctx = _make_ctx() + ctx.reproduction_result = ReproductionResult(verdict="confirmed") + ctx.verified_patch = VerifiedPatch(files_modified=["src/foo.py"], success=True) + assert ctx.current_stage == "patched" + + +def test_current_stage_verified(): + ctx = _make_ctx() + ctx.reproduction_result = ReproductionResult(verdict="confirmed") + ctx.verified_patch = VerifiedPatch(files_modified=["src/foo.py"], success=True) + ctx.verification_result = VerificationResult(verdict="passed") + assert ctx.current_stage == "verified" + + +def test_current_stage_committed(): + ctx = _make_ctx() + ctx.fix_commit_sha = "deadbeef" + assert ctx.current_stage == "committed" + + +# ── Serialization round-trip ────────────────────────────────────────────────── + + +def test_to_dict_minimal(): + ctx = _make_ctx() + d = ctx.to_dict() + assert d["ci_fix_run_id"] == "run-123" + assert d["repo"] == "owner/repo" + assert d["structured_failure"] is None + assert d["final_status"] == "in_progress" + + +def test_to_dict_with_all_agents_populated(): + ctx = _make_ctx() + ctx.structured_failure = StructuredFailure( + tool="ruff", + failure_type="lint", + reproducer_cmd="ruff check .", + errors=[{"file": "foo.py", "line": 1, "code": "F401"}], + failing_files=["foo.py"], + log_excerpt="foo.py:1:1: F401 ...", + confidence=0.95, + ) + ctx.classified_failure = ClassifiedFailure( + tier="L1_auto", + root_cause="unused import", + stack="python", + confidence=0.9, + ) + ctx.reproduction_result = ReproductionResult( + verdict="confirmed", + exit_code=1, + output="F401 ...", + reproducer_cmd="ruff check .", + ) + ctx.verified_patch = VerifiedPatch( + files_modified=["foo.py"], + validation_cmd="ruff check foo.py", + validation_output="All checks passed!", + success=True, + turns_used=3, + ) + ctx.verification_result = VerificationResult( + verdict="passed", + output="pytest passed", + cmd_run="pytest tests/", + ) + ctx.fix_commit_sha = "abc123" + ctx.fix_pr_number = 42 + ctx.fix_branch = "phalanx/ci-fix/run-123" + ctx.complete("fixed") + + d = ctx.to_dict() + assert d["structured_failure"]["tool"] == "ruff" + assert d["classified_failure"]["tier"] == "L1_auto" + assert d["reproduction_result"]["verdict"] == "confirmed" + assert d["verified_patch"]["success"] is True + assert d["verification_result"]["verdict"] == "passed" + assert d["fix_commit_sha"] == "abc123" + assert d["fix_pr_number"] == 42 + assert d["final_status"] == "fixed" + + +def test_from_dict_round_trip_minimal(): + ctx = _make_ctx() + d = ctx.to_dict() + restored = CIFixContext.from_dict(d) + assert restored.ci_fix_run_id == ctx.ci_fix_run_id + assert restored.repo == ctx.repo + assert restored.structured_failure is None + assert restored.final_status == "in_progress" + + +def test_from_dict_round_trip_full(): + ctx = _make_ctx() + ctx.structured_failure = StructuredFailure( + tool="mypy", failure_type="type_error", reproducer_cmd="mypy ." + ) + ctx.classified_failure = ClassifiedFailure( + tier="L1_auto", root_cause="type mismatch", stack="python" + ) + ctx.reproduction_result = ReproductionResult(verdict="skipped") + ctx.verified_patch = VerifiedPatch(files_modified=["src/types.py"], success=True) + ctx.verification_result = VerificationResult(verdict="skipped") + ctx.fix_commit_sha = "sha456" + ctx.fix_pr_number = 7 + ctx.fix_pr_url = "https://github.com/owner/repo/pull/7" + ctx.fix_branch = "phalanx/ci-fix/run-123" + ctx.pr_was_existing = True + ctx.complete("fixed") + + d = ctx.to_dict() + restored = CIFixContext.from_dict(d) + + assert restored.structured_failure.tool == "mypy" + assert restored.classified_failure.tier == "L1_auto" + assert restored.reproduction_result.verdict == "skipped" + assert restored.verified_patch.success is True + assert restored.verification_result.verdict == "skipped" + assert restored.fix_commit_sha == "sha456" + assert restored.fix_pr_number == 7 + assert restored.fix_pr_url == "https://github.com/owner/repo/pull/7" + assert restored.pr_was_existing is True + assert restored.final_status == "fixed" + assert restored.is_complete is True + + +def test_json_serializable(): + ctx = _make_ctx() + ctx.structured_failure = StructuredFailure( + tool="ruff", failure_type="lint", reproducer_cmd="ruff check ." + ) + ctx.complete("fixed") + # Must not raise + serialized = json.dumps(ctx.to_dict()) + restored = CIFixContext.from_dict(json.loads(serialized)) + assert restored.final_status == "fixed" + + +def test_from_dict_missing_optional_fields(): + """from_dict should handle dicts missing optional fields gracefully.""" + d = { + "ci_fix_run_id": "run-xyz", + "repo": "owner/repo", + "branch": "main", + "commit_sha": "abc", + "original_build_id": "build-1", + } + ctx = CIFixContext.from_dict(d) + assert ctx.ci_fix_run_id == "run-xyz" + assert ctx.structured_failure is None + assert ctx.fix_pr_number is None + assert ctx.final_status == "in_progress" + assert ctx.pr_was_existing is False + + +# ── Sub-object tests ────────────────────────────────────────────────────────── + + +def test_structured_failure_defaults(): + sf = StructuredFailure(tool="ruff", failure_type="lint", reproducer_cmd="ruff check .") + assert sf.errors == [] + assert sf.failing_files == [] + assert sf.log_excerpt == "" + assert sf.confidence == 1.0 + + +def test_classified_failure_l2_escalate(): + cf = ClassifiedFailure( + tier="L2_escalate", + root_cause="test regression", + stack="python", + confidence=0.3, + escalation_reason="test failure requires engineer judgment", + ) + assert cf.tier == "L2_escalate" + assert cf.escalation_reason == "test failure requires engineer judgment" + + +def test_reproduction_result_all_verdicts(): + for verdict in ("confirmed", "flaky", "env_mismatch", "timeout", "skipped"): + r = ReproductionResult(verdict=verdict) + assert r.verdict == verdict + + +def test_verified_patch_defaults(): + vp = VerifiedPatch() + assert vp.files_modified == [] + assert vp.success is False + assert vp.turns_used == 0 + + +def test_verification_result_all_verdicts(): + for verdict in ("passed", "failed", "skipped", "timeout"): + vr = VerificationResult(verdict=verdict) + assert vr.verdict == verdict + + +# ── Edge cases ──────────────────────────────────────────────────────────────── + + +def test_context_pr_was_existing_default_false(): + ctx = _make_ctx() + assert ctx.pr_was_existing is False + + +def test_context_set_pr_was_existing(): + ctx = _make_ctx() + ctx.pr_was_existing = True + d = ctx.to_dict() + assert d["pr_was_existing"] is True + restored = CIFixContext.from_dict(d) + assert restored.pr_was_existing is True + + +def test_context_sandbox_fields(): + ctx = _make_ctx() + ctx.sandbox_id = "container-xyz" + ctx.sandbox_stack = "python" + d = ctx.to_dict() + restored = CIFixContext.from_dict(d) + assert restored.sandbox_id == "container-xyz" + assert restored.sandbox_stack == "python" + + +def test_context_error_persists_through_round_trip(): + ctx = _make_ctx() + ctx.complete("failed", error="ruff not found in sandbox") + d = ctx.to_dict() + restored = CIFixContext.from_dict(d) + assert restored.error == "ruff not found in sandbox" + + +def test_context_started_at_is_set(): + ctx = _make_ctx() + assert ctx.started_at + # Should be a valid ISO datetime string + from datetime import datetime + + datetime.fromisoformat(ctx.started_at) diff --git a/tests/unit/test_ci_fix_runs_api.py b/tests/unit/test_ci_fix_runs_api.py new file mode 100644 index 00000000..6b215497 --- /dev/null +++ b/tests/unit/test_ci_fix_runs_api.py @@ -0,0 +1,478 @@ +""" +Tests for phalanx.api.routes.ci_fix_runs — CI fix run context API. + +Coverage targets: + - GET /v1/ci-fix-runs/{run_id}/context — found, not found, no context, parse error + - GET /v1/ci-fix-runs/{run_id} — found, not found + - GET /v1/ci-fix-runs — list, filters + - _find_existing_fix_pr — found, not found, error handling +""" + +from __future__ import annotations + +import json +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from httpx import AsyncClient + +from phalanx.ci_fixer.context import CIFixContext, StructuredFailure + +# ── Helpers ─────────────────────────────────────────────────────────────────── + + +def _make_ci_run( + run_id="run-abc", + repo="owner/repo", + branch="main", + commit_sha="abc123", + build_id="build-1", + status="FIXED", + pipeline_context_json=None, + fix_pr_number=None, + fix_branch=None, + fix_commit_sha=None, + fingerprint_hash=None, + error=None, +): + run = MagicMock() + run.id = run_id + run.repo_full_name = repo + run.branch = branch + run.commit_sha = commit_sha + run.ci_build_id = build_id + run.ci_provider = "github_actions" + run.status = status + run.pipeline_context_json = pipeline_context_json + run.fix_pr_number = fix_pr_number + run.fix_branch = fix_branch + run.fix_commit_sha = fix_commit_sha + run.fingerprint_hash = fingerprint_hash + run.error = error + run.created_at = MagicMock() + run.created_at.isoformat.return_value = "2026-04-15T12:00:00+00:00" + run.completed_at = None + return run + + +def _make_context_json(run_id="run-abc") -> str: + ctx = CIFixContext( + ci_fix_run_id=run_id, + repo="owner/repo", + branch="main", + commit_sha="abc123", + original_build_id="build-1", + ) + ctx.structured_failure = StructuredFailure( + tool="ruff", failure_type="lint", reproducer_cmd="ruff check ." + ) + ctx.complete("fixed") + return json.dumps(ctx.to_dict()) + + +# ── GET /v1/ci-fix-runs/{run_id}/context ───────────────────────────────────── + + +@pytest.mark.asyncio +async def test_get_context_not_found(): + from phalanx.api.routes.ci_fix_runs import get_fix_run_context + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = None + mock_session.execute = AsyncMock(return_value=mock_result) + + mock_ctx_manager = AsyncMock() + mock_ctx_manager.__aenter__ = AsyncMock(return_value=mock_session) + mock_ctx_manager.__aexit__ = AsyncMock(return_value=None) + + with patch("phalanx.api.routes.ci_fix_runs.get_db", return_value=mock_ctx_manager): + from fastapi import HTTPException + + with pytest.raises(HTTPException) as exc_info: + await get_fix_run_context("nonexistent") + assert exc_info.value.status_code == 404 + + +@pytest.mark.asyncio +async def test_get_context_no_pipeline_json(): + """Run exists but has no pipeline_context_json (pre-Phase 1 run).""" + from phalanx.api.routes.ci_fix_runs import get_fix_run_context + + ci_run = _make_ci_run(pipeline_context_json=None, status="FIXED") + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = ci_run + mock_session.execute = AsyncMock(return_value=mock_result) + + mock_ctx_manager = AsyncMock() + mock_ctx_manager.__aenter__ = AsyncMock(return_value=mock_session) + mock_ctx_manager.__aexit__ = AsyncMock(return_value=None) + + with patch("phalanx.api.routes.ci_fix_runs.get_db", return_value=mock_ctx_manager): + result = await get_fix_run_context("run-abc") + + assert result["ci_fix_run_id"] == "run-abc" + assert result["final_status"] == "unknown" + assert "_note" in result + assert result["current_stage"] == "unknown" + + +@pytest.mark.asyncio +async def test_get_context_with_pipeline_json(): + """Run has pipeline_context_json — returns full parsed context.""" + from phalanx.api.routes.ci_fix_runs import get_fix_run_context + + ctx_json = _make_context_json("run-abc") + ci_run = _make_ci_run(pipeline_context_json=ctx_json, status="FIXED") + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = ci_run + mock_session.execute = AsyncMock(return_value=mock_result) + + mock_ctx_manager = AsyncMock() + mock_ctx_manager.__aenter__ = AsyncMock(return_value=mock_session) + mock_ctx_manager.__aexit__ = AsyncMock(return_value=None) + + with patch("phalanx.api.routes.ci_fix_runs.get_db", return_value=mock_ctx_manager): + result = await get_fix_run_context("run-abc") + + assert result["ci_fix_run_id"] == "run-abc" + assert result["final_status"] == "fixed" + assert result["current_stage"] in ("parsed", "committed", "patched", "classified", "started") + assert result["structured_failure"]["tool"] == "ruff" + + +@pytest.mark.asyncio +async def test_get_context_invalid_json(): + """pipeline_context_json is corrupt — returns 500.""" + from phalanx.api.routes.ci_fix_runs import get_fix_run_context + + ci_run = _make_ci_run(pipeline_context_json="not valid json{{{", status="FIXED") + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = ci_run + mock_session.execute = AsyncMock(return_value=mock_result) + + mock_ctx_manager = AsyncMock() + mock_ctx_manager.__aenter__ = AsyncMock(return_value=mock_session) + mock_ctx_manager.__aexit__ = AsyncMock(return_value=None) + + with patch("phalanx.api.routes.ci_fix_runs.get_db", return_value=mock_ctx_manager): + from fastapi import HTTPException + + with pytest.raises(HTTPException) as exc_info: + await get_fix_run_context("run-abc") + assert exc_info.value.status_code == 500 + + +# ── GET /v1/ci-fix-runs/{run_id} ───────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_get_fix_run_found(): + from phalanx.api.routes.ci_fix_runs import get_fix_run + + ci_run = _make_ci_run(fix_pr_number=7, fix_branch="phalanx/ci-fix/run-abc") + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = ci_run + mock_session.execute = AsyncMock(return_value=mock_result) + + mock_ctx_manager = AsyncMock() + mock_ctx_manager.__aenter__ = AsyncMock(return_value=mock_session) + mock_ctx_manager.__aexit__ = AsyncMock(return_value=None) + + with patch("phalanx.api.routes.ci_fix_runs.get_db", return_value=mock_ctx_manager): + result = await get_fix_run("run-abc") + + assert result["id"] == "run-abc" + assert result["fix_pr_number"] == 7 + assert result["fix_branch"] == "phalanx/ci-fix/run-abc" + assert result["has_context"] is False + + +@pytest.mark.asyncio +async def test_get_fix_run_not_found(): + from phalanx.api.routes.ci_fix_runs import get_fix_run + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = None + mock_session.execute = AsyncMock(return_value=mock_result) + + mock_ctx_manager = AsyncMock() + mock_ctx_manager.__aenter__ = AsyncMock(return_value=mock_session) + mock_ctx_manager.__aexit__ = AsyncMock(return_value=None) + + with patch("phalanx.api.routes.ci_fix_runs.get_db", return_value=mock_ctx_manager): + from fastapi import HTTPException + + with pytest.raises(HTTPException) as exc_info: + await get_fix_run("nonexistent") + assert exc_info.value.status_code == 404 + + +@pytest.mark.asyncio +async def test_get_fix_run_has_context_true(): + from phalanx.api.routes.ci_fix_runs import get_fix_run + + ctx_json = _make_context_json("run-abc") + ci_run = _make_ci_run(pipeline_context_json=ctx_json) + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = ci_run + mock_session.execute = AsyncMock(return_value=mock_result) + + mock_ctx_manager = AsyncMock() + mock_ctx_manager.__aenter__ = AsyncMock(return_value=mock_session) + mock_ctx_manager.__aexit__ = AsyncMock(return_value=None) + + with patch("phalanx.api.routes.ci_fix_runs.get_db", return_value=mock_ctx_manager): + result = await get_fix_run("run-abc") + + assert result["has_context"] is True + + +# ── GET /v1/ci-fix-runs (list) ──────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_list_fix_runs_empty(): + from phalanx.api.routes.ci_fix_runs import list_fix_runs + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalars.return_value.all.return_value = [] + mock_session.execute = AsyncMock(return_value=mock_result) + + mock_ctx_manager = AsyncMock() + mock_ctx_manager.__aenter__ = AsyncMock(return_value=mock_session) + mock_ctx_manager.__aexit__ = AsyncMock(return_value=None) + + with patch("phalanx.api.routes.ci_fix_runs.get_db", return_value=mock_ctx_manager): + result = await list_fix_runs(limit=20, run_status=None) + + assert result["runs"] == [] + assert result["count"] == 0 + + +@pytest.mark.asyncio +async def test_list_fix_runs_with_results(): + from phalanx.api.routes.ci_fix_runs import list_fix_runs + + runs = [ + _make_ci_run(run_id="run-1", status="FIXED"), + _make_ci_run(run_id="run-2", status="FAILED", error="no_structured_errors"), + ] + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalars.return_value.all.return_value = runs + mock_session.execute = AsyncMock(return_value=mock_result) + + mock_ctx_manager = AsyncMock() + mock_ctx_manager.__aenter__ = AsyncMock(return_value=mock_session) + mock_ctx_manager.__aexit__ = AsyncMock(return_value=None) + + with patch("phalanx.api.routes.ci_fix_runs.get_db", return_value=mock_ctx_manager): + result = await list_fix_runs(limit=20, run_status=None) + + assert result["count"] == 2 + assert result["runs"][0]["id"] == "run-1" + assert result["runs"][1]["error"] == "no_structured_errors" + + +@pytest.mark.asyncio +async def test_list_fix_runs_filters_applied(): + """Filters are passed through — just test the query builds without error.""" + from phalanx.api.routes.ci_fix_runs import list_fix_runs + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalars.return_value.all.return_value = [] + mock_session.execute = AsyncMock(return_value=mock_result) + + mock_ctx_manager = AsyncMock() + mock_ctx_manager.__aenter__ = AsyncMock(return_value=mock_session) + mock_ctx_manager.__aexit__ = AsyncMock(return_value=None) + + with patch("phalanx.api.routes.ci_fix_runs.get_db", return_value=mock_ctx_manager): + result = await list_fix_runs(repo="owner/repo", branch="main", run_status="FIXED", limit=5) + + assert result["count"] == 0 + + +# ── _find_existing_fix_pr ───────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_find_existing_fix_pr_found(): + """Returns PR number when an open phalanx/ci-fix/* PR exists.""" + from phalanx.agents.ci_fixer import CIFixerAgent + + agent = CIFixerAgent.__new__(CIFixerAgent) + agent._log = MagicMock() + agent._log.info = MagicMock() + agent._log.warning = MagicMock() + + integration = MagicMock() + integration.github_token = "ghp_test" + + ci_run = MagicMock() + ci_run.repo_full_name = "owner/repo" + ci_run.branch = "feature/foo" + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = [ + { + "number": 42, + "head": {"ref": "phalanx/ci-fix/old-run-id"}, + } + ] + + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + + with patch("httpx.AsyncClient", return_value=mock_client): + result = await agent._find_existing_fix_pr(integration, ci_run) + + assert result == 42 + agent._log.info.assert_called_once() + + +@pytest.mark.asyncio +async def test_find_existing_fix_pr_not_found(): + """Returns None when no phalanx/ci-fix/* PR exists.""" + from phalanx.agents.ci_fixer import CIFixerAgent + + agent = CIFixerAgent.__new__(CIFixerAgent) + agent._log = MagicMock() + agent._log.info = MagicMock() + agent._log.warning = MagicMock() + + integration = MagicMock() + integration.github_token = "ghp_test" + + ci_run = MagicMock() + ci_run.repo_full_name = "owner/repo" + ci_run.branch = "feature/foo" + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = [ + { + "number": 5, + "head": {"ref": "feature/some-other-fix"}, # not a phalanx fix branch + } + ] + + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + + with patch("httpx.AsyncClient", return_value=mock_client): + result = await agent._find_existing_fix_pr(integration, ci_run) + + assert result is None + + +@pytest.mark.asyncio +async def test_find_existing_fix_pr_api_error(): + """Returns None on HTTP error — does not raise.""" + from phalanx.agents.ci_fixer import CIFixerAgent + + agent = CIFixerAgent.__new__(CIFixerAgent) + agent._log = MagicMock() + agent._log.info = MagicMock() + agent._log.warning = MagicMock() + + integration = MagicMock() + integration.github_token = "ghp_test" + + ci_run = MagicMock() + ci_run.repo_full_name = "owner/repo" + ci_run.branch = "feature/foo" + + mock_client = AsyncMock() + mock_client.get = AsyncMock(side_effect=Exception("network error")) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + + with patch("httpx.AsyncClient", return_value=mock_client): + result = await agent._find_existing_fix_pr(integration, ci_run) + + assert result is None + agent._log.warning.assert_called_once() + + +@pytest.mark.asyncio +async def test_find_existing_fix_pr_non_200(): + """Returns None when GitHub API returns non-200.""" + from phalanx.agents.ci_fixer import CIFixerAgent + + agent = CIFixerAgent.__new__(CIFixerAgent) + agent._log = MagicMock() + agent._log.info = MagicMock() + agent._log.warning = MagicMock() + + integration = MagicMock() + integration.github_token = "ghp_test" + + ci_run = MagicMock() + ci_run.repo_full_name = "owner/repo" + ci_run.branch = "feature/foo" + + mock_response = MagicMock() + mock_response.status_code = 401 + + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + + with patch("httpx.AsyncClient", return_value=mock_client): + result = await agent._find_existing_fix_pr(integration, ci_run) + + assert result is None + + +@pytest.mark.asyncio +async def test_find_existing_fix_pr_empty_list(): + """Returns None when PR list is empty.""" + from phalanx.agents.ci_fixer import CIFixerAgent + + agent = CIFixerAgent.__new__(CIFixerAgent) + agent._log = MagicMock() + agent._log.info = MagicMock() + agent._log.warning = MagicMock() + + integration = MagicMock() + integration.github_token = "ghp_test" + + ci_run = MagicMock() + ci_run.repo_full_name = "owner/repo" + ci_run.branch = "feature/foo" + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = [] + + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + + with patch("httpx.AsyncClient", return_value=mock_client): + result = await agent._find_existing_fix_pr(integration, ci_run) + + assert result is None diff --git a/tests/unit/test_ci_fixer_agent_helpers.py b/tests/unit/test_ci_fixer_agent_helpers.py index a815e185..0b0d4200 100644 --- a/tests/unit/test_ci_fixer_agent_helpers.py +++ b/tests/unit/test_ci_fixer_agent_helpers.py @@ -7,11 +7,9 @@ from __future__ import annotations -from pathlib import Path +from typing import TYPE_CHECKING from unittest.mock import MagicMock, patch -import pytest - from phalanx.agents.ci_fixer import ( CIFixerAgent, _cleanup_workspace, @@ -21,6 +19,8 @@ from phalanx.ci_fixer.analyst import FilePatch from phalanx.ci_fixer.log_parser import LintError, ParsedLog, TestFailure, TypeError +if TYPE_CHECKING: + from pathlib import Path # ── Helpers ──────────────────────────────────────────────────────────────────── @@ -34,8 +34,9 @@ def _make_agent() -> CIFixerAgent: return agent -def _lint_parsed(file: str = "phalanx/foo.py", code: str = "F401", - msg: str = "unused import 'os'") -> ParsedLog: +def _lint_parsed( + file: str = "phalanx/foo.py", code: str = "F401", msg: str = "unused import 'os'" +) -> ParsedLog: return ParsedLog( tool="ruff", lint_errors=[LintError(file=file, line=5, col=1, code=code, message=msg)], @@ -61,12 +62,22 @@ def test_returns_16_char_hex(self): def test_same_error_class_same_hash(self): # Different line numbers → same hash (lines stripped) - p1 = ParsedLog(tool="ruff", lint_errors=[ - LintError(file="phalanx/foo.py", line=3, col=1, code="F401", message="unused import 'os'"), - ]) - p2 = ParsedLog(tool="ruff", lint_errors=[ - LintError(file="phalanx/foo.py", line=99, col=1, code="F401", message="unused import 'os'"), - ]) + p1 = ParsedLog( + tool="ruff", + lint_errors=[ + LintError( + file="phalanx/foo.py", line=3, col=1, code="F401", message="unused import 'os'" + ), + ], + ) + p2 = ParsedLog( + tool="ruff", + lint_errors=[ + LintError( + file="phalanx/foo.py", line=99, col=1, code="F401", message="unused import 'os'" + ), + ], + ) assert _compute_fingerprint(p1) == _compute_fingerprint(p2) def test_different_error_code_different_hash(self): @@ -90,11 +101,13 @@ def test_type_error_included(self): def test_test_failure_included(self): parsed = ParsedLog( tool="pytest", - test_failures=[TestFailure( - test_id="tests/unit/test_foo.py::test_bar", - file="tests/unit/test_foo.py", - message="AssertionError", - )], + test_failures=[ + TestFailure( + test_id="tests/unit/test_foo.py::test_bar", + file="tests/unit/test_foo.py", + message="AssertionError", + ) + ], ) h = _compute_fingerprint(parsed) assert len(h) == 16 @@ -106,14 +119,26 @@ def test_empty_log_has_stable_hash(self): def test_parametrized_tests_normalized(self): """test[param1] and test[param2] should yield same fingerprint.""" - p1 = ParsedLog(tool="pytest", test_failures=[ - TestFailure(test_id="tests/test_foo.py::test_bar[case1]", - file="tests/test_foo.py", message=""), - ]) - p2 = ParsedLog(tool="pytest", test_failures=[ - TestFailure(test_id="tests/test_foo.py::test_bar[case2]", - file="tests/test_foo.py", message=""), - ]) + p1 = ParsedLog( + tool="pytest", + test_failures=[ + TestFailure( + test_id="tests/test_foo.py::test_bar[case1]", + file="tests/test_foo.py", + message="", + ), + ], + ) + p2 = ParsedLog( + tool="pytest", + test_failures=[ + TestFailure( + test_id="tests/test_foo.py::test_bar[case2]", + file="tests/test_foo.py", + message="", + ), + ], + ) assert _compute_fingerprint(p1) == _compute_fingerprint(p2) def test_numbers_in_messages_normalized(self): @@ -144,11 +169,13 @@ def test_type_errors_formatted(self): def test_test_failures_formatted(self): parsed = ParsedLog( tool="pytest", - test_failures=[TestFailure( - test_id="tests/unit/test_foo.py::test_bar", - file="tests/unit/test_foo.py", - message="", - )], + test_failures=[ + TestFailure( + test_id="tests/unit/test_foo.py::test_bar", + file="tests/unit/test_foo.py", + message="", + ) + ], ) result = _format_error_detail(parsed) assert "test_bar" in result @@ -225,8 +252,7 @@ def test_applies_line_range_replacement(self, tmp_path): assert "import os" not in result def test_missing_file_skipped(self, tmp_path): - patch = FilePatch(path="src/missing.py", start_line=1, end_line=1, - corrected_lines=["x\n"]) + patch = FilePatch(path="src/missing.py", start_line=1, end_line=1, corrected_lines=["x\n"]) agent = self._agent() written = agent._apply_patches(tmp_path, [patch]) assert written == [] @@ -234,8 +260,7 @@ def test_missing_file_skipped(self, tmp_path): def test_bounds_out_of_range_skipped(self, tmp_path): lines = ["a\n", "b\n"] _write(tmp_path, "src/foo.py", lines) - patch = FilePatch(path="src/foo.py", start_line=5, end_line=10, - corrected_lines=["x\n"]) + patch = FilePatch(path="src/foo.py", start_line=5, end_line=10, corrected_lines=["x\n"]) agent = self._agent() written = agent._apply_patches(tmp_path, [patch]) assert written == [] @@ -245,8 +270,7 @@ def test_delta_too_large_skipped(self, tmp_path): _write(tmp_path, "src/foo.py", lines) # Add 35 lines — exceeds _MAX_TOTAL_LINE_DELTA=30 huge = [f"added {i}\n" for i in range(35)] - patch = FilePatch(path="src/foo.py", start_line=1, end_line=1, - corrected_lines=huge) + patch = FilePatch(path="src/foo.py", start_line=1, end_line=1, corrected_lines=huge) agent = self._agent() written = agent._apply_patches(tmp_path, [patch]) assert written == [] @@ -255,12 +279,12 @@ def test_multiple_patches_applied_in_order(self, tmp_path): lines = ["line 1\n", "import os\n", "import sys\n", "line 4\n"] _write(tmp_path, "src/foo.py", lines) - p1 = FilePatch(path="src/foo.py", start_line=2, end_line=2, - corrected_lines=["# os removed\n"]) + p1 = FilePatch( + path="src/foo.py", start_line=2, end_line=2, corrected_lines=["# os removed\n"] + ) # After p1, file changes — p2 targets a different file _write(tmp_path, "src/bar.py", ["x = 1\n", "y = 2\n"]) - p2 = FilePatch(path="src/bar.py", start_line=1, end_line=1, - corrected_lines=["x = 10\n"]) + p2 = FilePatch(path="src/bar.py", start_line=1, end_line=1, corrected_lines=["x = 10\n"]) agent = self._agent() written = agent._apply_patches(tmp_path, [p1, p2]) diff --git a/tests/unit/test_ci_fixer_agent_helpers2.py b/tests/unit/test_ci_fixer_agent_helpers2.py index 7c41399a..163c7d25 100644 --- a/tests/unit/test_ci_fixer_agent_helpers2.py +++ b/tests/unit/test_ci_fixer_agent_helpers2.py @@ -11,14 +11,11 @@ from __future__ import annotations -import asyncio -from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest -from phalanx.agents.ci_fixer import CIFixerAgent, _cleanup_workspace, _compute_fingerprint - +from phalanx.agents.ci_fixer import CIFixerAgent # ── helpers ──────────────────────────────────────────────────────────────────── @@ -55,8 +52,9 @@ async def test_execute_catches_unhandled_exception(): """execute() wraps _execute_inner exceptions and returns AgentResult(success=False).""" agent = _make_agent() - with patch.object(agent, "_execute_inner", new_callable=AsyncMock, - side_effect=RuntimeError("unexpected boom")): + with patch.object( + agent, "_execute_inner", new_callable=AsyncMock, side_effect=RuntimeError("unexpected boom") + ): result = await agent.execute() assert result.success is False @@ -67,10 +65,15 @@ async def test_execute_catches_unhandled_exception(): async def test_execute_returns_inner_result_on_success(): """execute() propagates AgentResult from _execute_inner.""" from phalanx.agents.base import AgentResult + agent = _make_agent() - with patch.object(agent, "_execute_inner", new_callable=AsyncMock, - return_value=AgentResult(success=True, output={"done": True})): + with patch.object( + agent, + "_execute_inner", + new_callable=AsyncMock, + return_value=AgentResult(success=True, output={"done": True}), + ): result = await agent.execute() assert result.success is True @@ -172,13 +175,16 @@ async def mock_execute(_stmt): # parse_log returns empty → no errors from phalanx.ci_fixer.log_parser import ParsedLog + empty_parsed = ParsedLog(tool="unknown") - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value=""), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=empty_parsed), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_mark_failed", new_callable=AsyncMock): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value=""), + patch("phalanx.agents.ci_fixer.parse_log", return_value=empty_parsed), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_mark_failed", new_callable=AsyncMock), + ): result = await agent._execute_inner() assert result.success is False @@ -193,6 +199,7 @@ async def test_load_flaky_patterns_no_lint_errors(): """Returns [] immediately when no lint/type errors.""" agent = _make_agent() from phalanx.ci_fixer.log_parser import ParsedLog + parsed = ParsedLog(tool="pytest") # only test failures, no lint errors result = await agent._load_flaky_patterns("acme/backend", parsed) @@ -207,7 +214,7 @@ async def test_load_flaky_patterns_returns_rows(): parsed = ParsedLog( tool="ruff", - lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="x")] + lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="x")], ) mock_pattern = MagicMock() @@ -230,7 +237,7 @@ async def test_load_flaky_patterns_db_error_returns_empty(): parsed = ParsedLog( tool="ruff", - lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="x")] + lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="x")], ) with patch("phalanx.agents.ci_fixer.get_db", side_effect=Exception("DB down")): @@ -251,8 +258,11 @@ async def test_clone_repo_generic_exception_returns_false(tmp_path): mock_repo_class = MagicMock() mock_repo_class.clone_from.side_effect = Exception("authentication failed") - with patch("phalanx.agents.ci_fixer.CIFixerAgent._clone_repo", - new_callable=AsyncMock, return_value=False): + with patch( + "phalanx.agents.ci_fixer.CIFixerAgent._clone_repo", + new_callable=AsyncMock, + return_value=False, + ): result = await agent._clone_repo(tmp_path, "acme/backend", "main", "abc", "token") assert result is False @@ -270,8 +280,11 @@ async def test_clone_repo_existing_git_dir(tmp_path): mock_repo.remotes.origin.fetch = MagicMock() mock_repo.git.checkout = MagicMock() - with patch("phalanx.agents.ci_fixer.CIFixerAgent._clone_repo", - new_callable=AsyncMock, return_value=True): + with patch( + "phalanx.agents.ci_fixer.CIFixerAgent._clone_repo", + new_callable=AsyncMock, + return_value=True, + ): result = await agent._clone_repo(tmp_path, "acme/backend", "main", "abc", "token") assert result is True @@ -289,8 +302,11 @@ async def test_commit_to_safe_branch_not_git_repo(tmp_path): try: from git.exc import InvalidGitRepositoryError - with patch("phalanx.agents.ci_fixer.CIFixerAgent._commit_to_safe_branch", - new_callable=AsyncMock, return_value={"sha": None, "error": "not a git repo"}): + with patch( + "phalanx.agents.ci_fixer.CIFixerAgent._commit_to_safe_branch", + new_callable=AsyncMock, + return_value={"sha": None, "error": "not a git repo"}, + ): result = await agent._commit_to_safe_branch( workspace=tmp_path, source_branch="main", @@ -311,9 +327,11 @@ async def test_commit_to_safe_branch_exception(tmp_path): """Exception → returns sha=None with error key.""" agent = _make_agent() - with patch("phalanx.agents.ci_fixer.CIFixerAgent._commit_to_safe_branch", - new_callable=AsyncMock, - return_value={"sha": None, "error": "something went wrong"}): + with patch( + "phalanx.agents.ci_fixer.CIFixerAgent._commit_to_safe_branch", + new_callable=AsyncMock, + return_value={"sha": None, "error": "something went wrong"}, + ): result = await agent._commit_to_safe_branch( workspace=tmp_path, source_branch="main", @@ -342,6 +360,7 @@ async def test_comment_on_pr_no_fix_pr(): ci_run.branch = "feature/x" from phalanx.ci_fixer.log_parser import ParsedLog + parsed = ParsedLog(tool="ruff") resp = MagicMock() @@ -378,8 +397,10 @@ def test_execute_task_runs_agent(): """execute_task creates CIFixerAgent and runs it.""" from phalanx.agents.ci_fixer import execute_task - with patch("phalanx.agents.ci_fixer.CIFixerAgent") as MockAgent, \ - patch("phalanx.agents.ci_fixer.asyncio.run") as mock_run: + with ( + patch("phalanx.agents.ci_fixer.CIFixerAgent") as MockAgent, + patch("phalanx.agents.ci_fixer.asyncio.run") as mock_run, + ): mock_instance = MagicMock() MockAgent.return_value = mock_instance execute_task("run-001") @@ -390,9 +411,10 @@ def test_execute_task_reraises_exception(): """execute_task re-raises exceptions after logging.""" from phalanx.agents.ci_fixer import execute_task - with patch("phalanx.agents.ci_fixer.CIFixerAgent") as MockAgent, \ - patch("phalanx.agents.ci_fixer.asyncio.run", - side_effect=RuntimeError("boom")): + with ( + patch("phalanx.agents.ci_fixer.CIFixerAgent") as MockAgent, + patch("phalanx.agents.ci_fixer.asyncio.run", side_effect=RuntimeError("boom")), + ): MockAgent.return_value = MagicMock() with pytest.raises(RuntimeError, match="boom"): execute_task("run-001") @@ -488,12 +510,21 @@ async def test_run_scan_posts_comment_for_warnings(): findings = [ProactiveFinding("fp1", "ruff", "pattern", "warning", ["src/foo.py"])] - with patch("phalanx.ci_fixer.proactive_scanner.scan_pr_for_patterns", - new_callable=AsyncMock, return_value=findings), \ - patch("phalanx.ci_fixer.proactive_scanner._post_comment", - new_callable=AsyncMock, return_value=42), \ - patch("phalanx.ci_fixer.proactive_scanner._record_scan", - new_callable=AsyncMock) as mock_record: + with ( + patch( + "phalanx.ci_fixer.proactive_scanner.scan_pr_for_patterns", + new_callable=AsyncMock, + return_value=findings, + ), + patch( + "phalanx.ci_fixer.proactive_scanner._post_comment", + new_callable=AsyncMock, + return_value=42, + ), + patch( + "phalanx.ci_fixer.proactive_scanner._record_scan", new_callable=AsyncMock + ) as mock_record, + ): await _run_scan("acme/backend", 1, "abc", "token") mock_record.assert_called_once() @@ -509,12 +540,17 @@ async def test_run_scan_no_comment_for_info_only(): findings = [ProactiveFinding("fp1", "ruff", "pattern", "info", ["src/foo.py"])] - with patch("phalanx.ci_fixer.proactive_scanner.scan_pr_for_patterns", - new_callable=AsyncMock, return_value=findings), \ - patch("phalanx.ci_fixer.proactive_scanner._post_comment", - new_callable=AsyncMock) as mock_post, \ - patch("phalanx.ci_fixer.proactive_scanner._record_scan", - new_callable=AsyncMock): + with ( + patch( + "phalanx.ci_fixer.proactive_scanner.scan_pr_for_patterns", + new_callable=AsyncMock, + return_value=findings, + ), + patch( + "phalanx.ci_fixer.proactive_scanner._post_comment", new_callable=AsyncMock + ) as mock_post, + patch("phalanx.ci_fixer.proactive_scanner._record_scan", new_callable=AsyncMock), + ): await _run_scan("acme/backend", 1, "abc", "token") mock_post.assert_not_called() diff --git a/tests/unit/test_ci_fixer_agent_p4.py b/tests/unit/test_ci_fixer_agent_p4.py index 06c69f4e..9e7b1302 100644 --- a/tests/unit/test_ci_fixer_agent_p4.py +++ b/tests/unit/test_ci_fixer_agent_p4.py @@ -10,16 +10,16 @@ from __future__ import annotations -import json -from pathlib import Path +from typing import TYPE_CHECKING from unittest.mock import AsyncMock, MagicMock, patch import pytest from phalanx.agents.ci_fixer import CIFixerAgent from phalanx.ci_fixer.analyst import FilePatch -from phalanx.ci_fixer.version_parity import VersionParityResult +if TYPE_CHECKING: + from pathlib import Path # ── Helpers ──────────────────────────────────────────────────────────────────── @@ -182,7 +182,11 @@ async def test_enable_github_auto_merge_success(): gql_response = MagicMock() gql_response.status_code = 200 gql_response.json.return_value = { - "data": {"enablePullRequestAutoMerge": {"pullRequest": {"autoMergeRequest": {"mergeMethod": "SQUASH"}}}} + "data": { + "enablePullRequestAutoMerge": { + "pullRequest": {"autoMergeRequest": {"mergeMethod": "SQUASH"}} + } + } } call_count = {"get": 0, "post": 0} @@ -265,6 +269,7 @@ async def test_open_draft_pr_creates_draft(): ci_run.pr_number = None from phalanx.ci_fixer.log_parser import ParsedLog + parsed = ParsedLog(tool="ruff") pr_response = MagicMock() @@ -309,6 +314,7 @@ async def test_open_draft_pr_with_auto_merge(): ci_run.pr_number = 10 from phalanx.ci_fixer.log_parser import ParsedLog + parsed = ParsedLog(tool="ruff") pr_response = MagicMock() @@ -325,8 +331,10 @@ async def test_open_draft_pr_with_auto_merge(): async def mock_enable_auto_merge(**kwargs): enable_auto_merge_called["n"] += 1 - with patch("httpx.AsyncClient", return_value=mock_client), \ - patch.object(agent, "_enable_github_auto_merge", side_effect=mock_enable_auto_merge): + with ( + patch("httpx.AsyncClient", return_value=mock_client), + patch.object(agent, "_enable_github_auto_merge", side_effect=mock_enable_auto_merge), + ): pr_num = await agent._open_draft_pr( integration=integration, ci_run=ci_run, @@ -361,6 +369,7 @@ async def test_open_draft_pr_failure_returns_none(): ci_run.pr_number = None from phalanx.ci_fixer.log_parser import ParsedLog + parsed = ParsedLog(tool="ruff") pr_response = MagicMock() @@ -415,7 +424,6 @@ async def mock_execute(stmt): mock_session.execute = mock_execute - from phalanx.ci_fixer.analyst import FilePatch from phalanx.ci_fixer.log_parser import ParsedLog patches = [FilePatch(path="src/foo.py", start_line=1, end_line=1, corrected_lines=["x\n"])] @@ -466,7 +474,6 @@ async def mock_execute(stmt): mock_ctx.__aenter__ = AsyncMock(return_value=mock_session) mock_ctx.__aexit__ = AsyncMock(return_value=None) - from phalanx.ci_fixer.analyst import FilePatch from phalanx.ci_fixer.log_parser import ParsedLog patches = [FilePatch(path="src/foo.py", start_line=1, end_line=1, corrected_lines=["x\n"])] diff --git a/tests/unit/test_ci_fixer_analyst_loop.py b/tests/unit/test_ci_fixer_analyst_loop.py index 7068e20d..db40e4d5 100644 --- a/tests/unit/test_ci_fixer_analyst_loop.py +++ b/tests/unit/test_ci_fixer_analyst_loop.py @@ -11,19 +11,16 @@ from __future__ import annotations -import json -from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest from phalanx.agents.ci_fixer import ( - CIFixerAgent, _MAX_FILES_CHANGED, _MAX_TOTAL_LINE_DELTA, + CIFixerAgent, ) - # ── helpers ──────────────────────────────────────────────────────────────────── @@ -136,23 +133,27 @@ async def test_execute_inner_delta_guard_exceeded(): ], ) - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), \ - patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), \ - patch.object(agent, "_trace", new_callable=AsyncMock), \ - patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, \ - patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), + patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), + patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), + patch.object(agent, "_trace", new_callable=AsyncMock), + patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, + patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock), + ): mock_analyst_inst = MagicMock() mock_analyst_inst.analyze.return_value = big_plan MockAnalyst.return_value = mock_analyst_inst result = await agent._execute_inner() assert result.success is False - assert "large" in result.output.get("root_cause", "").lower() or result.output.get("reason") in ("low_confidence",) + assert "large" in result.output.get("root_cause", "").lower() or result.output.get( + "reason" + ) in ("low_confidence",) # ── analyst loop: too many files guard ──────────────────────────────────────── @@ -170,16 +171,18 @@ async def test_execute_inner_too_many_files(): big_plan = _make_fix_plan_with_patches(n_patches=_MAX_FILES_CHANGED + 2) - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), \ - patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), \ - patch.object(agent, "_trace", new_callable=AsyncMock), \ - patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, \ - patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), + patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), + patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), + patch.object(agent, "_trace", new_callable=AsyncMock), + patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, + patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock), + ): mock_analyst_inst = MagicMock() mock_analyst_inst.analyze.return_value = big_plan MockAnalyst.return_value = mock_analyst_inst @@ -203,17 +206,19 @@ async def test_execute_inner_no_files_written(): parsed = _make_parsed_with_lint() good_plan = _make_fix_plan_with_patches(n_patches=1) - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), \ - patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), \ - patch.object(agent, "_trace", new_callable=AsyncMock), \ - patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, \ - patch.object(agent, "_apply_patches", return_value=[]), \ - patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), + patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), + patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), + patch.object(agent, "_trace", new_callable=AsyncMock), + patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, + patch.object(agent, "_apply_patches", return_value=[]), + patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock), + ): mock_analyst_inst = MagicMock() mock_analyst_inst.analyze.return_value = good_plan MockAnalyst.return_value = mock_analyst_inst @@ -243,19 +248,21 @@ async def test_execute_inner_validation_failed_with_pr(): mock_validation.tool_version = "ruff 0.4.0" mock_validation.output = "still failing" - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), \ - patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), \ - patch.object(agent, "_trace", new_callable=AsyncMock), \ - patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, \ - patch.object(agent, "_apply_patches", return_value=["src/foo.py"]), \ - patch("phalanx.agents.ci_fixer.validate_fix", return_value=mock_validation), \ - patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock), \ - patch.object(agent, "_comment_unable_to_fix", new_callable=AsyncMock) as mock_unable: + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), + patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), + patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), + patch.object(agent, "_trace", new_callable=AsyncMock), + patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, + patch.object(agent, "_apply_patches", return_value=["src/foo.py"]), + patch("phalanx.agents.ci_fixer.validate_fix", return_value=mock_validation), + patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock), + patch.object(agent, "_comment_unable_to_fix", new_callable=AsyncMock) as mock_unable, + ): mock_analyst_inst = MagicMock() mock_analyst_inst.analyze.return_value = good_plan MockAnalyst.return_value = mock_analyst_inst @@ -288,23 +295,33 @@ async def test_execute_inner_commit_failed(): from phalanx.ci_fixer.version_parity import VersionParityResult - mock_parity = VersionParityResult(ok=True, local_version="ruff 0.4.0", failure_version="", reason="ok") - - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), \ - patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), \ - patch.object(agent, "_trace", new_callable=AsyncMock), \ - patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, \ - patch.object(agent, "_apply_patches", return_value=["src/foo.py"]), \ - patch("phalanx.agents.ci_fixer.validate_fix", return_value=mock_validation), \ - patch.object(agent, "_check_tool_version_parity", new_callable=AsyncMock, return_value=mock_parity), \ - patch.object(agent, "_commit_to_safe_branch", new_callable=AsyncMock, - return_value={"sha": None, "error": "commit failed"}), \ - patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock): + mock_parity = VersionParityResult( + ok=True, local_version="ruff 0.4.0", failure_version="", reason="ok" + ) + + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), + patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), + patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), + patch.object(agent, "_trace", new_callable=AsyncMock), + patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, + patch.object(agent, "_apply_patches", return_value=["src/foo.py"]), + patch("phalanx.agents.ci_fixer.validate_fix", return_value=mock_validation), + patch.object( + agent, "_check_tool_version_parity", new_callable=AsyncMock, return_value=mock_parity + ), + patch.object( + agent, + "_commit_to_safe_branch", + new_callable=AsyncMock, + return_value={"sha": None, "error": "commit failed"}, + ), + patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock), + ): mock_analyst_inst = MagicMock() mock_analyst_inst.analyze.return_value = good_plan MockAnalyst.return_value = mock_analyst_inst @@ -356,26 +373,42 @@ async def mock_execute(_stmt): from phalanx.ci_fixer.version_parity import VersionParityResult - mock_parity = VersionParityResult(ok=True, local_version="ruff 0.4.0", failure_version="", reason="ok") - - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), \ - patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), \ - patch.object(agent, "_trace", new_callable=AsyncMock), \ - patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, \ - patch.object(agent, "_apply_patches", return_value=["src/foo.py"]), \ - patch("phalanx.agents.ci_fixer.validate_fix", return_value=mock_validation), \ - patch.object(agent, "_check_tool_version_parity", new_callable=AsyncMock, return_value=mock_parity), \ - patch.object(agent, "_get_fingerprint_success_count", new_callable=AsyncMock, return_value=0), \ - patch.object(agent, "_commit_to_safe_branch", new_callable=AsyncMock, - return_value={"sha": "abc12345", "branch": "phalanx/ci-fix/run-loop-001", "push_failed": False}), \ - patch.object(agent, "_open_draft_pr", new_callable=AsyncMock, return_value=42), \ - patch.object(agent, "_comment_on_pr", new_callable=AsyncMock), \ - patch.object(agent, "_update_fingerprint_on_success", new_callable=AsyncMock): + mock_parity = VersionParityResult( + ok=True, local_version="ruff 0.4.0", failure_version="", reason="ok" + ) + + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), + patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), + patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), + patch.object(agent, "_trace", new_callable=AsyncMock), + patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, + patch.object(agent, "_apply_patches", return_value=["src/foo.py"]), + patch("phalanx.agents.ci_fixer.validate_fix", return_value=mock_validation), + patch.object( + agent, "_check_tool_version_parity", new_callable=AsyncMock, return_value=mock_parity + ), + patch.object( + agent, "_get_fingerprint_success_count", new_callable=AsyncMock, return_value=0 + ), + patch.object( + agent, + "_commit_to_safe_branch", + new_callable=AsyncMock, + return_value={ + "sha": "abc12345", + "branch": "phalanx/ci-fix/run-loop-001", + "push_failed": False, + }, + ), + patch.object(agent, "_open_draft_pr", new_callable=AsyncMock, return_value=42), + patch.object(agent, "_comment_on_pr", new_callable=AsyncMock), + patch.object(agent, "_update_fingerprint_on_success", new_callable=AsyncMock), + ): mock_analyst_inst = MagicMock() mock_analyst_inst.analyze.return_value = good_plan MockAnalyst.return_value = mock_analyst_inst @@ -427,22 +460,38 @@ async def mock_execute(_stmt): mock_parity = VersionParityResult(ok=True, local_version="", failure_version="", reason="ok") - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), \ - patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), \ - patch.object(agent, "_trace", new_callable=AsyncMock), \ - patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, \ - patch.object(agent, "_apply_patches", return_value=["src/foo.py"]), \ - patch("phalanx.agents.ci_fixer.validate_fix", return_value=mock_validation), \ - patch.object(agent, "_check_tool_version_parity", new_callable=AsyncMock, return_value=mock_parity), \ - patch.object(agent, "_get_fingerprint_success_count", new_callable=AsyncMock, return_value=0), \ - patch.object(agent, "_commit_to_safe_branch", new_callable=AsyncMock, - return_value={"sha": "deadbeef", "branch": "phalanx/ci-fix/run-loop-001", "push_failed": True}), \ - patch.object(agent, "_update_fingerprint_on_success", new_callable=AsyncMock) as mock_fp_update: + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), + patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), + patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), + patch.object(agent, "_trace", new_callable=AsyncMock), + patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, + patch.object(agent, "_apply_patches", return_value=["src/foo.py"]), + patch("phalanx.agents.ci_fixer.validate_fix", return_value=mock_validation), + patch.object( + agent, "_check_tool_version_parity", new_callable=AsyncMock, return_value=mock_parity + ), + patch.object( + agent, "_get_fingerprint_success_count", new_callable=AsyncMock, return_value=0 + ), + patch.object( + agent, + "_commit_to_safe_branch", + new_callable=AsyncMock, + return_value={ + "sha": "deadbeef", + "branch": "phalanx/ci-fix/run-loop-001", + "push_failed": True, + }, + ), + patch.object( + agent, "_update_fingerprint_on_success", new_callable=AsyncMock + ) as mock_fp_update, + ): mock_analyst_inst = MagicMock() mock_analyst_inst.analyze.return_value = good_plan MockAnalyst.return_value = mock_analyst_inst @@ -507,30 +556,42 @@ async def mock_execute(_stmt): from phalanx.ci_fixer.version_parity import VersionParityResult - mock_parity = VersionParityResult(ok=True, local_version="ruff 0.4.0", failure_version="", reason="ok") + mock_parity = VersionParityResult( + ok=True, local_version="ruff 0.4.0", failure_version="", reason="ok" + ) def _validation_side_effect(*args, **kwargs): validation_calls["n"] += 1 return fail_validation if validation_calls["n"] == 1 else pass_validation - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), \ - patch("phalanx.agents.ci_fixer.parse_log", side_effect=[parsed, empty_retry]), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), \ - patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), \ - patch.object(agent, "_trace", new_callable=AsyncMock), \ - patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, \ - patch.object(agent, "_apply_patches", return_value=["src/foo.py"]), \ - patch("phalanx.agents.ci_fixer.validate_fix", side_effect=_validation_side_effect), \ - patch.object(agent, "_check_tool_version_parity", new_callable=AsyncMock, return_value=mock_parity), \ - patch.object(agent, "_get_fingerprint_success_count", new_callable=AsyncMock, return_value=0), \ - patch.object(agent, "_commit_to_safe_branch", new_callable=AsyncMock, - return_value={"sha": "abc", "push_failed": False}), \ - patch.object(agent, "_open_draft_pr", new_callable=AsyncMock, return_value=11), \ - patch.object(agent, "_comment_on_pr", new_callable=AsyncMock), \ - patch.object(agent, "_update_fingerprint_on_success", new_callable=AsyncMock): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="log"), + patch("phalanx.agents.ci_fixer.parse_log", side_effect=[parsed, empty_retry]), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), + patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), + patch.object(agent, "_trace", new_callable=AsyncMock), + patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, + patch.object(agent, "_apply_patches", return_value=["src/foo.py"]), + patch("phalanx.agents.ci_fixer.validate_fix", side_effect=_validation_side_effect), + patch.object( + agent, "_check_tool_version_parity", new_callable=AsyncMock, return_value=mock_parity + ), + patch.object( + agent, "_get_fingerprint_success_count", new_callable=AsyncMock, return_value=0 + ), + patch.object( + agent, + "_commit_to_safe_branch", + new_callable=AsyncMock, + return_value={"sha": "abc", "push_failed": False}, + ), + patch.object(agent, "_open_draft_pr", new_callable=AsyncMock, return_value=11), + patch.object(agent, "_comment_on_pr", new_callable=AsyncMock), + patch.object(agent, "_update_fingerprint_on_success", new_callable=AsyncMock), + ): mock_analyst_inst = MagicMock() mock_analyst_inst.analyze.return_value = good_plan MockAnalyst.return_value = mock_analyst_inst @@ -568,8 +629,9 @@ async def test_execute_cleans_workspace_on_exception(tmp_path): workspace = tmp_path / "ci-fixer" / "run-loop-001" workspace.mkdir(parents=True) - with patch.object(agent, "_execute_inner", new_callable=AsyncMock, - side_effect=RuntimeError("boom")): + with patch.object( + agent, "_execute_inner", new_callable=AsyncMock, side_effect=RuntimeError("boom") + ): result = await agent.execute() assert result.success is False diff --git a/tests/unit/test_ci_fixer_p2.py b/tests/unit/test_ci_fixer_p2.py index 262a864d..fe4ffcbd 100644 --- a/tests/unit/test_ci_fixer_p2.py +++ b/tests/unit/test_ci_fixer_p2.py @@ -11,21 +11,20 @@ from __future__ import annotations import json -from datetime import UTC, datetime -from pathlib import Path +from typing import TYPE_CHECKING from unittest.mock import AsyncMock, MagicMock, patch import pytest from phalanx.ci_fixer.analyst import ( - FilePatch, FileWindow, - FixPlan, RootCauseAnalyst, ) from phalanx.ci_fixer.log_parser import LintError, ParsedLog from phalanx.ci_fixer.outcome_tracker import _parse_iso +if TYPE_CHECKING: + from pathlib import Path # ── Helpers ──────────────────────────────────────────────────────────────────── @@ -53,20 +52,25 @@ def _write(tmp_path: Path, rel: str, lines: list[str]) -> Path: return full -def _patch_json(path: str, start: int, end: int, corrected: list[str], - confidence: str = "high") -> str: - return json.dumps({ - "confidence": confidence, - "root_cause": "test root cause", - "patches": [{ - "path": path, - "start_line": start, - "end_line": end, - "corrected_lines": corrected, - "reason": "test", - }], - "needs_new_test": False, - }) +def _patch_json( + path: str, start: int, end: int, corrected: list[str], confidence: str = "high" +) -> str: + return json.dumps( + { + "confidence": confidence, + "root_cause": "test root cause", + "patches": [ + { + "path": path, + "start_line": start, + "end_line": end, + "corrected_lines": corrected, + "reason": "test", + } + ], + "needs_new_test": False, + } + ) # ── RootCauseAnalyst history lookup ──────────────────────────────────────────── @@ -87,13 +91,15 @@ def llm(**_): llm_called["n"] += 1 return "{}" - cached_patches = [{ - "path": "src/foo.py", - "start_line": 1, - "end_line": len(self._FILE), - "corrected_lines": self._FILE[1:], - "reason": "history", - }] + cached_patches = [ + { + "path": "src/foo.py", + "start_line": 1, + "end_line": len(self._FILE), + "corrected_lines": self._FILE[1:], + "reason": "history", + } + ] analyst = RootCauseAnalyst( call_llm=llm, @@ -140,13 +146,15 @@ def llm(**_): return llm_response # Return patches for a file not in windows (will fail validation) - bad_cached = [{ - "path": "src/invented.py", - "start_line": 1, - "end_line": 5, - "corrected_lines": ["x\n"], - "reason": "bad", - }] + bad_cached = [ + { + "path": "src/invented.py", + "start_line": 1, + "end_line": 5, + "corrected_lines": ["x\n"], + "reason": "bad", + } + ] analyst = RootCauseAnalyst( call_llm=llm, @@ -226,7 +234,7 @@ def llm(**_): call_llm=llm, history_lookup=lambda fp: [], # empty → falsy ) - plan = analyst.analyze(_lint_log("src/foo.py"), tmp_path, fingerprint_hash="abc") + analyst.analyze(_lint_log("src/foo.py"), tmp_path, fingerprint_hash="abc") assert llm_called["n"] == 1 @@ -287,8 +295,13 @@ def test_returns_patches_when_history_exists(self): """Returns patch list when fingerprint found in DB.""" agent = self._make_agent() expected_patches = [ - {"path": "src/foo.py", "start_line": 1, "end_line": 3, - "corrected_lines": ["a\n"], "reason": "test"} + { + "path": "src/foo.py", + "start_line": 1, + "end_line": 3, + "corrected_lines": ["a\n"], + "reason": "test", + } ] with patch.object(agent, "_async_lookup_fix_history", new_callable=AsyncMock) as mock_async: diff --git a/tests/unit/test_ci_fixer_p3.py b/tests/unit/test_ci_fixer_p3.py index 93b15b6c..6cf52a2c 100644 --- a/tests/unit/test_ci_fixer_p3.py +++ b/tests/unit/test_ci_fixer_p3.py @@ -10,22 +10,17 @@ from __future__ import annotations -from datetime import UTC, datetime from unittest.mock import MagicMock, patch -import pytest - +from phalanx.ci_fixer.log_parser import LintError, ParsedLog, TestFailure from phalanx.ci_fixer.suppressor import ( - _FLAKY_THRESHOLD, _MIN_OBSERVATIONS, is_flaky_suppressed, record_flaky_pattern, should_use_history, ) -from phalanx.ci_fixer.log_parser import LintError, ParsedLog, TestFailure, TypeError from phalanx.db.models import CIFailureFingerprint, CIFlakyPattern - # ── Helpers ──────────────────────────────────────────────────────────────────── @@ -33,10 +28,7 @@ def _lint_log(*errors: tuple) -> ParsedLog: """errors: list of (file, code) tuples.""" return ParsedLog( tool="ruff", - lint_errors=[ - LintError(file=f, line=1, col=1, code=c, message="test") - for f, c in errors - ], + lint_errors=[LintError(file=f, line=1, col=1, code=c, message="test") for f, c in errors], ) @@ -61,7 +53,8 @@ def _make_flaky_pattern( def _make_fingerprint( success_count: int = 3, failure_count: int = 1, - last_good_patch_json: str | None = '[{"path":"src/foo.py","start_line":1,"end_line":1,"corrected_lines":["x\\n"],"reason":""}]', + last_good_patch_json: str + | None = '[{"path":"src/foo.py","start_line":1,"end_line":1,"corrected_lines":["x\\n"],"reason":""}]', hash_: str = "abc123def456abcd", ) -> CIFailureFingerprint: fp = MagicMock(spec=CIFailureFingerprint) @@ -125,30 +118,40 @@ def test_one_unknown_error_not_suppressed(self): def test_insufficient_observations_not_suppressed(self): """< MIN_OBSERVATIONS → not suppressed regardless of rate.""" parsed = _lint_log(("src/foo.py", "F401")) - patterns = [_make_flaky_pattern( - "src/foo.py", "F401", - flaky_count=2, total_count=_MIN_OBSERVATIONS - 1, - )] + patterns = [ + _make_flaky_pattern( + "src/foo.py", + "F401", + flaky_count=2, + total_count=_MIN_OBSERVATIONS - 1, + ) + ] assert not is_flaky_suppressed(parsed, patterns) def test_below_threshold_not_suppressed(self): """flaky_rate < FLAKY_THRESHOLD → not suppressed.""" parsed = _lint_log(("src/foo.py", "F401")) - patterns = [_make_flaky_pattern( - "src/foo.py", "F401", - flaky_count=1, total_count=10, # 10% flaky rate - )] + patterns = [ + _make_flaky_pattern( + "src/foo.py", + "F401", + flaky_count=1, + total_count=10, # 10% flaky rate + ) + ] assert not is_flaky_suppressed(parsed, patterns) def test_test_failures_not_suppressed(self): """Test failures never suppressed (too risky).""" parsed = ParsedLog( tool="pytest", - test_failures=[TestFailure( - test_id="tests/test_foo.py::test_bar", - file="tests/test_foo.py", - message="", - )], + test_failures=[ + TestFailure( + test_id="tests/test_foo.py::test_bar", + file="tests/test_foo.py", + message="", + ) + ], ) patterns = [_make_flaky_pattern("tests/test_foo.py", "F401")] assert not is_flaky_suppressed(parsed, patterns) @@ -278,6 +281,7 @@ def test_existing_pattern_has_only_last_seen_at(self): def test_commit_dedup_window_constant(): """The 5-minute dedup window constant is present and reasonable.""" from phalanx.api.routes.ci_webhooks import _COMMIT_DEDUP_WINDOW_MINUTES + assert 1 <= _COMMIT_DEDUP_WINDOW_MINUTES <= 60 @@ -289,6 +293,7 @@ class TestHistoryWeighting: def _make_agent(self): from phalanx.agents.ci_fixer import CIFixerAgent + with patch("phalanx.agents.base.BaseAgent.__init__", return_value=None): agent = CIFixerAgent.__new__(CIFixerAgent) agent.ci_fix_run_id = "test-run-001" @@ -297,7 +302,6 @@ def _make_agent(self): def test_unreliable_fingerprint_returns_none(self): """failure_count >= success_count → _lookup returns None.""" - import asyncio from unittest.mock import AsyncMock agent = self._make_agent() @@ -305,7 +309,6 @@ def test_unreliable_fingerprint_returns_none(self): fp = _make_fingerprint(success_count=1, failure_count=3) async def mock_lookup(fp_hash): - from phalanx.db.models import CIFailureFingerprint # Simulate DB returning a fingerprint with bad stats mock_result = MagicMock() mock_result.scalar_one_or_none.return_value = fp @@ -325,8 +328,15 @@ def test_reliable_fingerprint_returns_patches(self): from unittest.mock import AsyncMock agent = self._make_agent() - expected = [{"path": "src/foo.py", "start_line": 1, - "end_line": 1, "corrected_lines": ["x\n"], "reason": ""}] + expected = [ + { + "path": "src/foo.py", + "start_line": 1, + "end_line": 1, + "corrected_lines": ["x\n"], + "reason": "", + } + ] with patch.object(agent, "_async_lookup_fix_history", new_callable=AsyncMock) as m: m.return_value = expected diff --git a/tests/unit/test_ci_fixer_p4.py b/tests/unit/test_ci_fixer_p4.py index 0affb9fc..32173856 100644 --- a/tests/unit/test_ci_fixer_p4.py +++ b/tests/unit/test_ci_fixer_p4.py @@ -8,8 +8,6 @@ from __future__ import annotations -import pytest - from phalanx.ci_fixer.version_parity import ( VersionParityResult, check_version_parity, @@ -17,7 +15,6 @@ should_auto_merge, ) - # ── check_version_parity ─────────────────────────────────────────────────────── @@ -172,6 +169,7 @@ def test_mismatch_notice(self): def test_ci_integration_auto_merge_column_exists(): """Phase 4 columns exist on CIIntegration model.""" from phalanx.db.models import CIIntegration + # Verify the mapped columns exist by inspecting the class assert hasattr(CIIntegration, "auto_merge") assert hasattr(CIIntegration, "min_success_count") @@ -180,4 +178,5 @@ def test_ci_integration_auto_merge_column_exists(): def test_ci_fix_run_parity_column_exists(): """Phase 4 column exists on CIFixRun model.""" from phalanx.db.models import CIFixRun + assert hasattr(CIFixRun, "tool_version_parity_ok") diff --git a/tests/unit/test_ci_fixer_p5.py b/tests/unit/test_ci_fixer_p5.py index 3a4bb539..3bd4c531 100644 --- a/tests/unit/test_ci_fixer_p5.py +++ b/tests/unit/test_ci_fixer_p5.py @@ -9,8 +9,6 @@ from __future__ import annotations -import pytest - from phalanx.ci_fixer.pattern_promoter import ( MIN_GLOBAL_SUCCESS_COUNT, MIN_REPOS_FOR_PROMOTION, @@ -22,7 +20,6 @@ should_post_proactive_comment, ) - # ── is_promotion_eligible ────────────────────────────────────────────────────── @@ -115,9 +112,7 @@ def test_tool_name_in_comment(self): assert "ruff" in comment def test_info_findings_different_header(self): - findings = [ - ProactiveFinding("fp1", "ruff", "info pattern", "info", ["f.py"]) - ] + findings = [ProactiveFinding("fp1", "ruff", "info pattern", "info", ["f.py"])] comment = format_proactive_comment(findings, 42) assert "informational" in comment.lower() or "info" in comment.lower() @@ -144,15 +139,11 @@ def test_no_findings_false(self): assert not should_post_proactive_comment([]) def test_only_info_findings_false(self): - findings = [ - ProactiveFinding("fp1", "ruff", "info", "info", ["f.py"]) - ] + findings = [ProactiveFinding("fp1", "ruff", "info", "info", ["f.py"])] assert not should_post_proactive_comment(findings) def test_warning_finding_true(self): - findings = [ - ProactiveFinding("fp1", "ruff", "warning pattern", "warning", ["f.py"]) - ] + findings = [ProactiveFinding("fp1", "ruff", "warning pattern", "warning", ["f.py"])] assert should_post_proactive_comment(findings) def test_mixed_info_and_warning_true(self): @@ -168,6 +159,7 @@ def test_mixed_info_and_warning_true(self): def test_pattern_registry_columns(): from phalanx.db.models import CIPatternRegistry + assert hasattr(CIPatternRegistry, "fingerprint_hash") assert hasattr(CIPatternRegistry, "tool") assert hasattr(CIPatternRegistry, "repo_count") @@ -177,6 +169,7 @@ def test_pattern_registry_columns(): def test_proactive_scan_columns(): from phalanx.db.models import CIProactiveScan + assert hasattr(CIProactiveScan, "repo_full_name") assert hasattr(CIProactiveScan, "pr_number") assert hasattr(CIProactiveScan, "findings_json") diff --git a/tests/unit/test_ci_fixer_p5_async.py b/tests/unit/test_ci_fixer_p5_async.py index df9bc346..686170ca 100644 --- a/tests/unit/test_ci_fixer_p5_async.py +++ b/tests/unit/test_ci_fixer_p5_async.py @@ -8,7 +8,6 @@ from __future__ import annotations -import json from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -20,7 +19,6 @@ scan_pr_for_patterns, ) - # ── Helpers ──────────────────────────────────────────────────────────────────── @@ -87,8 +85,10 @@ async def test_scan_pr_no_python_files(): mock_db_ctx, _ = _mock_db(rows=[pattern]) - with patch("httpx.AsyncClient", return_value=mock_client), \ - patch("phalanx.ci_fixer.proactive_scanner.get_db", return_value=mock_db_ctx): + with ( + patch("httpx.AsyncClient", return_value=mock_client), + patch("phalanx.ci_fixer.proactive_scanner.get_db", return_value=mock_db_ctx), + ): findings = await scan_pr_for_patterns("acme/backend", 1, "abc", "token") assert findings == [] @@ -117,8 +117,10 @@ async def test_scan_pr_with_python_files_finds_patterns(): mock_db_ctx, _ = _mock_db(rows=[pattern]) - with patch("httpx.AsyncClient", return_value=mock_client), \ - patch("phalanx.ci_fixer.proactive_scanner.get_db", return_value=mock_db_ctx): + with ( + patch("httpx.AsyncClient", return_value=mock_client), + patch("phalanx.ci_fixer.proactive_scanner.get_db", return_value=mock_db_ctx), + ): findings = await scan_pr_for_patterns("acme/backend", 1, "abc", "token") assert len(findings) == 1 @@ -146,8 +148,10 @@ async def test_scan_pr_low_success_count_is_info(): mock_db_ctx, _ = _mock_db(rows=[pattern]) - with patch("httpx.AsyncClient", return_value=mock_client), \ - patch("phalanx.ci_fixer.proactive_scanner.get_db", return_value=mock_db_ctx): + with ( + patch("httpx.AsyncClient", return_value=mock_client), + patch("phalanx.ci_fixer.proactive_scanner.get_db", return_value=mock_db_ctx), + ): findings = await scan_pr_for_patterns("acme/backend", 1, "abc", "token") assert len(findings) == 1 @@ -248,6 +252,7 @@ async def test_record_scan_inserts_row(): mock_session.add.assert_called_once() from phalanx.db.models import CIProactiveScan + added = mock_session.add.call_args[0][0] assert isinstance(added, CIProactiveScan) assert added.pr_number == 42 @@ -269,7 +274,9 @@ async def test_promote_patterns_eligible_creates_registry_entry(): row.fingerprint_hash = "abc123def456abcd" row.tool = "ruff" row.sample_errors = "unused import" - row.last_good_patch_json = '[{"path":"src/foo.py","start_line":1,"end_line":1,"corrected_lines":["x\\n"],"reason":""}]' + row.last_good_patch_json = ( + '[{"path":"src/foo.py","start_line":1,"end_line":1,"corrected_lines":["x\\n"],"reason":""}]' + ) row.repo_count = 3 # >= MIN_REPOS_FOR_PROMOTION=2 row.total_successes = 5 @@ -303,6 +310,7 @@ async def mock_execute(stmt): # Should have added one entry to the registry mock_session.add.assert_called_once() from phalanx.db.models import CIPatternRegistry + added = mock_session.add.call_args[0][0] assert isinstance(added, CIPatternRegistry) assert added.fingerprint_hash == "abc123def456abcd" diff --git a/tests/unit/test_ci_fixer_reproducer.py b/tests/unit/test_ci_fixer_reproducer.py new file mode 100644 index 00000000..865f4e27 --- /dev/null +++ b/tests/unit/test_ci_fixer_reproducer.py @@ -0,0 +1,434 @@ +""" +Tests for phalanx.ci_fixer.reproducer — ReproducerAgent. + +Coverage targets: + - reproduce(): all 5 verdicts (skipped, confirmed, flaky, env_mismatch, timeout) + - reproduce(): skipped when sandbox unavailable (available=False) + - reproduce(): skipped when reproducer_cmd is empty + - _output_matches_failure(): tool name match, error code match, no match + - _run_subprocess(): timeout path (process killed) +""" + +from __future__ import annotations + +import asyncio +from typing import TYPE_CHECKING +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from phalanx.ci_fixer.context import ReproductionResult, StructuredFailure +from phalanx.ci_fixer.reproducer import ReproducerAgent, ReproductionAttempt +from phalanx.ci_fixer.sandbox import SandboxResult + +if TYPE_CHECKING: + from pathlib import Path + +# ── Fixtures ────────────────────────────────────────────────────────────────── + + +def _make_sandbox(available: bool = True, container_id: str = "") -> SandboxResult: + return SandboxResult( + sandbox_id="phalanx-sandbox-test1234", + stack="python", + image="python:3.12-slim", + workspace_path="/tmp/ws", + available=available, + container_id=container_id, + ) + + +def _make_sf( + tool: str = "ruff", + errors: list | None = None, +) -> StructuredFailure: + return StructuredFailure( + tool=tool, + failure_type="lint", + reproducer_cmd=f"{tool} check .", + errors=errors or [], + ) + + +def _make_proc( + returncode: int = 0, + stdout: bytes = b"", + stderr: bytes = b"", + timeout: bool = False, +) -> AsyncMock: + """Return a mock asyncio.Process suitable for create_subprocess_shell.""" + proc = MagicMock() + proc.returncode = returncode + proc.kill = MagicMock() + proc.wait = AsyncMock() + if timeout: + proc.communicate = AsyncMock(side_effect=TimeoutError()) + else: + proc.communicate = AsyncMock(return_value=(stdout, stderr)) + return proc + + +# ── reproduce() — verdict classification ────────────────────────────────────── + + +class TestReproduceVerdicts: + @pytest.mark.asyncio + async def test_reproduce_skipped_when_no_sandbox(self, tmp_path: Path): + """sandbox_result=None → verdict=skipped, no subprocess.""" + agent = ReproducerAgent() + result = await agent.reproduce( + reproducer_cmd="ruff check .", + workspace_path=tmp_path, + sandbox_result=None, + structured_failure=_make_sf(), + timeout_seconds=30, + ) + assert result.verdict == "skipped" + assert isinstance(result, ReproductionResult) + + @pytest.mark.asyncio + async def test_reproduce_skipped_when_sandbox_unavailable(self, tmp_path: Path): + """sandbox_result.available=False → verdict=skipped.""" + agent = ReproducerAgent() + result = await agent.reproduce( + reproducer_cmd="ruff check .", + workspace_path=tmp_path, + sandbox_result=_make_sandbox(available=False), + structured_failure=_make_sf(), + timeout_seconds=30, + ) + assert result.verdict == "skipped" + + @pytest.mark.asyncio + async def test_reproduce_skipped_when_empty_cmd(self, tmp_path: Path): + """Empty reproducer_cmd → verdict=skipped.""" + agent = ReproducerAgent() + result = await agent.reproduce( + reproducer_cmd="", + workspace_path=tmp_path, + sandbox_result=_make_sandbox(), + structured_failure=_make_sf(), + timeout_seconds=30, + ) + assert result.verdict == "skipped" + + @pytest.mark.asyncio + async def test_reproduce_skipped_when_whitespace_cmd(self, tmp_path: Path): + """Whitespace-only reproducer_cmd → verdict=skipped.""" + agent = ReproducerAgent() + result = await agent.reproduce( + reproducer_cmd=" ", + workspace_path=tmp_path, + sandbox_result=_make_sandbox(), + structured_failure=_make_sf(), + timeout_seconds=30, + ) + assert result.verdict == "skipped" + + @pytest.mark.asyncio + async def test_reproduce_flaky(self, tmp_path: Path): + """exit_code=0 → command passed → CI failure was transient → flaky.""" + proc = _make_proc(returncode=0, stdout=b"All checks passed", stderr=b"") + + with patch("asyncio.create_subprocess_shell", return_value=proc): + agent = ReproducerAgent() + result = await agent.reproduce( + reproducer_cmd="ruff check .", + workspace_path=tmp_path, + sandbox_result=_make_sandbox(), + structured_failure=_make_sf(), + timeout_seconds=30, + ) + + assert result.verdict == "flaky" + assert result.exit_code == 0 + + @pytest.mark.asyncio + async def test_reproduce_confirmed_by_tool_name(self, tmp_path: Path): + """exit_code!=0, tool name in output → confirmed.""" + proc = _make_proc( + returncode=1, + stdout=b"ruff check failed: F401 unused import", + stderr=b"", + ) + + with patch("asyncio.create_subprocess_shell", return_value=proc): + agent = ReproducerAgent() + result = await agent.reproduce( + reproducer_cmd="ruff check .", + workspace_path=tmp_path, + sandbox_result=_make_sandbox(), + structured_failure=_make_sf(tool="ruff"), + timeout_seconds=30, + ) + + assert result.verdict == "confirmed" + assert result.exit_code == 1 + + @pytest.mark.asyncio + async def test_reproduce_confirmed_by_error_code(self, tmp_path: Path): + """exit_code!=0, error code in output (no tool name) → confirmed.""" + proc = _make_proc( + returncode=1, + stdout=b"src/foo.py:1:1: F401 'os' imported but unused", + stderr=b"", + ) + sf = _make_sf(tool="ruff", errors=[{"file": "src/foo.py", "code": "F401"}]) + # Use a tool name that won't match the output to isolate error-code path + sf.tool = "linter" + + with patch("asyncio.create_subprocess_shell", return_value=proc): + agent = ReproducerAgent() + result = await agent.reproduce( + reproducer_cmd="linter check .", + workspace_path=tmp_path, + sandbox_result=_make_sandbox(), + structured_failure=sf, + timeout_seconds=30, + ) + + assert result.verdict == "confirmed" + + @pytest.mark.asyncio + async def test_reproduce_env_mismatch(self, tmp_path: Path): + """exit_code!=0 but output unrelated to original failure → env_mismatch.""" + proc = _make_proc( + returncode=1, + stdout=b"command not found: ruff", + stderr=b"bash: ruff: command not found", + ) + # Use a structured failure whose tool name won't appear in the "not found" output + sf = StructuredFailure( + tool="mypy", + failure_type="type_error", + reproducer_cmd="mypy .", + errors=[{"code": "E999"}], + ) + + with patch("asyncio.create_subprocess_shell", return_value=proc): + agent = ReproducerAgent() + result = await agent.reproduce( + reproducer_cmd="mypy .", + workspace_path=tmp_path, + sandbox_result=_make_sandbox(), + structured_failure=sf, + timeout_seconds=30, + ) + + assert result.verdict == "env_mismatch" + + @pytest.mark.asyncio + async def test_reproduce_timeout(self, tmp_path: Path): + """Process exceeds timeout → verdict=timeout, process killed.""" + proc = _make_proc(timeout=True) + + with patch("asyncio.create_subprocess_shell", return_value=proc): + agent = ReproducerAgent() + result = await agent.reproduce( + reproducer_cmd="ruff check .", + workspace_path=tmp_path, + sandbox_result=_make_sandbox(), + structured_failure=_make_sf(), + timeout_seconds=1, + ) + + assert result.verdict == "timeout" + proc.kill.assert_called_once() + proc.wait.assert_awaited_once() + + @pytest.mark.asyncio + async def test_reproduce_result_fields(self, tmp_path: Path): + """Result includes reproducer_cmd and truncated output.""" + long_output = b"F401 " * 1000 # > 4000 chars + proc = _make_proc(returncode=1, stdout=long_output, stderr=b"") + + with patch("asyncio.create_subprocess_shell", return_value=proc): + agent = ReproducerAgent() + result = await agent.reproduce( + reproducer_cmd="ruff check .", + workspace_path=tmp_path, + sandbox_result=_make_sandbox(), + structured_failure=_make_sf(tool="ruff"), + timeout_seconds=30, + ) + + assert result.reproducer_cmd == "ruff check ." + assert len(result.output) <= 4000 + + +# ── _output_matches_failure ─────────────────────────────────────────────────── + + +class TestOutputMatchesFailure: + def test_matches_by_tool_name(self): + agent = ReproducerAgent() + sf = _make_sf(tool="ruff") + assert agent._output_matches_failure("ruff check found 3 errors", sf) is True + + def test_matches_tool_name_case_insensitive(self): + agent = ReproducerAgent() + sf = _make_sf(tool="Ruff") + assert agent._output_matches_failure("RUFF check: error F401", sf) is True + + def test_matches_by_error_code(self): + agent = ReproducerAgent() + sf = StructuredFailure( + tool="nontool", # won't match output + failure_type="lint", + reproducer_cmd="check .", + errors=[{"code": "E501", "file": "foo.py"}], + ) + assert agent._output_matches_failure("line too long E501 at 120 chars", sf) is True + + def test_no_match_unrelated_output(self): + agent = ReproducerAgent() + sf = StructuredFailure( + tool="mypy", + failure_type="type_error", + reproducer_cmd="mypy .", + errors=[{"code": "E999"}], + ) + # Output has neither "mypy" nor "E999" + assert agent._output_matches_failure("pip install failed: network error", sf) is False + + def test_no_match_empty_output(self): + agent = ReproducerAgent() + sf = _make_sf(tool="ruff") + assert agent._output_matches_failure("", sf) is False + + def test_no_match_empty_errors_no_tool(self): + agent = ReproducerAgent() + sf = StructuredFailure( + tool="pytest", + failure_type="test_regression", + reproducer_cmd="pytest .", + errors=[], + ) + # Output has no "pytest" in it + assert agent._output_matches_failure("FAILED test_foo.py::test_bar", sf) is False + + def test_matches_with_no_code_in_error_dict(self): + """Errors with no 'code' key should not raise.""" + agent = ReproducerAgent() + sf = StructuredFailure( + tool="ruff", + failure_type="lint", + reproducer_cmd="ruff check .", + errors=[{"file": "foo.py", "line": 1}], # no 'code' key + ) + # Tool name match should still work + assert agent._output_matches_failure("ruff: 1 error found", sf) is True + + +# ── Container exec path ─────────────────────────────────────────────────────── + + +class TestReproducerContainerExec: + """Tests for the docker exec path when sandbox_result.container_id is set.""" + + def _make_sandbox_with_container(self, container_id: str = "ctr-abc123") -> object: + return _make_sandbox(available=True, container_id=container_id) + + @pytest.mark.asyncio + async def test_reproduce_uses_docker_exec_when_container_id_set(self, tmp_path): + """When container_id is set, command is wrapped with docker exec.""" + proc = _make_proc(returncode=1, stdout=b"ruff: 1 error", stderr=b"") + + captured_args = [] + + async def fake_exec(*args, **kwargs): + captured_args.extend(args) + return proc + + with patch("asyncio.create_subprocess_exec", side_effect=fake_exec): + result = await ReproducerAgent().reproduce( + reproducer_cmd="ruff check .", + workspace_path=tmp_path, + sandbox_result=self._make_sandbox_with_container("ctr-abc123"), + structured_failure=_make_sf(tool="ruff"), + timeout_seconds=30, + ) + + assert result.verdict == "confirmed" + assert "docker" in captured_args + assert "ctr-abc123" in captured_args + + @pytest.mark.asyncio + async def test_reproduce_local_subprocess_when_no_container_id(self, tmp_path): + """When container_id is empty, uses local subprocess shell.""" + proc = _make_proc(returncode=0, stdout=b"clean", stderr=b"") + + with patch("asyncio.create_subprocess_shell", return_value=proc): + result = await ReproducerAgent().reproduce( + reproducer_cmd="ruff check .", + workspace_path=tmp_path, + sandbox_result=_make_sandbox(available=True), # no container_id + structured_failure=_make_sf(tool="ruff"), + timeout_seconds=30, + ) + + assert result.verdict == "flaky" + + @pytest.mark.asyncio + async def test_run_subprocess_with_container_id(self, tmp_path): + """_run_subprocess with container_id uses create_subprocess_exec.""" + proc = _make_proc(returncode=0, stdout=b"ok", stderr=b"") + + captured = [] + + async def fake_exec(*args, **kwargs): + captured.extend(args) + return proc + + with patch("asyncio.create_subprocess_exec", side_effect=fake_exec): + step = await ReproducerAgent()._run_subprocess( + cmd="ruff check .", + cwd=tmp_path, + timeout_seconds=30, + container_id="ctr-xyz", + ) + + assert step.exit_code == 0 + assert "ctr-xyz" in captured + assert "sh" in captured + + @pytest.mark.asyncio + async def test_run_subprocess_without_container_id(self, tmp_path): + """_run_subprocess without container_id uses create_subprocess_shell.""" + proc = _make_proc(returncode=0, stdout=b"clean", stderr=b"") + + with patch("asyncio.create_subprocess_shell", return_value=proc): + step = await ReproducerAgent()._run_subprocess( + cmd="ruff check .", + cwd=tmp_path, + timeout_seconds=30, + container_id="", + ) + + assert step.exit_code == 0 + + +# ── ReproductionAttempt dataclass ───────────────────────────────────────────── + + +class TestReproductionAttempt: + def test_defaults(self): + a = ReproductionAttempt( + cmd="ruff check .", + exit_code=1, + stdout="out", + stderr="err", + elapsed_seconds=0.5, + ) + assert a.timed_out is False + + def test_timed_out_flag(self): + a = ReproductionAttempt( + cmd="ruff check .", + exit_code=-1, + stdout="", + stderr="", + elapsed_seconds=30.0, + timed_out=True, + ) + assert a.timed_out is True diff --git a/tests/unit/test_ci_fixer_sandbox.py b/tests/unit/test_ci_fixer_sandbox.py new file mode 100644 index 00000000..a6aa84d7 --- /dev/null +++ b/tests/unit/test_ci_fixer_sandbox.py @@ -0,0 +1,350 @@ +""" +Tests for phalanx.ci_fixer.sandbox — SandboxProvisioner + SandboxResult. + +Coverage targets: + - detect_stack: all 5 stacks (python/node/go/rust/unknown) + priority order + - provision: happy path with pool checkout, disabled, unique IDs, stack_hint + - provision: pool checkout timeout → available=False fallback + - provision: Docker error → available=False fallback + - release: container_id empty (no-op), container_id set → pool.checkin + - SandboxResult: field defaults including new container_id + mount_path +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from phalanx.ci_fixer.sandbox import SandboxProvisioner, SandboxResult +from phalanx.ci_fixer.sandbox_pool import SandboxUnavailableError + +if TYPE_CHECKING: + from pathlib import Path + + +def _mock_pool(container_id: str = "ctr-abc123") -> MagicMock: + """Return a mock SandboxPool that returns a container on checkout.""" + from phalanx.ci_fixer.sandbox_pool import PooledContainer + + pool = MagicMock() + container = PooledContainer( + container_id=container_id, + stack="python", + image="phalanx-sandbox-python:latest", + ) + pool.checkout = AsyncMock(return_value=container) + pool.checkin = AsyncMock() + return pool + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + + +def _make_workspace(tmp_path: Path, *filenames: str) -> Path: + """Create a temp directory with the given marker files.""" + for name in filenames: + (tmp_path / name).touch() + return tmp_path + + +# ── detect_stack ────────────────────────────────────────────────────────────── + + +class TestDetectStack: + def test_detect_stack_python_pyproject(self, tmp_path: Path): + ws = _make_workspace(tmp_path, "pyproject.toml") + assert SandboxProvisioner().detect_stack(ws) == "python" + + def test_detect_stack_python_requirements(self, tmp_path: Path): + """requirements.txt alone should also detect python.""" + ws = _make_workspace(tmp_path, "requirements.txt") + assert SandboxProvisioner().detect_stack(ws) == "python" + + def test_detect_stack_python_setup_py(self, tmp_path: Path): + ws = _make_workspace(tmp_path, "setup.py") + assert SandboxProvisioner().detect_stack(ws) == "python" + + def test_detect_stack_node(self, tmp_path: Path): + ws = _make_workspace(tmp_path, "package.json") + assert SandboxProvisioner().detect_stack(ws) == "node" + + def test_detect_stack_go(self, tmp_path: Path): + ws = _make_workspace(tmp_path, "go.mod") + assert SandboxProvisioner().detect_stack(ws) == "go" + + def test_detect_stack_rust(self, tmp_path: Path): + ws = _make_workspace(tmp_path, "Cargo.toml") + assert SandboxProvisioner().detect_stack(ws) == "rust" + + def test_detect_stack_unknown(self, tmp_path: Path): + """Empty workspace has no markers → unknown.""" + assert SandboxProvisioner().detect_stack(tmp_path) == "unknown" + + def test_detect_stack_python_wins_over_node(self, tmp_path: Path): + """Python is checked first — monorepo with both pyproject + package.json resolves to python.""" + ws = _make_workspace(tmp_path, "pyproject.toml", "package.json") + assert SandboxProvisioner().detect_stack(ws) == "python" + + def test_detect_stack_nonexistent_path(self, tmp_path: Path): + """Path that doesn't exist returns unknown without raising.""" + missing = tmp_path / "nonexistent" + result = SandboxProvisioner().detect_stack(missing) + assert result == "unknown" + + +# ── SandboxProvisioner.provision ────────────────────────────────────────────── + + +class TestSandboxProvision: + def _mock_settings(self, enabled: bool = True) -> MagicMock: + s = MagicMock() + s.sandbox_enabled = enabled + s.sandbox_checkout_timeout_seconds = 30 + return s + + @pytest.mark.asyncio + async def test_provision_returns_sandbox_result_with_container_id(self, tmp_path: Path): + """Happy path: pool checkout succeeds → SandboxResult has container_id set.""" + ws = _make_workspace(tmp_path, "pyproject.toml") + pool = _mock_pool(container_id="ctr-abc123") + + with patch("phalanx.ci_fixer.sandbox.settings", self._mock_settings()): + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + provisioner = SandboxProvisioner() + with patch.object(provisioner, "_bind_workspace", new_callable=AsyncMock): + result = await provisioner.provision(ws) + + assert result is not None + assert result.stack == "python" + assert result.image == "python:3.12-slim" + assert result.workspace_path == str(ws) + assert result.sandbox_id.startswith("phalanx-sandbox-") + assert result.container_id == "ctr-abc123" + assert result.available is True + + @pytest.mark.asyncio + async def test_provision_disabled_returns_none(self, tmp_path: Path): + """sandbox_enabled=False → provision returns None immediately.""" + with patch("phalanx.ci_fixer.sandbox.settings", self._mock_settings(enabled=False)): + result = await SandboxProvisioner().provision(tmp_path) + + assert result is None + + @pytest.mark.asyncio + async def test_provision_generates_unique_ids(self, tmp_path: Path): + """Each provision call generates a different sandbox_id.""" + pool = _mock_pool() + + with patch("phalanx.ci_fixer.sandbox.settings", self._mock_settings()): + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + p = SandboxProvisioner() + with patch.object(p, "_bind_workspace", new_callable=AsyncMock): + r1 = await p.provision(tmp_path) + r2 = await p.provision(tmp_path) + + assert r1 is not None and r2 is not None + assert r1.sandbox_id != r2.sandbox_id + + @pytest.mark.asyncio + async def test_provision_pool_timeout_returns_available_false(self, tmp_path: Path): + """Pool checkout times out → SandboxResult with available=False, no exception.""" + pool = MagicMock() + pool.checkout = AsyncMock(side_effect=SandboxUnavailableError("timeout")) + + with patch("phalanx.ci_fixer.sandbox.settings", self._mock_settings()): + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + result = await SandboxProvisioner().provision(tmp_path) + + assert result is not None + assert result.available is False + assert result.container_id == "" + + @pytest.mark.asyncio + async def test_provision_docker_error_returns_available_false(self, tmp_path: Path): + """Any unexpected exception → SandboxResult with available=False.""" + pool = MagicMock() + pool.checkout = AsyncMock(side_effect=RuntimeError("docker daemon not found")) + + with patch("phalanx.ci_fixer.sandbox.settings", self._mock_settings()): + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + result = await SandboxProvisioner().provision(tmp_path) + + assert result is not None + assert result.available is False + + @pytest.mark.asyncio + async def test_provision_stack_hint_overrides_detection(self, tmp_path: Path): + """stack_hint bypasses file-existence detection.""" + pool = _mock_pool() + + with patch("phalanx.ci_fixer.sandbox.settings", self._mock_settings()): + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + provisioner = SandboxProvisioner() + with patch.object(provisioner, "_bind_workspace", new_callable=AsyncMock): + result = await provisioner.provision(tmp_path, stack_hint="node") + + assert result is not None + assert result.stack == "node" + assert result.image == "node:20-slim" + + @pytest.mark.asyncio + async def test_provision_unknown_stack_uses_ubuntu(self, tmp_path: Path): + """Empty workspace → unknown stack → ubuntu:22.04 image.""" + pool = _mock_pool() + + with patch("phalanx.ci_fixer.sandbox.settings", self._mock_settings()): + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + provisioner = SandboxProvisioner() + with patch.object(provisioner, "_bind_workspace", new_callable=AsyncMock): + result = await provisioner.provision(tmp_path) + + assert result is not None + assert result.stack == "unknown" + assert result.image == "ubuntu:22.04" + + @pytest.mark.asyncio + async def test_provision_go_workspace(self, tmp_path: Path): + ws = _make_workspace(tmp_path, "go.mod") + pool = _mock_pool() + + with patch("phalanx.ci_fixer.sandbox.settings", self._mock_settings()): + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + provisioner = SandboxProvisioner() + with patch.object(provisioner, "_bind_workspace", new_callable=AsyncMock): + result = await provisioner.provision(ws) + + assert result is not None + assert result.stack == "go" + assert result.image == "golang:1.22-alpine" + + @pytest.mark.asyncio + async def test_provision_rust_workspace(self, tmp_path: Path): + ws = _make_workspace(tmp_path, "Cargo.toml") + pool = _mock_pool() + + with patch("phalanx.ci_fixer.sandbox.settings", self._mock_settings()): + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + provisioner = SandboxProvisioner() + with patch.object(provisioner, "_bind_workspace", new_callable=AsyncMock): + result = await provisioner.provision(ws) + + assert result is not None + assert result.stack == "rust" + assert result.image == "rust:1.77-slim" + + +class TestSandboxProvisionerRelease: + @pytest.mark.asyncio + async def test_release_no_op_when_no_container_id(self, tmp_path: Path): + """release() with empty container_id is a no-op — no pool call.""" + result = SandboxResult( + sandbox_id="phalanx-sandbox-abc12345", + stack="python", + image="python:3.12-slim", + workspace_path=str(tmp_path), + container_id="", + ) + pool = MagicMock() + pool.checkin = AsyncMock() + + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + await SandboxProvisioner().release(result) + + pool.checkin.assert_not_called() + + @pytest.mark.asyncio + async def test_release_calls_pool_checkin(self, tmp_path: Path): + """release() with container_id → pool.checkin called.""" + result = SandboxResult( + sandbox_id="phalanx-sandbox-abc12345", + stack="python", + image="phalanx-sandbox-python:latest", + workspace_path=str(tmp_path), + container_id="ctr-abc123", + ) + pool = MagicMock() + pool.checkin = AsyncMock() + + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + await SandboxProvisioner().release(result) + + pool.checkin.assert_awaited_once() + + @pytest.mark.asyncio + async def test_release_swallows_pool_error(self, tmp_path: Path): + """pool.checkin raises → release() swallows the error.""" + result = SandboxResult( + sandbox_id="phalanx-sandbox-abc12345", + stack="python", + image="img", + workspace_path=str(tmp_path), + container_id="ctr-abc123", + ) + pool = MagicMock() + pool.checkin = AsyncMock(side_effect=RuntimeError("pool gone")) + + with patch("phalanx.ci_fixer.sandbox.get_sandbox_pool", AsyncMock(return_value=pool)): + await SandboxProvisioner().release(result) # must not raise + + +# ── SandboxResult dataclass ─────────────────────────────────────────────────── + + +class TestSandboxResult: + def test_sandbox_result_extra_defaults_empty(self): + r = SandboxResult( + sandbox_id="phalanx-sandbox-abc12345", + stack="python", + image="python:3.12-slim", + workspace_path="/tmp/ws", + ) + assert r.extra == {} + + def test_sandbox_result_available_default(self): + r = SandboxResult( + sandbox_id="phalanx-sandbox-abc12345", + stack="python", + image="python:3.12-slim", + workspace_path="/tmp/ws", + ) + assert r.available is True + + def test_sandbox_result_available_can_be_false(self): + r = SandboxResult( + sandbox_id="phalanx-sandbox-abc12345", + stack="python", + image="python:3.12-slim", + workspace_path="/tmp/ws", + available=False, + ) + assert r.available is False + + def test_sandbox_result_container_id_default_empty(self): + r = SandboxResult( + sandbox_id="phalanx-sandbox-abc12345", + stack="python", + image="python:3.12-slim", + workspace_path="/tmp/ws", + ) + assert r.container_id == "" + + def test_sandbox_result_mount_path_default(self): + r = SandboxResult( + sandbox_id="phalanx-sandbox-abc12345", + stack="python", + image="python:3.12-slim", + workspace_path="/tmp/ws", + ) + assert r.mount_path == "/workspace" + + def test_sandbox_result_container_id_set(self): + r = SandboxResult( + sandbox_id="phalanx-sandbox-abc12345", + stack="python", + image="python:3.12-slim", + workspace_path="/tmp/ws", + container_id="abc123def456", + ) + assert r.container_id == "abc123def456" diff --git a/tests/unit/test_ci_fixer_success_path.py b/tests/unit/test_ci_fixer_success_path.py index b9998625..9c495a12 100644 --- a/tests/unit/test_ci_fixer_success_path.py +++ b/tests/unit/test_ci_fixer_success_path.py @@ -17,14 +17,12 @@ from __future__ import annotations import json -from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest from phalanx.agents.ci_fixer import CIFixerAgent - # ── helpers ──────────────────────────────────────────────────────────────────── @@ -112,13 +110,17 @@ async def mock_execute(_stmt): mock_flaky = MagicMock() - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="some log"), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed_with_errors), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[mock_flaky]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=True), \ - patch.object(agent, "_mark_failed", new_callable=AsyncMock) as mock_mark: + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="some log"), + patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed_with_errors), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object( + agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[mock_flaky] + ), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=True), + patch.object(agent, "_mark_failed", new_callable=AsyncMock) as mock_mark, + ): result = await agent._execute_inner() assert result.success is False @@ -165,16 +167,18 @@ async def mock_execute(_stmt): ) low_conf_plan = FixPlan(confidence="low", root_cause="can't fix this") - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="some log"), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed_with_errors), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), \ - patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), \ - patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, \ - patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock) as mock_mark, \ - patch("phalanx.ci_fixer.analyst.FixPlan"): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="some log"), + patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed_with_errors), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), + patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=True), + patch("phalanx.agents.ci_fixer.RootCauseAnalyst") as MockAnalyst, + patch.object(agent, "_mark_failed_with_fields", new_callable=AsyncMock), + patch("phalanx.ci_fixer.analyst.FixPlan"), + ): mock_analyst_inst = MagicMock() mock_analyst_inst.analyze.return_value = low_conf_plan MockAnalyst.return_value = mock_analyst_inst @@ -221,14 +225,16 @@ async def mock_execute(_stmt): lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="unused")], ) - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="some log"), \ - patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed_with_errors), \ - patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), \ - patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), \ - patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=False), \ - patch.object(agent, "_mark_failed", new_callable=AsyncMock) as mock_mark: + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch.object(agent, "_fetch_logs", new_callable=AsyncMock, return_value="some log"), + patch("phalanx.agents.ci_fixer.parse_log", return_value=parsed_with_errors), + patch.object(agent, "_persist_fingerprint", new_callable=AsyncMock), + patch.object(agent, "_load_flaky_patterns", new_callable=AsyncMock, return_value=[]), + patch("phalanx.agents.ci_fixer.is_flaky_suppressed", return_value=False), + patch.object(agent, "_clone_repo", new_callable=AsyncMock, return_value=False), + patch.object(agent, "_mark_failed", new_callable=AsyncMock) as mock_mark, + ): result = await agent._execute_inner() assert result.success is False @@ -350,8 +356,10 @@ async def test_open_draft_pr_auto_merge_calls_enable(): mock_client.__aexit__ = AsyncMock(return_value=None) mock_client.post = AsyncMock(return_value=mock_resp) - with patch("httpx.AsyncClient", return_value=mock_client), \ - patch.object(agent, "_enable_github_auto_merge", new_callable=AsyncMock) as mock_auto: + with ( + patch("httpx.AsyncClient", return_value=mock_client), + patch.object(agent, "_enable_github_auto_merge", new_callable=AsyncMock) as mock_auto, + ): result = await agent._open_draft_pr( integration=integration, ci_run=ci_run, @@ -411,8 +419,6 @@ async def test_enable_auto_merge_gql_error(): gql_resp.json.return_value = {"errors": [{"message": "auto-merge not enabled"}]} gql_resp.text = '{"errors": [...]}' - call_count = {"n": 0} - async def side_effect_client(): pass @@ -645,8 +651,10 @@ async def test_async_lookup_history_unreliable_returns_none(): mock_ctx.__aenter__ = AsyncMock(return_value=mock_session) mock_ctx.__aexit__ = AsyncMock(return_value=None) - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch("phalanx.agents.ci_fixer.should_use_history", return_value=False): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch("phalanx.agents.ci_fixer.should_use_history", return_value=False), + ): result = await agent._async_lookup_fix_history("fp_hash_abc") assert result is None @@ -670,8 +678,10 @@ async def test_async_lookup_history_corrupt_json(): mock_ctx.__aenter__ = AsyncMock(return_value=mock_session) mock_ctx.__aexit__ = AsyncMock(return_value=None) - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch("phalanx.agents.ci_fixer.should_use_history", return_value=True): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch("phalanx.agents.ci_fixer.should_use_history", return_value=True), + ): result = await agent._async_lookup_fix_history("fp_hash_abc") assert result is None @@ -682,7 +692,15 @@ async def test_async_lookup_history_hit(): """Valid history → returns patch list.""" agent = _make_agent() - patches = [{"path": "src/foo.py", "start_line": 1, "end_line": 2, "corrected_lines": ["x\n"], "reason": ""}] + patches = [ + { + "path": "src/foo.py", + "start_line": 1, + "end_line": 2, + "corrected_lines": ["x\n"], + "reason": "", + } + ] mock_fp = MagicMock() mock_fp.success_count = 5 mock_fp.failure_count = 1 @@ -696,8 +714,10 @@ async def test_async_lookup_history_hit(): mock_ctx.__aenter__ = AsyncMock(return_value=mock_session) mock_ctx.__aexit__ = AsyncMock(return_value=None) - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch("phalanx.agents.ci_fixer.should_use_history", return_value=True): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch("phalanx.agents.ci_fixer.should_use_history", return_value=True), + ): result = await agent._async_lookup_fix_history("fp_hash_abc") assert result is not None @@ -727,7 +747,11 @@ async def test_update_fingerprint_run_missing(): with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx): await agent._update_fingerprint_on_success( fingerprint_hash="fp_abc", - patches=[FilePatch(path="src/foo.py", start_line=1, end_line=2, corrected_lines=["x\n"], reason="")], + patches=[ + FilePatch( + path="src/foo.py", start_line=1, end_line=2, corrected_lines=["x\n"], reason="" + ) + ], tool_version="ruff 0.4.0", parsed_log=ParsedLog(tool="ruff"), ) @@ -745,7 +769,11 @@ async def test_update_fingerprint_exception_logged(): await agent._update_fingerprint_on_success( fingerprint_hash="fp_abc", - patches=[FilePatch(path="src/foo.py", start_line=1, end_line=2, corrected_lines=["x\n"], reason="")], + patches=[ + FilePatch( + path="src/foo.py", start_line=1, end_line=2, corrected_lines=["x\n"], reason="" + ) + ], tool_version="ruff 0.4.0", parsed_log=ParsedLog(tool="ruff"), ) @@ -790,7 +818,11 @@ async def mock_execute(_stmt): with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx): await agent._update_fingerprint_on_success( fingerprint_hash="fp_abc", - patches=[FilePatch(path="src/foo.py", start_line=1, end_line=2, corrected_lines=["x\n"], reason="")], + patches=[ + FilePatch( + path="src/foo.py", start_line=1, end_line=2, corrected_lines=["x\n"], reason="" + ) + ], tool_version="ruff 0.4.0", parsed_log=ParsedLog(tool="ruff"), ) @@ -839,9 +871,11 @@ async def test_commit_to_safe_branch_no_changes(tmp_path): mock_repo.untracked_files = [] mock_repo.remotes = [] - with patch("phalanx.agents.ci_fixer.CIFixerAgent._commit_to_safe_branch", - new_callable=AsyncMock, - return_value={"sha": None, "message": "no_changes"}): + with patch( + "phalanx.agents.ci_fixer.CIFixerAgent._commit_to_safe_branch", + new_callable=AsyncMock, + return_value={"sha": None, "message": "no_changes"}, + ): result = await agent._commit_to_safe_branch( workspace=tmp_path, source_branch="main", diff --git a/tests/unit/test_ci_fixer_unit.py b/tests/unit/test_ci_fixer_unit.py index 576e0b16..b002d786 100644 --- a/tests/unit/test_ci_fixer_unit.py +++ b/tests/unit/test_ci_fixer_unit.py @@ -5,8 +5,6 @@ from __future__ import annotations -import pytest - from phalanx.ci_fixer.classifier import classify_failure, extract_failing_files from phalanx.ci_fixer.events import CIFailureEvent from phalanx.ci_fixer.log_fetcher import _extract_failure_section, _truncate @@ -322,8 +320,6 @@ def test_agent_role_is_ci_fixer(self): # ── RootCauseAnalyst — JSON parsing (mocked _call_llm) ──────────────────────── -from unittest.mock import patch # noqa: E402 - class TestRootCauseAnalyst: """Tests for the RootCauseAnalyst LLM confirmation step (windowed API).""" @@ -403,15 +399,17 @@ def test_patch_delta_stored(self, tmp_path): self._write_file(tmp_path, "src/foo.py") analyst = self._make_analyst(self._patch_response("src/foo.py")) plan = analyst.analyze(self._make_parsed_log(), tmp_path) - assert plan.patches[0].delta == -1 # removed 1 line (import os) + assert plan.patches[0].delta == -1 # removed 1 line (import os) # ── Low confidence / no patches ─────────────────────────────────────────── def test_low_confidence_returns_empty_patches(self, tmp_path): self._write_file(tmp_path, "src/foo.py") import json as _j - response = _j.dumps({"confidence": "low", "root_cause": "unclear", - "patches": [], "needs_new_test": False}) + + response = _j.dumps( + {"confidence": "low", "root_cause": "unclear", "patches": [], "needs_new_test": False} + ) analyst = self._make_analyst(response) plan = analyst.analyze(self._make_parsed_log(), tmp_path) assert plan.confidence == "low" @@ -437,17 +435,23 @@ def test_patch_for_unknown_file_rejected(self, tmp_path): """LLM returns a patch for a file we never sent → rejected → no actionable patches.""" self._write_file(tmp_path, "src/foo.py") import json as _j - response = _j.dumps({ - "confidence": "high", - "root_cause": "x", - "patches": [{ - "path": "src/invented_file.py", - "start_line": 1, "end_line": 3, - "corrected_lines": ["x = 1\n"], - "reason": "invented", - }], - "needs_new_test": False, - }) + + response = _j.dumps( + { + "confidence": "high", + "root_cause": "x", + "patches": [ + { + "path": "src/invented_file.py", + "start_line": 1, + "end_line": 3, + "corrected_lines": ["x = 1\n"], + "reason": "invented", + } + ], + "needs_new_test": False, + } + ) analyst = self._make_analyst(response) plan = analyst.analyze(self._make_parsed_log(), tmp_path) # All patches rejected → downgraded to low @@ -458,17 +462,23 @@ def test_patch_for_test_file_rejected(self, tmp_path): """Patches targeting test files are always rejected.""" self._write_file(tmp_path, "tests/test_foo.py") import json as _j - response = _j.dumps({ - "confidence": "high", - "root_cause": "x", - "patches": [{ - "path": "tests/test_foo.py", - "start_line": 1, "end_line": 3, - "corrected_lines": ["x = 1\n"], - "reason": "bad", - }], - "needs_new_test": False, - }) + + response = _j.dumps( + { + "confidence": "high", + "root_cause": "x", + "patches": [ + { + "path": "tests/test_foo.py", + "start_line": 1, + "end_line": 3, + "corrected_lines": ["x = 1\n"], + "reason": "bad", + } + ], + "needs_new_test": False, + } + ) parsed = self._make_parsed_log(file="tests/test_foo.py") analyst = self._make_analyst(response) plan = analyst.analyze(parsed, tmp_path) @@ -478,19 +488,25 @@ def test_patch_delta_too_large_rejected(self, tmp_path): """corrected_lines that differ by > MAX_LINE_DELTA from the window → rejected.""" self._write_file(tmp_path, "src/foo.py") import json as _j + # Window is 5 lines; returning 50 lines → delta = 45 → rejected big_lines = [f"line {i}\n" for i in range(50)] - response = _j.dumps({ - "confidence": "high", - "root_cause": "x", - "patches": [{ - "path": "src/foo.py", - "start_line": 1, "end_line": len(self._FILE_LINES), - "corrected_lines": big_lines, - "reason": "too big", - }], - "needs_new_test": False, - }) + response = _j.dumps( + { + "confidence": "high", + "root_cause": "x", + "patches": [ + { + "path": "src/foo.py", + "start_line": 1, + "end_line": len(self._FILE_LINES), + "corrected_lines": big_lines, + "reason": "too big", + } + ], + "needs_new_test": False, + } + ) analyst = self._make_analyst(response) plan = analyst.analyze(self._make_parsed_log(), tmp_path) assert len(plan.patches) == 0 diff --git a/tests/unit/test_ci_fixer_verifier.py b/tests/unit/test_ci_fixer_verifier.py new file mode 100644 index 00000000..df7920e6 --- /dev/null +++ b/tests/unit/test_ci_fixer_verifier.py @@ -0,0 +1,453 @@ +""" +Tests for phalanx.ci_fixer.verifier — VerifierAgent. + +Coverage targets: + - verify(): all 4 verdicts (passed, failed, skipped, timeout) + - verify(): unknown stack → skipped (no profile) + - verify(): python with pytest infrastructure → prepends pytest step + - verify(): python without pytest → ruff only + - verify(): first failing step short-circuits remaining steps + - verify(): all steps timeout → verdict=timeout + - _get_profile(): known and unknown stacks + - _has_pytest(): detects pyproject.toml, pytest.ini, setup.cfg, absent + - _run_cmd(): FileNotFoundError → VerificationStep with tool-not-found output + - VerificationStep dataclass defaults +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from phalanx.ci_fixer.context import VerificationResult +from phalanx.ci_fixer.sandbox import SandboxResult +from phalanx.ci_fixer.verifier import VerificationStep, VerifierAgent + +if TYPE_CHECKING: + from pathlib import Path + +# ── Helpers ─────────────────────────────────────────────────────────────────── + + +def _make_proc( + returncode: int = 0, + stdout: bytes = b"", + stderr: bytes = b"", + timeout: bool = False, + not_found: bool = False, +) -> MagicMock: + """Return a mock asyncio.Process.""" + proc = MagicMock() + proc.returncode = returncode + proc.kill = MagicMock() + proc.wait = AsyncMock() + if timeout: + proc.communicate = AsyncMock(side_effect=TimeoutError()) + elif not_found: + proc.communicate = AsyncMock(side_effect=FileNotFoundError()) + else: + proc.communicate = AsyncMock(return_value=(stdout, stderr)) + return proc + + +def _make_workspace(tmp_path: Path, *filenames: str) -> Path: + for name in filenames: + (tmp_path / name).touch() + return tmp_path + + +# ── _has_pytest ─────────────────────────────────────────────────────────────── + + +class TestHasPytest: + def test_detects_pyproject_toml(self, tmp_path: Path): + _make_workspace(tmp_path, "pyproject.toml") + assert VerifierAgent()._has_pytest(tmp_path) is True + + def test_detects_pytest_ini(self, tmp_path: Path): + _make_workspace(tmp_path, "pytest.ini") + assert VerifierAgent()._has_pytest(tmp_path) is True + + def test_detects_setup_cfg(self, tmp_path: Path): + _make_workspace(tmp_path, "setup.cfg") + assert VerifierAgent()._has_pytest(tmp_path) is True + + def test_absent(self, tmp_path: Path): + assert VerifierAgent()._has_pytest(tmp_path) is False + + +# ── _get_profile ────────────────────────────────────────────────────────────── + + +class TestGetProfile: + def test_python_profile(self): + profile = VerifierAgent()._get_profile("python") + assert len(profile) >= 1 + labels = [label for label, _ in profile] + assert "ruff_full" in labels + + def test_node_profile(self): + profile = VerifierAgent()._get_profile("node") + assert any("npm" in " ".join(cmd) for _, cmd in profile) + + def test_go_profile(self): + profile = VerifierAgent()._get_profile("go") + assert any("go" in cmd[0] for _, cmd in profile) + + def test_rust_profile(self): + profile = VerifierAgent()._get_profile("rust") + assert any("cargo" in cmd[0] for _, cmd in profile) + + def test_unknown_stack_empty_profile(self): + assert VerifierAgent()._get_profile("unknown") == [] + + +# ── verify() — core verdicts ────────────────────────────────────────────────── + + +class TestVerifyVerdicts: + @pytest.mark.asyncio + async def test_verify_skipped_unknown_stack(self, tmp_path: Path): + """Unknown stack → no profile → verdict=skipped immediately.""" + result = await VerifierAgent().verify( + workspace_path=tmp_path, + stack="unknown", + sandbox_result=None, + timeout_seconds=30, + ) + assert result.verdict == "skipped" + assert isinstance(result, VerificationResult) + + @pytest.mark.asyncio + async def test_verify_passed_python_no_pytest(self, tmp_path: Path): + """Python workspace without pytest infra → ruff only → exit 0 → passed.""" + proc = _make_proc(returncode=0, stdout=b"All checks passed", stderr=b"") + + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await VerifierAgent().verify( + workspace_path=tmp_path, + stack="python", + sandbox_result=None, + timeout_seconds=30, + ) + + assert result.verdict == "passed" + + @pytest.mark.asyncio + async def test_verify_passed_python_with_pytest(self, tmp_path: Path): + """Python workspace with pyproject.toml → pytest + ruff → both pass.""" + _make_workspace(tmp_path, "pyproject.toml") + proc = _make_proc(returncode=0, stdout=b"passed", stderr=b"") + + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await VerifierAgent().verify( + workspace_path=tmp_path, + stack="python", + sandbox_result=None, + timeout_seconds=30, + ) + + assert result.verdict == "passed" + + @pytest.mark.asyncio + async def test_verify_failed_on_first_step(self, tmp_path: Path): + """First step fails → verdict=failed, short-circuit.""" + proc = _make_proc(returncode=1, stdout=b"", stderr=b"FAILED test_foo.py") + + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await VerifierAgent().verify( + workspace_path=tmp_path, + stack="python", + sandbox_result=None, + timeout_seconds=30, + ) + + assert result.verdict == "failed" + assert "FAILED" in result.output + + @pytest.mark.asyncio + async def test_verify_timeout_single_step(self, tmp_path: Path): + """Single step times out → all_timed_out → verdict=timeout.""" + proc = _make_proc(timeout=True) + + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await VerifierAgent().verify( + workspace_path=tmp_path, + stack="go", + sandbox_result=None, + timeout_seconds=1, + ) + + assert result.verdict == "timeout" + + @pytest.mark.asyncio + async def test_verify_timeout_step_does_not_block_other_steps(self, tmp_path: Path): + """Timeout on one step is skipped; if remaining steps pass → passed.""" + _make_workspace(tmp_path, "pyproject.toml") + + call_count = 0 + + async def fake_exec(*args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + # First call (pytest) times out + return _make_proc(timeout=True) + # Subsequent calls (ruff) pass + return _make_proc(returncode=0, stdout=b"clean", stderr=b"") + + with patch("asyncio.create_subprocess_exec", side_effect=fake_exec): + result = await VerifierAgent().verify( + workspace_path=tmp_path, + stack="python", + sandbox_result=None, + timeout_seconds=1, + ) + + # ruff passed even though pytest timed out → overall passed + assert result.verdict == "passed" + + @pytest.mark.asyncio + async def test_verify_go_passed(self, tmp_path: Path): + proc = _make_proc(returncode=0, stdout=b"ok example.com/pkg", stderr=b"") + + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await VerifierAgent().verify( + workspace_path=tmp_path, + stack="go", + sandbox_result=None, + timeout_seconds=30, + ) + + assert result.verdict == "passed" + + @pytest.mark.asyncio + async def test_verify_rust_failed(self, tmp_path: Path): + proc = _make_proc(returncode=1, stdout=b"", stderr=b"error[E0308]: mismatched types") + + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await VerifierAgent().verify( + workspace_path=tmp_path, + stack="rust", + sandbox_result=None, + timeout_seconds=30, + ) + + assert result.verdict == "failed" + + @pytest.mark.asyncio + async def test_verify_cmd_run_populated(self, tmp_path: Path): + """cmd_run contains the command that was executed.""" + proc = _make_proc(returncode=0) + + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await VerifierAgent().verify( + workspace_path=tmp_path, + stack="go", + sandbox_result=None, + timeout_seconds=30, + ) + + assert result.cmd_run != "" + assert "go" in result.cmd_run + + +# ── _run_cmd ────────────────────────────────────────────────────────────────── + + +class TestRunCmd: + @pytest.mark.asyncio + async def test_run_cmd_tool_not_found(self, tmp_path: Path): + """FileNotFoundError → VerificationStep with tool-not-found message, no raise.""" + with patch( + "asyncio.create_subprocess_exec", + side_effect=FileNotFoundError("notool"), + ): + step = await VerifierAgent()._run_cmd( + label="test_label", + cmd_args=["notool", "--check"], + cwd=tmp_path, + timeout_seconds=30, + ) + + assert step.exit_code == -1 + assert "not found" in step.output + assert step.timed_out is False + + @pytest.mark.asyncio + async def test_run_cmd_success(self, tmp_path: Path): + proc = _make_proc(returncode=0, stdout=b"clean", stderr=b"") + + with patch("asyncio.create_subprocess_exec", return_value=proc): + step = await VerifierAgent()._run_cmd( + label="ruff_full", + cmd_args=["ruff", "check", "."], + cwd=tmp_path, + timeout_seconds=30, + ) + + assert step.exit_code == 0 + assert step.timed_out is False + assert "clean" in step.output + + @pytest.mark.asyncio + async def test_run_cmd_timeout(self, tmp_path: Path): + proc = _make_proc(timeout=True) + + with patch("asyncio.create_subprocess_exec", return_value=proc): + step = await VerifierAgent()._run_cmd( + label="slow_check", + cmd_args=["slow", "cmd"], + cwd=tmp_path, + timeout_seconds=1, + ) + + assert step.timed_out is True + assert step.exit_code == -1 + proc.kill.assert_called_once() + + +# ── Container exec path ─────────────────────────────────────────────────────── + + +def _make_sandbox_result(container_id: str = "") -> SandboxResult: + return SandboxResult( + sandbox_id="phalanx-sandbox-test1234", + stack="python", + image="python:3.12-slim", + workspace_path="/tmp/ws", + available=True, + container_id=container_id, + ) + + +class TestVerifierContainerExec: + @pytest.mark.asyncio + async def test_run_cmd_with_container_id_uses_docker_exec(self, tmp_path): + """When container_id is set, command is wrapped with docker exec.""" + proc = _make_proc(returncode=0, stdout=b"clean", stderr=b"") + + captured_args = [] + + async def fake_exec(*args, **kwargs): + captured_args.extend(args) + return proc + + with patch("asyncio.create_subprocess_exec", side_effect=fake_exec): + step = await VerifierAgent()._run_cmd( + label="ruff_full", + cmd_args=["ruff", "check", "."], + cwd=tmp_path, + timeout_seconds=30, + container_id="ctr-abc123", + ) + + assert step.exit_code == 0 + assert "docker" in captured_args + assert "ctr-abc123" in captured_args + assert "ruff" in captured_args + + @pytest.mark.asyncio + async def test_run_cmd_without_container_id_runs_locally(self, tmp_path): + """When container_id is empty, runs locally (original behaviour).""" + proc = _make_proc(returncode=0, stdout=b"ok", stderr=b"") + + with patch("asyncio.create_subprocess_exec", return_value=proc): + step = await VerifierAgent()._run_cmd( + label="go_test", + cmd_args=["go", "test", "./..."], + cwd=tmp_path, + timeout_seconds=30, + container_id="", + ) + + assert step.exit_code == 0 + + @pytest.mark.asyncio + async def test_verify_passes_container_id_to_run_cmd(self, tmp_path): + """verify() extracts container_id from sandbox_result and threads it through.""" + proc = _make_proc(returncode=0, stdout=b"ok", stderr=b"") + captured_container_ids = [] + + original_run_cmd = VerifierAgent._run_cmd + + async def recording_run_cmd(self, label, cmd_args, cwd, timeout_seconds, container_id=""): + captured_container_ids.append(container_id) + return await original_run_cmd( + self, label, cmd_args, cwd, timeout_seconds, container_id=container_id + ) + + with patch("asyncio.create_subprocess_exec", return_value=proc): + with patch.object(VerifierAgent, "_run_cmd", recording_run_cmd): + await VerifierAgent().verify( + workspace_path=tmp_path, + stack="go", + sandbox_result=_make_sandbox_result(container_id="ctr-xyz"), + timeout_seconds=30, + ) + + assert all(cid == "ctr-xyz" for cid in captured_container_ids) + + @pytest.mark.asyncio + async def test_verify_no_container_id_when_sandbox_none(self, tmp_path): + """sandbox_result=None → container_id="" → local subprocess path.""" + proc = _make_proc(returncode=0, stdout=b"ok", stderr=b"") + captured_container_ids = [] + + original_run_cmd = VerifierAgent._run_cmd + + async def recording_run_cmd(self, label, cmd_args, cwd, timeout_seconds, container_id=""): + captured_container_ids.append(container_id) + return await original_run_cmd( + self, label, cmd_args, cwd, timeout_seconds, container_id=container_id + ) + + with patch("asyncio.create_subprocess_exec", return_value=proc): + with patch.object(VerifierAgent, "_run_cmd", recording_run_cmd): + await VerifierAgent().verify( + workspace_path=tmp_path, + stack="go", + sandbox_result=None, + timeout_seconds=30, + ) + + assert all(cid == "" for cid in captured_container_ids) + + def test_container_id_helper_no_sandbox(self): + assert VerifierAgent()._container_id(None) == "" + + def test_container_id_helper_with_container(self): + sr = _make_sandbox_result(container_id="ctr-123") + assert VerifierAgent()._container_id(sr) == "ctr-123" + + def test_container_id_helper_empty_container(self): + sr = _make_sandbox_result(container_id="") + assert VerifierAgent()._container_id(sr) == "" + + +# ── VerificationStep dataclass ──────────────────────────────────────────────── + + +class TestVerificationStep: + def test_defaults(self): + step = VerificationStep( + label="ruff", + cmd="ruff check .", + exit_code=0, + output="clean", + elapsed_seconds=1.2, + ) + assert step.timed_out is False + + def test_timed_out_flag(self): + step = VerificationStep( + label="pytest", + cmd="pytest", + exit_code=-1, + output="", + elapsed_seconds=120.0, + timed_out=True, + ) + assert step.timed_out is True diff --git a/tests/unit/test_ci_validator_unit.py b/tests/unit/test_ci_validator_unit.py index bbd9b4c7..f22871fe 100644 --- a/tests/unit/test_ci_validator_unit.py +++ b/tests/unit/test_ci_validator_unit.py @@ -8,10 +8,8 @@ from unittest.mock import MagicMock, patch -import pytest - from phalanx.ci_fixer.log_parser import LintError, ParsedLog, TestFailure, TypeError -from phalanx.ci_fixer.validator import ValidationResult, validate_fix +from phalanx.ci_fixer.validator import validate_fix def _parsed(tool: str, **kwargs) -> ParsedLog: @@ -27,60 +25,87 @@ def _mock_run(self, returncode: int, stdout: str = "", stderr: str = ""): return result def test_ruff_pass(self, tmp_path): - parsed = _parsed("ruff", lint_errors=[ - LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") - ]) + parsed = _parsed( + "ruff", + lint_errors=[ + LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") + ], + ) with patch("subprocess.run", return_value=self._mock_run(0, "All good")): result = validate_fix(parsed, tmp_path) assert result.passed is True assert result.tool == "ruff" def test_ruff_fail(self, tmp_path): - parsed = _parsed("ruff", lint_errors=[ - LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") - ]) - with patch("subprocess.run", return_value=self._mock_run(1, "", "phalanx/foo.py:1:1: F401")): + parsed = _parsed( + "ruff", + lint_errors=[ + LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") + ], + ) + with patch( + "subprocess.run", return_value=self._mock_run(1, "", "phalanx/foo.py:1:1: F401") + ): result = validate_fix(parsed, tmp_path) assert result.passed is False def test_mypy_pass(self, tmp_path): - parsed = _parsed("mypy", type_errors=[ - TypeError(file="phalanx/foo.py", line=5, col=0, message="type error") - ]) + parsed = _parsed( + "mypy", + type_errors=[TypeError(file="phalanx/foo.py", line=5, col=0, message="type error")], + ) with patch("subprocess.run", return_value=self._mock_run(0)): result = validate_fix(parsed, tmp_path) assert result.passed is True assert result.tool == "mypy" def test_pytest_pass(self, tmp_path): - parsed = _parsed("pytest", test_failures=[ - TestFailure(test_id="tests/unit/test_foo.py::test_bar", file="tests/unit/test_foo.py", message="") - ]) + parsed = _parsed( + "pytest", + test_failures=[ + TestFailure( + test_id="tests/unit/test_foo.py::test_bar", + file="tests/unit/test_foo.py", + message="", + ) + ], + ) with patch("subprocess.run", return_value=self._mock_run(0)): result = validate_fix(parsed, tmp_path) assert result.passed is True assert result.tool == "pytest" def test_pytest_fail(self, tmp_path): - parsed = _parsed("pytest", test_failures=[ - TestFailure(test_id="tests/unit/test_foo.py::test_bar", file="tests/unit/test_foo.py", message="") - ]) + parsed = _parsed( + "pytest", + test_failures=[ + TestFailure( + test_id="tests/unit/test_foo.py::test_bar", + file="tests/unit/test_foo.py", + message="", + ) + ], + ) with patch("subprocess.run", return_value=self._mock_run(1, "", "FAILED")): result = validate_fix(parsed, tmp_path) assert result.passed is False def test_tsc_pass(self, tmp_path): - parsed = _parsed("tsc", type_errors=[ - TypeError(file="src/foo.ts", line=1, col=1, message="TS2345: error") - ]) + parsed = _parsed( + "tsc", + type_errors=[TypeError(file="src/foo.ts", line=1, col=1, message="TS2345: error")], + ) with patch("subprocess.run", return_value=self._mock_run(0)): result = validate_fix(parsed, tmp_path) assert result.passed is True def test_eslint_pass(self, tmp_path): - parsed = _parsed("eslint", lint_errors=[ - LintError(file="src/foo.js", line=1, col=1, code="eslint", message="no-unused-vars") - ]) + parsed = _parsed( + "eslint", + lint_errors=[ + LintError(file="src/foo.js", line=1, col=1, code="eslint", message="no-unused-vars") + ], + ) with patch("subprocess.run", return_value=self._mock_run(0)): result = validate_fix(parsed, tmp_path) assert result.passed is True @@ -92,9 +117,12 @@ def test_unknown_tool_skips_validation(self, tmp_path): assert "skipped" in result.output def test_tool_not_found_returns_fail(self, tmp_path): - parsed = _parsed("ruff", lint_errors=[ - LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") - ]) + parsed = _parsed( + "ruff", + lint_errors=[ + LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") + ], + ) with patch("subprocess.run", side_effect=FileNotFoundError): result = validate_fix(parsed, tmp_path) assert result.passed is False @@ -102,53 +130,79 @@ def test_tool_not_found_returns_fail(self, tmp_path): def test_timeout_returns_fail(self, tmp_path): import subprocess - parsed = _parsed("ruff", lint_errors=[ - LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") - ]) - with patch("subprocess.run", side_effect=subprocess.TimeoutExpired(cmd="ruff", timeout=120)): + + parsed = _parsed( + "ruff", + lint_errors=[ + LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") + ], + ) + with patch( + "subprocess.run", side_effect=subprocess.TimeoutExpired(cmd="ruff", timeout=120) + ): result = validate_fix(parsed, tmp_path) assert result.passed is False assert "timed out" in result.output def test_mypy_pass(self, tmp_path): - parsed = _parsed("mypy", type_errors=[ - TypeError(file="phalanx/foo.py", line=5, col=0, message="type error") - ]) + parsed = _parsed( + "mypy", + type_errors=[TypeError(file="phalanx/foo.py", line=5, col=0, message="type error")], + ) with patch("subprocess.run", return_value=self._mock_run(0)): result = validate_fix(parsed, tmp_path) assert result.passed is True assert result.tool == "mypy" def test_mypy_fail(self, tmp_path): - parsed = _parsed("mypy", type_errors=[ - TypeError(file="phalanx/foo.py", line=5, col=0, message="type error") - ]) + parsed = _parsed( + "mypy", + type_errors=[TypeError(file="phalanx/foo.py", line=5, col=0, message="type error")], + ) with patch("subprocess.run", return_value=self._mock_run(1, "", "phalanx/foo.py:5: error")): result = validate_fix(parsed, tmp_path) assert result.passed is False def test_pytest_pass(self, tmp_path): - parsed = _parsed("pytest", test_failures=[ - TestFailure(test_id="tests/unit/test_foo.py::test_bar", file="tests/unit/test_foo.py", message="") - ]) + parsed = _parsed( + "pytest", + test_failures=[ + TestFailure( + test_id="tests/unit/test_foo.py::test_bar", + file="tests/unit/test_foo.py", + message="", + ) + ], + ) with patch("subprocess.run", return_value=self._mock_run(0)): result = validate_fix(parsed, tmp_path) assert result.passed is True assert result.tool == "pytest" def test_pytest_fail(self, tmp_path): - parsed = _parsed("pytest", test_failures=[ - TestFailure(test_id="tests/unit/test_foo.py::test_bar", file="tests/unit/test_foo.py", message="") - ]) + parsed = _parsed( + "pytest", + test_failures=[ + TestFailure( + test_id="tests/unit/test_foo.py::test_bar", + file="tests/unit/test_foo.py", + message="", + ) + ], + ) with patch("subprocess.run", return_value=self._mock_run(1, "", "FAILED")): result = validate_fix(parsed, tmp_path) assert result.passed is False def test_tool_version_captured(self, tmp_path): """tool_version is populated from --version output.""" - parsed = _parsed("ruff", lint_errors=[ - LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") - ]) + parsed = _parsed( + "ruff", + lint_errors=[ + LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") + ], + ) + # First call = --version, subsequent = ruff check def side_effect(cmd, **kwargs): if "--version" in cmd: @@ -163,14 +217,19 @@ def side_effect(cmd, **kwargs): def test_regression_check_fires_on_new_error(self, tmp_path): """Regression check catches errors introduced into other files.""" - from phalanx.ci_fixer.log_parser import parse_log - - original = _parsed("ruff", lint_errors=[ - LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") - ]) - fixed_parsed = _parsed("ruff", lint_errors=[ - LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") - ]) + + original = _parsed( + "ruff", + lint_errors=[ + LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") + ], + ) + fixed_parsed = _parsed( + "ruff", + lint_errors=[ + LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") + ], + ) # Primary check passes (foo.py is clean), but broad check finds a NEW error in bar.py call_count = {"n": 0} @@ -192,12 +251,20 @@ def side_effect(cmd, **kwargs): def test_regression_check_skips_pre_existing_errors(self, tmp_path): """Pre-existing errors in original_parsed are not counted as regressions.""" - original = _parsed("ruff", lint_errors=[ - LintError(file="phalanx/bar.py", line=5, col=1, code="E501", message="line too long") - ]) - fixed_parsed = _parsed("ruff", lint_errors=[ - LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") - ]) + original = _parsed( + "ruff", + lint_errors=[ + LintError( + file="phalanx/bar.py", line=5, col=1, code="E501", message="line too long" + ) + ], + ) + fixed_parsed = _parsed( + "ruff", + lint_errors=[ + LintError(file="phalanx/foo.py", line=1, col=1, code="F401", message="unused") + ], + ) def side_effect(cmd, **kwargs): if "--version" in cmd: diff --git a/tests/unit/test_ci_webhooks_unit.py b/tests/unit/test_ci_webhooks_unit.py index b714690d..60ba6dc7 100644 --- a/tests/unit/test_ci_webhooks_unit.py +++ b/tests/unit/test_ci_webhooks_unit.py @@ -15,6 +15,7 @@ from phalanx.api.routes.ci_webhooks import ( _parse_repo_name, _verify_buildkite_signature, + _verify_circleci_signature, _verify_github_signature, ) from phalanx.ci_fixer.log_fetcher import ( @@ -222,36 +223,236 @@ def test_head_preserved(self): assert "START_MARKER" in result -# ── Stub fetchers (CircleCI, Jenkins) ───────────────────────────────────────── +# ── CircleCI / Jenkins fetcher helpers ───────────────────────────────────────── from phalanx.ci_fixer.events import CIFailureEvent # noqa: E402 -def _make_event(): +def _make_circleci_event(build_id: str = "wf-uuid-1234") -> CIFailureEvent: return CIFailureEvent( provider="circleci", repo_full_name="acme/api", - branch="main", - commit_sha="abc", - build_id="1", - build_url="https://ci.example.com/1", + branch="fix/my-branch", + commit_sha="deadbeef", + build_id=build_id, + build_url="https://app.circleci.com/pipelines/github/acme/api/1/workflows/wf-uuid-1234", ) -class TestStubFetchers: +def _make_circleci_client( + jobs_payload: dict | None = None, + steps_payload: dict | None = None, + log_content: str = "", + log_is_json: bool = False, + workflow_jobs_fail: bool = False, + steps_fail: bool = False, + log_fetch_fail: bool = False, +): + """Build a mock httpx.AsyncClient for CircleCI API calls.""" + from unittest.mock import AsyncMock, MagicMock + + client = MagicMock() + client.__aenter__ = AsyncMock(return_value=client) + client.__aexit__ = AsyncMock(return_value=False) + + responses: list = [] + + # Call 1: GET /workflow/{id}/job + if workflow_jobs_fail: + job_resp = MagicMock() + job_resp.raise_for_status.side_effect = Exception("403 Forbidden") + else: + job_resp = MagicMock() + job_resp.raise_for_status = MagicMock() + job_resp.json.return_value = jobs_payload or { + "items": [ + {"job_number": 42, "name": "test-job", "status": "failed"}, + ] + } + responses.append(job_resp) + + if not workflow_jobs_fail: + # Call 2: GET /project/{slug}/job/{number}/steps + if steps_fail: + steps_resp = MagicMock() + steps_resp.raise_for_status.side_effect = Exception("404 Not Found") + else: + steps_resp = MagicMock() + steps_resp.raise_for_status = MagicMock() + steps_resp.json.return_value = steps_payload or { + "items": [ + { + "name": "Run tests", + "actions": [ + { + "exit_code": 1, + "failed": True, + "output_url": "https://circle-output.s3.amazonaws.com/out", + } + ], + } + ] + } + responses.append(steps_resp) + + if not steps_fail: + # Call 3: GET output_url + if log_fetch_fail: + log_resp = MagicMock() + log_resp.status_code = 500 + else: + log_resp = MagicMock() + log_resp.status_code = 200 + if log_is_json: + log_resp.headers = {"content-type": "application/json"} + log_resp.json.return_value = [{"message": log_content, "type": "out"}] + else: + log_resp.headers = {"content-type": "text/plain"} + log_resp.text = log_content + responses.append(log_resp) + + client.get = AsyncMock(side_effect=responses) + return client + + +class TestCircleCILogFetcher: + @pytest.mark.asyncio + async def test_fetch_failed_job_plain_text_log(self): + """Happy path: failed job with plain-text output → failure section returned.""" + log_text = "Step 1\nStep 2\nError: assert failed\nStep 4" + client = _make_circleci_client(log_content=log_text) + + with patch("phalanx.ci_fixer.log_fetcher.httpx.AsyncClient", return_value=client): + result = await CircleCILogFetcher().fetch(_make_circleci_event(), "tok") + + assert "Error: assert failed" in result + + @pytest.mark.asyncio + async def test_fetch_failed_job_json_log(self): + """CircleCI JSON log format: array of {message, type} objects.""" + client = _make_circleci_client( + log_content="ruff: F401 unused import", + log_is_json=True, + ) + + with patch("phalanx.ci_fixer.log_fetcher.httpx.AsyncClient", return_value=client): + result = await CircleCILogFetcher().fetch(_make_circleci_event(), "tok") + + assert "F401" in result + + @pytest.mark.asyncio + async def test_fetch_no_failed_jobs(self): + """Workflow with no failed jobs → informative message.""" + client = _make_circleci_client(jobs_payload={"items": []}) + + with patch("phalanx.ci_fixer.log_fetcher.httpx.AsyncClient", return_value=client): + result = await CircleCILogFetcher().fetch(_make_circleci_event(), "tok") + + assert "no failed jobs" in result + + @pytest.mark.asyncio + async def test_fetch_workflow_jobs_api_fails(self): + """GET /workflow/jobs raises → no logs retrieved.""" + client = _make_circleci_client(workflow_jobs_fail=True) + + with patch("phalanx.ci_fixer.log_fetcher.httpx.AsyncClient", return_value=client): + result = await CircleCILogFetcher().fetch(_make_circleci_event(), "tok") + + assert "no failed jobs" in result or isinstance(result, str) + @pytest.mark.asyncio - async def test_circleci_returns_string(self): - fetcher = CircleCILogFetcher() - result = await fetcher.fetch(_make_event(), "key") + async def test_fetch_steps_api_fails_gracefully(self): + """GET /job/{n}/steps raises → no logs retrieved for that job.""" + client = _make_circleci_client(steps_fail=True) + + with patch("phalanx.ci_fixer.log_fetcher.httpx.AsyncClient", return_value=client): + result = await CircleCILogFetcher().fetch(_make_circleci_event(), "tok") + + assert isinstance(result, str) + + @pytest.mark.asyncio + async def test_fetch_log_url_fails_gracefully(self): + """Output URL fetch returns 500 → no output for that step.""" + client = _make_circleci_client(log_content="", log_fetch_fail=True) + + with patch("phalanx.ci_fixer.log_fetcher.httpx.AsyncClient", return_value=client): + result = await CircleCILogFetcher().fetch(_make_circleci_event(), "tok") + + assert isinstance(result, str) + + @pytest.mark.asyncio + async def test_fetch_timedout_job_included(self): + """Jobs with status=timedout are treated as failed.""" + client = _make_circleci_client( + jobs_payload={"items": [{"job_number": 7, "name": "slow-build", "status": "timedout"}]}, + log_content="Timeout: job exceeded 10 minutes", + ) + + with patch("phalanx.ci_fixer.log_fetcher.httpx.AsyncClient", return_value=client): + result = await CircleCILogFetcher().fetch(_make_circleci_event(), "tok") + assert isinstance(result, str) - assert len(result) > 0 + @pytest.mark.asyncio + async def test_fetch_multiple_failed_jobs_limited_to_three(self): + """Up to 3 failed jobs are fetched; extras are silently dropped.""" + jobs = [{"job_number": i, "name": f"job-{i}", "status": "failed"} for i in range(1, 6)] + # Build a client that returns jobs list, then step + log for each of the first 3 + from unittest.mock import AsyncMock, MagicMock + + client = MagicMock() + client.__aenter__ = AsyncMock(return_value=client) + client.__aexit__ = AsyncMock(return_value=False) + + jobs_resp = MagicMock() + jobs_resp.raise_for_status = MagicMock() + jobs_resp.json.return_value = {"items": jobs} + + def _make_steps_resp(): + r = MagicMock() + r.raise_for_status = MagicMock() + r.json.return_value = { + "items": [ + {"name": "run", "actions": [{"exit_code": 1, "output_url": "https://s3/out"}]} + ] + } + return r + + def _make_log_resp(): + r = MagicMock() + r.status_code = 200 + r.headers = {"content-type": "text/plain"} + r.text = "Error: something failed" + return r + + # jobs + (steps + log) * 3 = 7 calls + responses = [jobs_resp] + for _ in range(3): + responses.append(_make_steps_resp()) + responses.append(_make_log_resp()) + + client.get = AsyncMock(side_effect=responses) + + with patch("phalanx.ci_fixer.log_fetcher.httpx.AsyncClient", return_value=client): + result = await CircleCILogFetcher().fetch(_make_circleci_event(), "tok") + + assert isinstance(result, str) + # Exactly 3 job log sections (or fewer, combined into one string) + assert result.count("JOB:") <= 3 + + +class TestJenkinsLogFetcher: @pytest.mark.asyncio async def test_jenkins_returns_string(self): - fetcher = JenkinsLogFetcher() - e = _make_event() - e.provider = "jenkins" - result = await fetcher.fetch(e, "key") + event = CIFailureEvent( + provider="jenkins", + repo_full_name="acme/api", + branch="main", + commit_sha="abc", + build_id="1", + build_url="https://jenkins.example.com/job/1", + ) + result = await JenkinsLogFetcher().fetch(event, "key") assert isinstance(result, str) assert len(result) > 0 @@ -551,7 +752,7 @@ def _make_app(): from phalanx.api.routes.ci_webhooks import router app = FastAPI() - app.include_router(router) + app.include_router(router, prefix="/webhook") return app @@ -668,16 +869,181 @@ def test_build_finished_dispatches(self): assert r.json()["status"] == "skipped" -class TestStubWebhookRoutes: +class TestJenkinsWebhookRoute: def setup_method(self): self.client = TestClient(_make_app()) - def test_circleci_stub(self): - r = self.client.post("/webhook/circleci", content=b"{}") - assert r.status_code == 200 - assert r.json()["status"] == "coming_soon" - def test_jenkins_stub(self): r = self.client.post("/webhook/jenkins", content=b"{}") assert r.status_code == 200 assert r.json()["status"] == "coming_soon" + + +# ── _verify_circleci_signature ───────────────────────────────────────────────── + + +class TestVerifyCircleCISignature: + def _make_sig(self, body: bytes, secret: str) -> str: + digest = hmac.new(secret.encode(), body, hashlib.sha256).hexdigest() + return f"v1={digest}" + + def test_valid_signature(self): + body = b'{"type": "workflow-completed"}' + secret = "circle-secret" + sig = self._make_sig(body, secret) + assert _verify_circleci_signature(body, sig, secret) is True + + def test_invalid_signature(self): + body = b'{"type": "workflow-completed"}' + assert _verify_circleci_signature(body, "v1=invalidsig", "secret") is False + + def test_no_secret_always_passes(self): + assert _verify_circleci_signature(b"anything", "", "") is True + assert _verify_circleci_signature(b"anything", "v1=bad", "") is True + + def test_tampered_body_fails(self): + body = b'{"type": "workflow-completed"}' + secret = "my-secret" + sig = self._make_sig(body, secret) + tampered = b'{"type": "job-completed"}' + assert _verify_circleci_signature(tampered, sig, secret) is False + + def test_empty_signature_with_secret_fails(self): + assert _verify_circleci_signature(b"data", "", "some-secret") is False + + +# ── CircleCI webhook route ───────────────────────────────────────────────────── + + +def _circleci_payload( + event_type: str = "workflow-completed", + status: str = "failed", + branch: str = "fix/my-branch", + repo_url: str = "https://github.com/acme/api", + commit_sha: str = "abc123", + workflow_id: str = "wf-uuid-001", + pipeline_number: int = 10, + pr_author: str | None = "dev-user", +) -> dict: + return { + "type": event_type, + "workflow": { + "id": workflow_id, + "name": "build-and-test", + "status": status, + }, + "pipeline": { + "id": "pipe-uuid", + "number": pipeline_number, + "vcs": { + "origin_repository_url": repo_url, + "branch": branch, + "revision": commit_sha, + "commit": { + "subject": "fix: update deps", + "author": {"login": pr_author} if pr_author else {}, + }, + }, + }, + "project": {"id": "proj-uuid", "name": "api", "slug": "github/acme/api"}, + "organization": {"name": "acme"}, + } + + +class TestCircleCIWebhookRoutes: + def setup_method(self): + self.client = TestClient(_make_app()) + + def _post(self, payload: dict, sig: str = "") -> object: + return self.client.post( + "/webhook/circleci", + content=_json.dumps(payload).encode(), + headers={ + "circleci-signature": sig, + "content-type": "application/json", + }, + ) + + def test_non_workflow_event_is_ignored(self): + r = self._post(_circleci_payload(event_type="job-completed")) + assert r.status_code == 200 + assert r.json()["status"] == "ignored" + + def test_successful_workflow_is_ignored(self): + r = self._post(_circleci_payload(status="success")) + assert r.status_code == 200 + assert r.json()["status"] == "ignored" + + def test_workflow_on_hold_is_ignored(self): + r = self._post(_circleci_payload(status="on_hold")) + assert r.status_code == 200 + assert r.json()["status"] == "ignored" + + def test_failed_workflow_dispatches(self): + with _patch("phalanx.api.routes.ci_webhooks._dispatch_ci_fix", return_value=None): + r = self._post(_circleci_payload(status="failed")) + assert r.status_code == 200 + assert r.json()["status"] == "skipped" # _dispatch_ci_fix returned None + + def test_error_workflow_dispatches(self): + """'error' is also a failed state.""" + with _patch("phalanx.api.routes.ci_webhooks._dispatch_ci_fix", return_value=None): + r = self._post(_circleci_payload(status="error")) + assert r.status_code == 200 + assert r.json()["status"] == "skipped" + + def test_unparseable_repo_is_skipped(self): + payload = _circleci_payload( + status="failed", + repo_url="https://gitlab.com/acme/api", # not github + ) + # Remove project slug so fallback also fails + payload["project"] = {"id": "x", "name": "api", "slug": "gitlab/acme/api"} + r = self._post(payload) + assert r.status_code == 200 + assert r.json()["status"] == "skipped" + assert "cannot_parse_repo" in r.json()["reason"] + + def test_repo_from_project_slug_fallback(self): + """When VCS URL is missing, repo is parsed from project.slug.""" + payload = _circleci_payload(status="failed", repo_url="") + payload["project"] = {"slug": "github/acme/api"} + with _patch("phalanx.api.routes.ci_webhooks._dispatch_ci_fix", return_value=None): + r = self._post(payload) + assert r.status_code == 200 + assert r.json()["status"] == "skipped" + + def test_pr_number_parsed_from_branch(self): + """Branch 'pull/42' → pr_number=42.""" + from unittest.mock import AsyncMock, MagicMock + + captured = {} + + async def capture_dispatch(event): + captured["event"] = event + return None + + with _patch( + "phalanx.api.routes.ci_webhooks._dispatch_ci_fix", + side_effect=capture_dispatch, + ): + self._post(_circleci_payload(status="failed", branch="pull/42")) + + assert captured.get("event") is not None + assert captured["event"].pr_number == 42 + + def test_invalid_signature_returns_401(self): + body = _json.dumps(_circleci_payload(status="failed")).encode() + with _patch("phalanx.api.routes.ci_webhooks.settings") as mock_settings: + mock_settings.circleci_webhook_secret = "real-secret" + mock_settings.buildkite_webhook_token = "" + mock_settings.github_webhook_secret = "" + r = self.client.post( + "/webhook/circleci", + content=body, + headers={ + "circleci-signature": "v1=invalidsignature", + "content-type": "application/json", + }, + ) + assert r.status_code == 401 diff --git a/tests/unit/test_coverage_boost.py b/tests/unit/test_coverage_boost.py index 2756992c..23d54dd8 100644 --- a/tests/unit/test_coverage_boost.py +++ b/tests/unit/test_coverage_boost.py @@ -13,13 +13,12 @@ from __future__ import annotations import json -from datetime import UTC, datetime, timedelta +from datetime import UTC, datetime from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest - # ══════════════════════════════════════════════════════════════════════════════ # product_manager.py # ══════════════════════════════════════════════════════════════════════════════ @@ -76,16 +75,30 @@ async def test_product_manager_execute_for_work_order_success(): work_order.title = "Build a blog" work_order.description = "A simple blogging platform" - llm_response = json.dumps({ - "app_type": "web", - "tech_stack": "nextjs", - "epics": [ - {"title": "Infrastructure", "description": "DB + auth", "sequence_num": 1, "estimated_complexity": 3}, - {"title": "Frontend", "description": "React pages", "sequence_num": 2, "estimated_complexity": 2}, - ], - "user_stories": ["As a user I can write posts"], - "acceptance_criteria": ["Given I am logged in, When I click New Post, Then I see the editor"], - }) + llm_response = json.dumps( + { + "app_type": "web", + "tech_stack": "nextjs", + "epics": [ + { + "title": "Infrastructure", + "description": "DB + auth", + "sequence_num": 1, + "estimated_complexity": 3, + }, + { + "title": "Frontend", + "description": "React pages", + "sequence_num": 2, + "estimated_complexity": 2, + }, + ], + "user_stories": ["As a user I can write posts"], + "acceptance_criteria": [ + "Given I am logged in, When I click New Post, Then I see the editor" + ], + } + ) mock_session = AsyncMock() mock_session.add = MagicMock() @@ -199,9 +212,12 @@ async def test_verifier_execute_task_success(): """execute_task creates agent and runs it.""" from phalanx.agents.verifier import execute_task - with patch("phalanx.agents.verifier.VerifierAgent") as MockAgent, \ - patch("phalanx.agents.verifier.asyncio.run") as mock_run: + with ( + patch("phalanx.agents.verifier.VerifierAgent") as MockAgent, + patch("phalanx.agents.verifier.asyncio.run") as mock_run, + ): from phalanx.agents.base import AgentResult + mock_instance = MagicMock() mock_instance.execute.return_value = AgentResult(success=True, output={}) MockAgent.return_value = mock_instance @@ -283,12 +299,17 @@ async def test_verifier_execute_build_errors(): mock_profile = MagicMock() mock_profile.build_cmd = "npm run build" - with patch("phalanx.agents.verifier.get_db", return_value=mock_ctx), \ - patch("phalanx.agents.verifier.settings") as mock_settings, \ - patch("phalanx.agents.verifier.detect_tech_stack", return_value="nextjs"), \ - patch("phalanx.agents.verifier.get_profile", return_value=mock_profile), \ - patch("phalanx.agents.verifier.run_profile_checks", return_value=["build failed: missing file"]), \ - patch("phalanx.agents.verifier.merge_workspace", return_value=mock_merged_dir): + with ( + patch("phalanx.agents.verifier.get_db", return_value=mock_ctx), + patch("phalanx.agents.verifier.settings") as mock_settings, + patch("phalanx.agents.verifier.detect_tech_stack", return_value="nextjs"), + patch("phalanx.agents.verifier.get_profile", return_value=mock_profile), + patch( + "phalanx.agents.verifier.run_profile_checks", + return_value=["build failed: missing file"], + ), + patch("phalanx.agents.verifier.merge_workspace", return_value=mock_merged_dir), + ): mock_settings.git_workspace = "/tmp/forge" result = await agent.execute() @@ -437,8 +458,12 @@ async def test_poll_all_pending_no_runs(): mock_ctx.__aenter__ = AsyncMock(return_value=mock_session) mock_ctx.__aexit__ = AsyncMock(return_value=None) - with patch("phalanx.ci_fixer.outcome_tracker.get_db", return_value=mock_ctx), \ - patch("phalanx.ci_fixer.outcome_tracker._process_run", new_callable=AsyncMock) as mock_process: + with ( + patch("phalanx.ci_fixer.outcome_tracker.get_db", return_value=mock_ctx), + patch( + "phalanx.ci_fixer.outcome_tracker._process_run", new_callable=AsyncMock + ) as mock_process, + ): await _poll_all_pending() mock_process.assert_not_called() @@ -457,10 +482,11 @@ def test_poll_fix_outcomes_reraises(): """poll_fix_outcomes re-raises on exception.""" from phalanx.ci_fixer.outcome_tracker import poll_fix_outcomes - with patch("phalanx.ci_fixer.outcome_tracker.asyncio.run", - side_effect=RuntimeError("boom")): - with pytest.raises(RuntimeError, match="boom"): - poll_fix_outcomes() + with ( + patch("phalanx.ci_fixer.outcome_tracker.asyncio.run", side_effect=RuntimeError("boom")), + pytest.raises(RuntimeError, match="boom"), + ): + poll_fix_outcomes() # ══════════════════════════════════════════════════════════════════════════════ @@ -509,8 +535,10 @@ def test_validator_subprocess_error(tmp_path): (tmp_path / "src").mkdir() (tmp_path / "src" / "foo.py").write_text("import os\n") - with patch("shutil.which", return_value="/usr/bin/ruff"), \ - patch("subprocess.run", side_effect=FileNotFoundError("ruff: not found")): + with ( + patch("shutil.which", return_value="/usr/bin/ruff"), + patch("subprocess.run", side_effect=FileNotFoundError("ruff: not found")), + ): result = validate_fix(parsed, tmp_path) assert result.passed is False @@ -593,12 +621,17 @@ async def test_run_scan_empty_findings_no_comment(): """_run_scan with empty findings → no comment posted.""" from phalanx.ci_fixer.proactive_scanner import _run_scan - with patch("phalanx.ci_fixer.proactive_scanner.scan_pr_for_patterns", - new_callable=AsyncMock, return_value=[]), \ - patch("phalanx.ci_fixer.proactive_scanner._post_comment", - new_callable=AsyncMock) as mock_post, \ - patch("phalanx.ci_fixer.proactive_scanner._record_scan", - new_callable=AsyncMock): + with ( + patch( + "phalanx.ci_fixer.proactive_scanner.scan_pr_for_patterns", + new_callable=AsyncMock, + return_value=[], + ), + patch( + "phalanx.ci_fixer.proactive_scanner._post_comment", new_callable=AsyncMock + ) as mock_post, + patch("phalanx.ci_fixer.proactive_scanner._record_scan", new_callable=AsyncMock), + ): await _run_scan("acme/backend", 1, "abc", "token") mock_post.assert_not_called() @@ -632,8 +665,10 @@ async def test_scan_pr_mypy_patterns(): mock_ctx.__aenter__ = AsyncMock(return_value=mock_session) mock_ctx.__aexit__ = AsyncMock(return_value=None) - with patch("httpx.AsyncClient", return_value=mock_client), \ - patch("phalanx.ci_fixer.proactive_scanner.get_db", return_value=mock_ctx): + with ( + patch("httpx.AsyncClient", return_value=mock_client), + patch("phalanx.ci_fixer.proactive_scanner.get_db", return_value=mock_ctx), + ): findings = await scan_pr_for_patterns("acme/backend", 1, "abc", "token") assert len(findings) >= 0 # At minimum doesn't crash @@ -657,10 +692,11 @@ def test_promote_patterns_reraises(): """promote_patterns re-raises on exception.""" from phalanx.ci_fixer.pattern_promoter import promote_patterns - with patch("phalanx.ci_fixer.pattern_promoter.asyncio.run", - side_effect=RuntimeError("boom")): - with pytest.raises(RuntimeError, match="boom"): - promote_patterns() + with ( + patch("phalanx.ci_fixer.pattern_promoter.asyncio.run", side_effect=RuntimeError("boom")), + pytest.raises(RuntimeError, match="boom"), + ): + promote_patterns() # ══════════════════════════════════════════════════════════════════════════════ diff --git a/tests/unit/test_coverage_boost2.py b/tests/unit/test_coverage_boost2.py index 5bbc638a..22612a76 100644 --- a/tests/unit/test_coverage_boost2.py +++ b/tests/unit/test_coverage_boost2.py @@ -9,13 +9,10 @@ from __future__ import annotations -import json -from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest - # ══════════════════════════════════════════════════════════════════════════════ # release.py # ══════════════════════════════════════════════════════════════════════════════ @@ -75,8 +72,14 @@ async def test_release_execute_github_skipped(): agent._load_task_summaries = AsyncMock(return_value=[]) agent._audit = AsyncMock() - mock_notes = {"title": "Release X", "summary": "X was built", "changes": [], "testing": "passed", - "rollback": "revert", "breaking_changes": []} + mock_notes = { + "title": "Release X", + "summary": "X was built", + "changes": [], + "testing": "passed", + "rollback": "revert", + "breaking_changes": [], + } mock_session = AsyncMock() mock_session.execute = AsyncMock(return_value=MagicMock()) @@ -85,10 +88,14 @@ async def test_release_execute_github_skipped(): mock_ctx.__aenter__ = AsyncMock(return_value=mock_session) mock_ctx.__aexit__ = AsyncMock(return_value=None) - with patch("phalanx.agents.release.get_db", return_value=mock_ctx), \ - patch("phalanx.agents.release.settings") as mock_settings, \ - patch.object(agent, "_generate_release_notes", new_callable=AsyncMock, return_value=mock_notes), \ - patch.object(agent, "_persist_artifact", new_callable=AsyncMock): + with ( + patch("phalanx.agents.release.get_db", return_value=mock_ctx), + patch("phalanx.agents.release.settings") as mock_settings, + patch.object( + agent, "_generate_release_notes", new_callable=AsyncMock, return_value=mock_notes + ), + patch.object(agent, "_persist_artifact", new_callable=AsyncMock), + ): mock_settings.github_token = "" # no token → skip result = await agent.execute() @@ -137,12 +144,16 @@ async def test_release_create_github_pr_import_error(): mock_run.active_branch = "feature/x" mock_run.project_id = "proj-1" - with patch("phalanx.agents.release.settings") as mock_settings, \ - patch("phalanx.agents.release.get_db"): + with ( + patch("phalanx.agents.release.settings") as mock_settings, + patch("phalanx.agents.release.get_db"), + ): mock_settings.github_token = "ghp_test" # Simulate import error with patch.dict("sys.modules", {"github": None}): - result = await agent._create_github_pr(mock_run, None, {"changes": [], "breaking_changes": []}) + result = await agent._create_github_pr( + mock_run, None, {"changes": [], "breaking_changes": []} + ) # ImportError → returns {} assert result == {} or "error" in result @@ -171,15 +182,23 @@ async def test_release_create_github_pr_exception(): mock_github = MagicMock() mock_github.Github.side_effect = Exception("API error") - with patch("phalanx.agents.release.settings") as mock_settings, \ - patch("phalanx.agents.release.get_db", return_value=mock_ctx), \ - patch.dict("sys.modules", {"github": mock_github}): + with ( + patch("phalanx.agents.release.settings") as mock_settings, + patch("phalanx.agents.release.get_db", return_value=mock_ctx), + patch.dict("sys.modules", {"github": mock_github}), + ): mock_settings.github_token = "ghp_test" result = await agent._create_github_pr( mock_run, None, - {"summary": "x", "changes": [], "testing": "y", "rollback": "z", - "breaking_changes": [], "title": "Release X"}, + { + "summary": "x", + "changes": [], + "testing": "y", + "rollback": "z", + "breaking_changes": [], + "title": "Release X", + }, ) assert "error" in result @@ -282,13 +301,19 @@ async def test_integration_wiring_execute_with_builder_tasks(tmp_path): mock_profile = MagicMock() mock_profile.integration_pattern = "fastapi-router" - with patch("phalanx.agents.integration_wiring.get_db", return_value=mock_ctx), \ - patch("phalanx.agents.integration_wiring.settings") as s, \ - patch("phalanx.agents.integration_wiring.merge_workspace", return_value=tmp_path), \ - patch("phalanx.agents.integration_wiring.detect_tech_stack", return_value="fastapi"), \ - patch("phalanx.agents.integration_wiring.get_profile", return_value=mock_profile), \ - patch.object(agent, "_wire", new_callable=AsyncMock, - return_value={"status": "ok", "files_wired": ["main.py"], "notes": []}): + with ( + patch("phalanx.agents.integration_wiring.get_db", return_value=mock_ctx), + patch("phalanx.agents.integration_wiring.settings") as s, + patch("phalanx.agents.integration_wiring.merge_workspace", return_value=tmp_path), + patch("phalanx.agents.integration_wiring.detect_tech_stack", return_value="fastapi"), + patch("phalanx.agents.integration_wiring.get_profile", return_value=mock_profile), + patch.object( + agent, + "_wire", + new_callable=AsyncMock, + return_value={"status": "ok", "files_wired": ["main.py"], "notes": []}, + ), + ): s.git_workspace = str(tmp_path) result = await agent.execute() @@ -421,8 +446,10 @@ async def test_commit_to_safe_branch_push_success(tmp_path): mock_remote = MagicMock() mock_repo.remotes = [mock_remote] - with patch("git.Repo", return_value=mock_repo), \ - patch("phalanx.agents.ci_fixer.settings") as mock_settings: + with ( + patch("git.Repo", return_value=mock_repo), + patch("phalanx.agents.ci_fixer.settings") as mock_settings, + ): mock_settings.git_author_name = "FORGE" mock_settings.git_author_email = "forge@phalanx.dev" result = await agent._commit_to_safe_branch( diff --git a/tests/unit/test_coverage_boost3.py b/tests/unit/test_coverage_boost3.py index 7f759eb7..a7a7f545 100644 --- a/tests/unit/test_coverage_boost3.py +++ b/tests/unit/test_coverage_boost3.py @@ -14,7 +14,6 @@ import pytest - # ══════════════════════════════════════════════════════════════════════════════ # verification_profiles.py # ══════════════════════════════════════════════════════════════════════════════ @@ -133,7 +132,11 @@ def test_build_cmd_error_extracted(self, tmp_path): from phalanx.agents.verification_profiles import run_profile_checks profile = MagicMock() - profile.build_cmd = ["python", "-c", "import sys; print('error: build failed', file=sys.stderr); sys.exit(1)"] + profile.build_cmd = [ + "python", + "-c", + "import sys; print('error: build failed', file=sys.stderr); sys.exit(1)", + ] profile.typecheck_cmd = None profile.lint_cmd = None profile.test_cmd = None @@ -270,10 +273,10 @@ def test_run_helper_file_not_found(self, tmp_path): def test_run_helper_timeout(self, tmp_path): """_run catches TimeoutExpired.""" - from phalanx.agents.verification_profiles import _run - import subprocess + from phalanx.agents.verification_profiles import _run + with patch("subprocess.run", side_effect=subprocess.TimeoutExpired("cmd", 1)): success, stdout, stderr = _run(["sleep", "999"], tmp_path, timeout=1) @@ -367,14 +370,23 @@ async def test_ux_execute_success(): mock_ctx.__aenter__ = AsyncMock(return_value=mock_session) mock_ctx.__aexit__ = AsyncMock(return_value=None) - with patch("phalanx.agents.ux_designer.get_db", return_value=mock_ctx), \ - patch.object(agent, "_load_planner_context", new_callable=AsyncMock, return_value=""), \ - patch.object(agent, "_generate_design", new_callable=AsyncMock, return_value=mock_design_str), \ - patch.object(agent, "_self_check_design", return_value="self-check passed"), \ - patch.object(agent, "_write_design_handoff", new_callable=AsyncMock, return_value="build with modern style"), \ - patch.object(agent, "_persist_design_artifact", new_callable=AsyncMock), \ - patch.object(agent, "_trace", new_callable=AsyncMock), \ - patch("pathlib.Path.write_text"): + with ( + patch("phalanx.agents.ux_designer.get_db", return_value=mock_ctx), + patch.object(agent, "_load_planner_context", new_callable=AsyncMock, return_value=""), + patch.object( + agent, "_generate_design", new_callable=AsyncMock, return_value=mock_design_str + ), + patch.object(agent, "_self_check_design", return_value="self-check passed"), + patch.object( + agent, + "_write_design_handoff", + new_callable=AsyncMock, + return_value="build with modern style", + ), + patch.object(agent, "_persist_design_artifact", new_callable=AsyncMock), + patch.object(agent, "_trace", new_callable=AsyncMock), + patch("pathlib.Path.write_text"), + ): result = await agent.execute() assert result.success is True @@ -402,25 +414,29 @@ async def test_ux_generate_design(): mock_wo.title = "My App" mock_wo.description = "An app" - design_response = json.dumps({ - "design_spec": { - "brand": {"personality": "modern"}, - "color": {"primary": "#000"}, - "typography": {}, - "spacing": {}, - "components": {}, - "logo": "", - "ux_patterns": {}, - "accessibility": {}, - }, - "handoff_summary": "Modern design.", - }) + json.dumps( + { + "design_spec": { + "brand": {"personality": "modern"}, + "color": {"primary": "#000"}, + "typography": {}, + "spacing": {}, + "components": {}, + "logo": "", + "ux_patterns": {}, + "accessibility": {}, + }, + "handoff_summary": "Modern design.", + } + ) if hasattr(agent, "_generate_design"): mock_task = MagicMock() mock_task.title = "My App" mock_task.description = "An app" - with patch.object(agent, "_call_claude", new_callable=AsyncMock, return_value="# Design\n\nModern."): + with patch.object( + agent, "_call_claude", new_callable=AsyncMock, return_value="# Design\n\nModern." + ): result = await agent._generate_design( task=mock_task, app_type="web", @@ -502,14 +518,16 @@ async def test_release_generate_notes_valid_json(): mock_wo.title = "Feature X" mock_wo.description = "Build X" - llm_response = json.dumps({ - "title": "Release Notes: Feature X", - "summary": "X was built", - "changes": [{"type": "feat", "description": "Added X"}], - "testing": "Tests passed", - "rollback": "Revert PR", - "breaking_changes": [], - }) + llm_response = json.dumps( + { + "title": "Release Notes: Feature X", + "summary": "X was built", + "changes": [{"type": "feat", "description": "Added X"}], + "testing": "Tests passed", + "rollback": "Revert PR", + "breaking_changes": [], + } + ) with patch.object(agent, "_call_claude", return_value=llm_response): result = await agent._generate_release_notes(mock_run, mock_wo, []) diff --git a/tests/unit/test_coverage_boost4.py b/tests/unit/test_coverage_boost4.py index 03404aaa..3ab3b07a 100644 --- a/tests/unit/test_coverage_boost4.py +++ b/tests/unit/test_coverage_boost4.py @@ -15,15 +15,14 @@ - _remove_root_conftest (lines 821-830) - _derive_coverage_source (lines 900-947) """ + from __future__ import annotations -import asyncio from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest - # ── helpers ────────────────────────────────────────────────────────────────── @@ -191,9 +190,7 @@ class TestDeriveCoverageSource: def test_shared_top_level_dir(self, tmp_path): agent = _make_qa_agent(tmp_path) (tmp_path / "app").mkdir() # must exist as dir - context = { - "changed_files": ["app/routes.py", "app/models.py", "app/utils.py"] - } + context = {"changed_files": ["app/routes.py", "app/models.py", "app/utils.py"]} result = agent._derive_coverage_source(context) assert result == "app" @@ -518,7 +515,9 @@ def test_basic_evidence(self, tmp_path): from phalanx.agents.qa import LintResult, QAOutcome, TestSuiteResult agent = _make_qa_agent(tmp_path) - suite = TestSuiteResult(name="test", total=5, passed=5, failed=0, errored=0, skipped=0, duration_seconds=1.0) + suite = TestSuiteResult( + name="test", total=5, passed=5, failed=0, errored=0, skipped=0, duration_seconds=1.0 + ) lint = LintResult(tool="ruff", passed=True, violation_count=0, output="") evidence = agent._build_evidence([suite], None, [lint], QAOutcome.PASSED) assert evidence["gate"] == "qa" @@ -529,9 +528,15 @@ def test_evidence_with_coverage(self, tmp_path): from phalanx.agents.qa import CoverageResult, QAOutcome, TestSuiteResult agent = _make_qa_agent(tmp_path) - suite = TestSuiteResult(name="test", total=3, passed=3, failed=0, errored=0, skipped=0, duration_seconds=0.5) + suite = TestSuiteResult( + name="test", total=3, passed=3, failed=0, errored=0, skipped=0, duration_seconds=0.5 + ) cov = CoverageResult( - line_coverage_pct=80.0, branch_coverage_pct=None, threshold=70.0, threshold_met=True, modules_below_threshold=[] + line_coverage_pct=80.0, + branch_coverage_pct=None, + threshold=70.0, + threshold_met=True, + modules_below_threshold=[], ) evidence = agent._build_evidence([suite], cov, [], QAOutcome.PASSED) assert evidence["summary"]["coverage_pct"] == 80.0 @@ -572,7 +577,9 @@ async def test_persist_artifact_success(tmp_path): ) mock_session = AsyncMock() - mock_session.execute = AsyncMock(return_value=MagicMock(scalar_one=MagicMock(return_value="proj-1"))) + mock_session.execute = AsyncMock( + return_value=MagicMock(scalar_one=MagicMock(return_value="proj-1")) + ) mock_session.add = MagicMock() mock_session.commit = AsyncMock() mock_ctx = AsyncMock() diff --git a/tests/unit/test_coverage_boost5.py b/tests/unit/test_coverage_boost5.py index b19886dc..9eea8708 100644 --- a/tests/unit/test_coverage_boost5.py +++ b/tests/unit/test_coverage_boost5.py @@ -9,15 +9,16 @@ - phalanx/memory/assembler.py — MemoryAssembler.build() - phalanx/memory/reader.py — MemoryReader methods """ + from __future__ import annotations +import contextlib from datetime import UTC, datetime from unittest.mock import AsyncMock, MagicMock, patch from uuid import uuid4 import pytest - # ══════════════════════════════════════════════════════════════════════════════ # memory/assembler.py # ══════════════════════════════════════════════════════════════════════════════ @@ -32,7 +33,15 @@ def _make_decision(self, title="Decision", decision="Do X", rationale="Because Y d.rejected_alternatives = alts or [] return d - def _make_fact(self, fact_type="tech", title="Fact", body="body", confidence=1.0, relevance=0.9, is_standing=True): + def _make_fact( + self, + fact_type="tech", + title="Fact", + body="body", + confidence=1.0, + relevance=0.9, + is_standing=True, + ): f = MagicMock() f.fact_type = fact_type f.title = title @@ -52,7 +61,9 @@ def test_build_with_decisions(self): from phalanx.memory.assembler import MemoryAssembler a = MemoryAssembler(max_tokens=4000) - d = self._make_decision("Use Postgres", "PostgreSQL as primary DB", "Proven at scale", ["MySQL", "SQLite"]) + d = self._make_decision( + "Use Postgres", "PostgreSQL as primary DB", "Proven at scale", ["MySQL", "SQLite"] + ) result = a.build(decisions=[d]) assert "Use Postgres" in result assert "Project Memory" in result @@ -226,7 +237,7 @@ def _make_ci_integration_obj(): @pytest.mark.asyncio async def test_register_integration_create(): - from phalanx.api.routes.ci_integrations import register_integration, CIIntegrationCreate + from phalanx.api.routes.ci_integrations import CIIntegrationCreate, register_integration body = CIIntegrationCreate( repo_full_name="acme/backend", @@ -236,7 +247,9 @@ async def test_register_integration_create(): obj = _make_ci_integration_obj() mock_session = AsyncMock() - mock_session.execute = AsyncMock(return_value=MagicMock(scalar_one_or_none=MagicMock(return_value=None))) + mock_session.execute = AsyncMock( + return_value=MagicMock(scalar_one_or_none=MagicMock(return_value=None)) + ) mock_session.add = MagicMock() mock_session.commit = AsyncMock() mock_session.refresh = AsyncMock(side_effect=lambda x: None) @@ -245,7 +258,6 @@ async def test_register_integration_create(): mock_ctx.__aexit__ = AsyncMock(return_value=None) # refresh won't return an obj with attributes — so we mock the return value - refreshed = obj mock_session.refresh = AsyncMock(return_value=None) # patch get_db AND capture the integration that was added captured = {} @@ -265,17 +277,15 @@ async def fake_add_and_refresh(x=None): with patch("phalanx.api.routes.ci_integrations.get_db", return_value=mock_ctx): # This will fail at refresh since the session is mocked # Use a simpler approach: just call the route function and catch the error - try: + with contextlib.suppress(Exception): await register_integration(body) - except Exception: - pass mock_session.commit.assert_awaited() @pytest.mark.asyncio async def test_register_integration_update_existing(): - from phalanx.api.routes.ci_integrations import register_integration, CIIntegrationCreate + from phalanx.api.routes.ci_integrations import CIIntegrationCreate, register_integration body = CIIntegrationCreate(repo_full_name="acme/backend", github_token="new_token") existing = _make_ci_integration_obj() @@ -291,10 +301,8 @@ async def test_register_integration_update_existing(): mock_ctx.__aexit__ = AsyncMock(return_value=None) with patch("phalanx.api.routes.ci_integrations.get_db", return_value=mock_ctx): - try: + with contextlib.suppress(Exception): await register_integration(body) - except Exception: - pass assert existing.github_token == "new_token" @@ -337,6 +345,7 @@ async def test_get_integration_found(): @pytest.mark.asyncio async def test_get_integration_not_found(): from fastapi import HTTPException + from phalanx.api.routes.ci_integrations import get_integration mock_session = AsyncMock() @@ -355,7 +364,8 @@ async def test_get_integration_not_found(): @pytest.mark.asyncio async def test_update_integration_not_found(): from fastapi import HTTPException - from phalanx.api.routes.ci_integrations import update_integration, CIIntegrationUpdate + + from phalanx.api.routes.ci_integrations import CIIntegrationUpdate, update_integration mock_session = AsyncMock() mock_session.get = AsyncMock(return_value=None) @@ -372,7 +382,7 @@ async def test_update_integration_not_found(): @pytest.mark.asyncio async def test_update_integration_success(): - from phalanx.api.routes.ci_integrations import update_integration, CIIntegrationUpdate + from phalanx.api.routes.ci_integrations import CIIntegrationUpdate, update_integration obj = _make_ci_integration_obj() mock_session = AsyncMock() @@ -385,10 +395,8 @@ async def test_update_integration_success(): update = CIIntegrationUpdate(enabled=False, max_attempts=3, auto_commit=False) with patch("phalanx.api.routes.ci_integrations.get_db", return_value=mock_ctx): - try: + with contextlib.suppress(Exception): await update_integration(obj.id, update) - except Exception: - pass assert obj.enabled is False assert obj.max_attempts == 3 @@ -397,6 +405,7 @@ async def test_update_integration_success(): @pytest.mark.asyncio async def test_delete_integration_not_found(): from fastapi import HTTPException + from phalanx.api.routes.ci_integrations import delete_integration mock_session = AsyncMock() diff --git a/tests/unit/test_coverage_push.py b/tests/unit/test_coverage_push.py index a71fc11e..6b77cf20 100644 --- a/tests/unit/test_coverage_push.py +++ b/tests/unit/test_coverage_push.py @@ -14,7 +14,6 @@ from __future__ import annotations import json -from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -42,12 +41,12 @@ def test_plain_json(self): assert result == {"key": "value"} def test_strips_code_fences(self): - text = "```json\n{\"key\": \"value\"}\n```" + text = '```json\n{"key": "value"}\n```' result = _extract_json(text) assert result == {"key": "value"} def test_strips_plain_backtick_fences(self): - text = "```\n{\"key\": \"value\"}\n```" + text = '```\n{"key": "value"}\n```' result = _extract_json(text) assert result == {"key": "value"} @@ -91,6 +90,7 @@ def test_out_of_range_defaults_30(self): def _make_ci_agent(): from phalanx.agents.ci_fixer import CIFixerAgent + with patch("phalanx.agents.base.BaseAgent.__init__", return_value=None): agent = CIFixerAgent.__new__(CIFixerAgent) agent.ci_fix_run_id = "run-cov-001" @@ -123,6 +123,7 @@ async def test_comment_on_pr_success(): ci_run.branch = "main" from phalanx.ci_fixer.log_parser import ParsedLog + parsed = ParsedLog(tool="ruff") mock_client = _mock_http_client(201, {"id": 99}) @@ -152,6 +153,7 @@ async def test_comment_on_pr_failure_does_not_raise(): ci_run.branch = "main" from phalanx.ci_fixer.log_parser import ParsedLog + parsed = ParsedLog(tool="ruff") mock_client = _mock_http_client(403) @@ -295,8 +297,10 @@ async def test_fetch_logs_calls_fetcher(): mock_ctx.__aenter__ = AsyncMock(return_value=mock_session) mock_ctx.__aexit__ = AsyncMock(return_value=None) - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch("phalanx.agents.ci_fixer.get_log_fetcher", return_value=mock_fetcher): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch("phalanx.agents.ci_fixer.get_log_fetcher", return_value=mock_fetcher), + ): result = await agent._fetch_logs(event, integration) assert result == "raw log content" @@ -332,8 +336,10 @@ async def test_fetch_logs_returns_fallback_on_error(): mock_ctx.__aenter__ = AsyncMock(return_value=mock_session) mock_ctx.__aexit__ = AsyncMock(return_value=None) - with patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), \ - patch("phalanx.agents.ci_fixer.get_log_fetcher", return_value=mock_fetcher): + with ( + patch("phalanx.agents.ci_fixer.get_db", return_value=mock_ctx), + patch("phalanx.agents.ci_fixer.get_log_fetcher", return_value=mock_fetcher), + ): result = await agent._fetch_logs(event, integration) # Falls back to cached failure_summary @@ -347,14 +353,14 @@ async def test_fetch_logs_returns_fallback_on_error(): async def test_clone_repo_gitpython_missing(tmp_path): agent = _make_ci_agent() - with patch("phalanx.agents.ci_fixer.CIFixerAgent._clone_repo", - new_callable=AsyncMock) as mock_clone: + with patch("phalanx.agents.ci_fixer.CIFixerAgent._clone_repo", new_callable=AsyncMock): # Simulate ImportError path (gitpython not available) # Test directly by patching the import inside the method pass # Test directly without patching the method itself import builtins + real_import = builtins.__import__ def mock_import(name, *args, **kwargs): @@ -372,8 +378,11 @@ def mock_import(name, *args, **kwargs): async def test_clone_repo_exception_returns_false(tmp_path): agent = _make_ci_agent() - with patch("phalanx.agents.ci_fixer.CIFixerAgent._clone_repo", - new_callable=AsyncMock, return_value=False) as mock_clone: + with patch( + "phalanx.agents.ci_fixer.CIFixerAgent._clone_repo", + new_callable=AsyncMock, + return_value=False, + ): result = await agent._clone_repo(tmp_path, "acme/backend", "main", "abc123", "token") assert result is False @@ -397,7 +406,7 @@ def test_ruff_with_no_line_number(self): assert not result.lint_errors def test_mypy_error_format(self): - log = "src/foo.py:10: error: Argument 1 to \"foo\" has incompatible type\n" + log = 'src/foo.py:10: error: Argument 1 to "foo" has incompatible type\n' result = parse_log(log) # mypy errors should be parsed assert result.tool in ("mypy", "unknown") or len(result.type_errors) >= 0 @@ -428,14 +437,17 @@ def test_parsed_log_as_text(self): def test_has_errors_false_when_empty(self): from phalanx.ci_fixer.log_parser import ParsedLog + p = ParsedLog(tool="unknown") assert not p.has_errors def test_has_errors_true_with_lint_error(self): from phalanx.ci_fixer.log_parser import LintError, ParsedLog - p = ParsedLog(tool="ruff", lint_errors=[ - LintError(file="f.py", line=1, col=1, code="F401", message="x") - ]) + + p = ParsedLog( + tool="ruff", + lint_errors=[LintError(file="f.py", line=1, col=1, code="F401", message="x")], + ) assert p.has_errors @@ -445,12 +457,14 @@ def test_has_errors_true_with_lint_error(self): class TestAnalystEdgeCases: def test_read_files_shim_no_files(self, tmp_path): from phalanx.ci_fixer.analyst import RootCauseAnalyst + analyst = RootCauseAnalyst(call_llm=lambda **_: "") result = analyst._read_files(tmp_path, []) assert "no files found" in result.lower() or isinstance(result, str) def test_read_files_shim_missing_file(self, tmp_path): from phalanx.ci_fixer.analyst import RootCauseAnalyst + analyst = RootCauseAnalyst(call_llm=lambda **_: "") result = analyst._read_files(tmp_path, ["nonexistent.py"]) assert isinstance(result, str) @@ -458,6 +472,7 @@ def test_read_files_shim_missing_file(self, tmp_path): def test_analyze_with_no_errors_returns_low_confidence(self, tmp_path): from phalanx.ci_fixer.analyst import RootCauseAnalyst from phalanx.ci_fixer.log_parser import ParsedLog + analyst = RootCauseAnalyst(call_llm=lambda **_: "{}") plan = analyst.analyze(ParsedLog(tool="unknown"), tmp_path) assert plan.confidence == "low" @@ -475,7 +490,7 @@ def bad_llm(**_): analyst = RootCauseAnalyst(call_llm=bad_llm) parsed = ParsedLog( tool="ruff", - lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="x")] + lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="x")], ) plan = analyst.analyze(parsed, tmp_path) assert plan.confidence == "low" @@ -490,7 +505,7 @@ def test_analyze_malformed_json_returns_low_confidence(self, tmp_path): analyst = RootCauseAnalyst(call_llm=lambda **_: "not json at all") parsed = ParsedLog( tool="ruff", - lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="x")] + lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="x")], ) plan = analyst.analyze(parsed, tmp_path) assert plan.confidence == "low" @@ -503,6 +518,7 @@ class TestValidatorEdgeCases: def test_validate_unknown_tool(self, tmp_path): from phalanx.ci_fixer.log_parser import ParsedLog from phalanx.ci_fixer.validator import validate_fix + parsed = ParsedLog(tool="unknown_tool") result = validate_fix(parsed, tmp_path) # Unknown tool → should pass or return a graceful result @@ -511,9 +527,10 @@ def test_validate_unknown_tool(self, tmp_path): def test_validate_ruff_with_empty_workspace(self, tmp_path): from phalanx.ci_fixer.log_parser import LintError, ParsedLog from phalanx.ci_fixer.validator import validate_fix + parsed = ParsedLog( tool="ruff", - lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="x")] + lint_errors=[LintError(file="src/foo.py", line=1, col=1, code="F401", message="x")], ) # Run ruff against empty workspace — ruff not installed in test env → graceful result = validate_fix(parsed, tmp_path) @@ -523,9 +540,10 @@ def test_validate_ruff_with_empty_workspace(self, tmp_path): def test_validate_mypy_with_empty_workspace(self, tmp_path): from phalanx.ci_fixer.log_parser import ParsedLog, TypeError from phalanx.ci_fixer.validator import validate_fix + parsed = ParsedLog( tool="mypy", - type_errors=[TypeError(file="src/foo.py", line=1, col=0, message="type error")] + type_errors=[TypeError(file="src/foo.py", line=1, col=0, message="type error")], ) result = validate_fix(parsed, tmp_path) assert hasattr(result, "passed") @@ -533,13 +551,16 @@ def test_validate_mypy_with_empty_workspace(self, tmp_path): def test_validate_pytest_with_empty_workspace(self, tmp_path): from phalanx.ci_fixer.log_parser import ParsedLog, TestFailure from phalanx.ci_fixer.validator import validate_fix + parsed = ParsedLog( tool="pytest", - test_failures=[TestFailure( - test_id="tests/test_foo.py::test_bar", - file="tests/test_foo.py", - message="AssertionError" - )] + test_failures=[ + TestFailure( + test_id="tests/test_foo.py::test_bar", + file="tests/test_foo.py", + message="AssertionError", + ) + ], ) result = validate_fix(parsed, tmp_path) assert hasattr(result, "passed") diff --git a/tests/unit/test_log_parser_unit.py b/tests/unit/test_log_parser_unit.py index 59cf33ac..3cfa856c 100644 --- a/tests/unit/test_log_parser_unit.py +++ b/tests/unit/test_log_parser_unit.py @@ -7,8 +7,6 @@ from __future__ import annotations -import pytest - from phalanx.ci_fixer.log_parser import ( ParsedLog, clean_log, @@ -107,8 +105,7 @@ def test_parses_mypy_error(self): def test_multiple_mypy_errors(self): log = ( - "src/foo.py:10: error: Item has no attribute\n" - "src/bar.py:20: error: Argument of type\n" + "src/foo.py:10: error: Item has no attribute\nsrc/bar.py:20: error: Argument of type\n" ) parsed = parse_log(log) assert len(parsed.type_errors) == 2 diff --git a/tests/unit/test_outcome_tracker_unit.py b/tests/unit/test_outcome_tracker_unit.py index 58d5ecf1..debd0be1 100644 --- a/tests/unit/test_outcome_tracker_unit.py +++ b/tests/unit/test_outcome_tracker_unit.py @@ -6,7 +6,6 @@ from __future__ import annotations -import json import uuid from datetime import UTC, datetime, timedelta from unittest.mock import AsyncMock, MagicMock, patch @@ -24,7 +23,6 @@ ) from phalanx.db.models import CIFailureFingerprint, CIFixOutcome, CIFixRun - # ── Helpers ──────────────────────────────────────────────────────────────────── @@ -87,8 +85,14 @@ async def test_check_pr_outcome_merged(): mock_client.__aexit__ = AsyncMock(return_value=None) mock_client.get = AsyncMock(return_value=mock_response) - with patch("phalanx.ci_fixer.outcome_tracker._get_github_token", new_callable=AsyncMock, return_value="ghp_token"), \ - patch("httpx.AsyncClient", return_value=mock_client): + with ( + patch( + "phalanx.ci_fixer.outcome_tracker._get_github_token", + new_callable=AsyncMock, + return_value="ghp_token", + ), + patch("httpx.AsyncClient", return_value=mock_client), + ): result = await _check_pr_outcome(run) assert result["outcome"] == "merged" @@ -116,8 +120,14 @@ async def test_check_pr_outcome_closed_unmerged(): mock_client.__aexit__ = AsyncMock(return_value=None) mock_client.get = AsyncMock(return_value=mock_response) - with patch("phalanx.ci_fixer.outcome_tracker._get_github_token", new_callable=AsyncMock, return_value="ghp_token"), \ - patch("httpx.AsyncClient", return_value=mock_client): + with ( + patch( + "phalanx.ci_fixer.outcome_tracker._get_github_token", + new_callable=AsyncMock, + return_value="ghp_token", + ), + patch("httpx.AsyncClient", return_value=mock_client), + ): result = await _check_pr_outcome(run) assert result["outcome"] == "closed_unmerged" @@ -144,8 +154,14 @@ async def test_check_pr_outcome_open(): mock_client.__aexit__ = AsyncMock(return_value=None) mock_client.get = AsyncMock(return_value=mock_response) - with patch("phalanx.ci_fixer.outcome_tracker._get_github_token", new_callable=AsyncMock, return_value="ghp_token"), \ - patch("httpx.AsyncClient", return_value=mock_client): + with ( + patch( + "phalanx.ci_fixer.outcome_tracker._get_github_token", + new_callable=AsyncMock, + return_value="ghp_token", + ), + patch("httpx.AsyncClient", return_value=mock_client), + ): result = await _check_pr_outcome(run) assert result["outcome"] == "open" @@ -164,8 +180,14 @@ async def test_check_pr_outcome_not_found(): mock_client.__aexit__ = AsyncMock(return_value=None) mock_client.get = AsyncMock(return_value=mock_response) - with patch("phalanx.ci_fixer.outcome_tracker._get_github_token", new_callable=AsyncMock, return_value="ghp_token"), \ - patch("httpx.AsyncClient", return_value=mock_client): + with ( + patch( + "phalanx.ci_fixer.outcome_tracker._get_github_token", + new_callable=AsyncMock, + return_value="ghp_token", + ), + patch("httpx.AsyncClient", return_value=mock_client), + ): result = await _check_pr_outcome(run) assert result["outcome"] == "not_found" @@ -176,7 +198,11 @@ async def test_check_pr_outcome_no_token(): """No GitHub token → returns 'open' without calling GitHub.""" run = _make_run() - with patch("phalanx.ci_fixer.outcome_tracker._get_github_token", new_callable=AsyncMock, return_value=None): + with patch( + "phalanx.ci_fixer.outcome_tracker._get_github_token", + new_callable=AsyncMock, + return_value=None, + ): result = await _check_pr_outcome(run) assert result["outcome"] == "open" @@ -187,8 +213,14 @@ async def test_check_pr_outcome_network_error(): """Network error → returns 'open' without raising.""" run = _make_run() - with patch("phalanx.ci_fixer.outcome_tracker._get_github_token", new_callable=AsyncMock, return_value="ghp_token"), \ - patch("httpx.AsyncClient", side_effect=Exception("connection refused")): + with ( + patch( + "phalanx.ci_fixer.outcome_tracker._get_github_token", + new_callable=AsyncMock, + return_value="ghp_token", + ), + patch("httpx.AsyncClient", side_effect=Exception("connection refused")), + ): result = await _check_pr_outcome(run) assert result["outcome"] == "open" @@ -214,8 +246,12 @@ async def test_record_outcome_writes_row(): await _record_outcome( run, poll_number=1, - outcome={"outcome": "merged", "pr_state": "closed", - "merged_at": datetime.now(UTC), "closed_at": None}, + outcome={ + "outcome": "merged", + "pr_state": "closed", + "merged_at": datetime.now(UTC), + "closed_at": None, + }, ) mock_session.add.assert_called_once() @@ -333,12 +369,21 @@ async def test_process_run_poll1_due(): run = _make_run(created_hours_ago=5.0) now = datetime.now(UTC) - with patch("phalanx.ci_fixer.outcome_tracker._check_pr_outcome", new_callable=AsyncMock) as mock_check, \ - patch("phalanx.ci_fixer.outcome_tracker._record_outcome", new_callable=AsyncMock) as mock_record, \ - patch("phalanx.ci_fixer.outcome_tracker._update_fingerprint", new_callable=AsyncMock) as mock_update, \ - patch("phalanx.ci_fixer.outcome_tracker._mark_outcome_checked", new_callable=AsyncMock) as mock_mark, \ - patch("phalanx.ci_fixer.outcome_tracker.get_db") as mock_db: - + with ( + patch( + "phalanx.ci_fixer.outcome_tracker._check_pr_outcome", new_callable=AsyncMock + ) as mock_check, + patch( + "phalanx.ci_fixer.outcome_tracker._record_outcome", new_callable=AsyncMock + ) as mock_record, + patch( + "phalanx.ci_fixer.outcome_tracker._update_fingerprint", new_callable=AsyncMock + ) as mock_update, + patch( + "phalanx.ci_fixer.outcome_tracker._mark_outcome_checked", new_callable=AsyncMock + ) as mock_mark, + patch("phalanx.ci_fixer.outcome_tracker.get_db") as mock_db, + ): # No polls done yet mock_result = MagicMock() mock_result.all.return_value = [] @@ -350,8 +395,10 @@ async def test_process_run_poll1_due(): mock_db.return_value = mock_ctx mock_check.return_value = { - "outcome": "merged", "pr_state": "closed", - "merged_at": datetime.now(UTC), "closed_at": None + "outcome": "merged", + "pr_state": "closed", + "merged_at": datetime.now(UTC), + "closed_at": None, } await _process_run(run, now) @@ -379,7 +426,9 @@ async def test_process_run_all_polls_done(): mock_ctx.__aexit__ = AsyncMock(return_value=None) mock_db.return_value = mock_ctx - with patch("phalanx.ci_fixer.outcome_tracker._check_pr_outcome", new_callable=AsyncMock) as mock_check: + with patch( + "phalanx.ci_fixer.outcome_tracker._check_pr_outcome", new_callable=AsyncMock + ) as mock_check: await _process_run(run, now) # Nothing new to check @@ -393,7 +442,9 @@ async def test_process_run_no_created_at(): run.created_at = None now = datetime.now(UTC) - with patch("phalanx.ci_fixer.outcome_tracker._check_pr_outcome", new_callable=AsyncMock) as mock_check: + with patch( + "phalanx.ci_fixer.outcome_tracker._check_pr_outcome", new_callable=AsyncMock + ) as mock_check: await _process_run(run, now) mock_check.assert_not_called() @@ -441,8 +492,10 @@ async def broken_process(run, now): call_count["n"] += 1 raise RuntimeError("simulated DB error") - with patch("phalanx.ci_fixer.outcome_tracker.get_db", return_value=mock_ctx), \ - patch("phalanx.ci_fixer.outcome_tracker._process_run", side_effect=broken_process): + with ( + patch("phalanx.ci_fixer.outcome_tracker.get_db", return_value=mock_ctx), + patch("phalanx.ci_fixer.outcome_tracker._process_run", side_effect=broken_process), + ): await _poll_all_pending() # Both runs were attempted despite the first one failing diff --git a/tests/unit/test_sandbox_pool.py b/tests/unit/test_sandbox_pool.py new file mode 100644 index 00000000..10bf3dca --- /dev/null +++ b/tests/unit/test_sandbox_pool.py @@ -0,0 +1,887 @@ +""" +Tests for phalanx.ci_fixer.sandbox_pool — SandboxPool, PooledContainer, +get_sandbox_pool, wrap_cmd_for_container, wrap_shell_cmd_for_container. + +Coverage targets: + - SandboxPool._warmup(): min_size=0 (skip), min_size>0 (starts containers) + - SandboxPool.checkout(): happy path, timeout, health check fail + retry + - SandboxPool.checkin(): reset ok → re-enqueue; reset fail → replace; unhealthy after reset → replace + - SandboxPool.borrow(): context manager guarantees checkin on raise + - SandboxPool.shutdown(): drains queues, kills checked-out containers + - SandboxPool._reaper_loop(): kills stale checked-out containers + - SandboxPool._resolve_image(): preferred present → preferred; preferred absent → fallback + - SandboxPool._start_and_enqueue(): pool full → kills extra container + - SandboxUnavailableError raised when pool for unknown stack + - get_sandbox_pool(): lazy singleton, returns same instance on repeat calls + - reset_pool_for_testing(): clears singleton + - wrap_cmd_for_container(): correct docker exec prefix + - wrap_shell_cmd_for_container(): correct sh -c wrapping +""" + +from __future__ import annotations + +import asyncio +import time +from typing import TYPE_CHECKING +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from phalanx.ci_fixer.sandbox_pool import ( + PooledContainer, + SandboxPool, + SandboxUnavailableError, + get_sandbox_pool, + reset_pool_for_testing, + wrap_cmd_for_container, + wrap_shell_cmd_for_container, +) + +# ── Helpers ─────────────────────────────────────────────────────────────────── + + +def _make_container( + container_id: str = "abc123", + stack: str = "python", + image: str = "phalanx-sandbox-python:latest", + checked_out_seconds_ago: float = 0, +) -> PooledContainer: + c = PooledContainer(container_id=container_id, stack=stack, image=image) + c.checked_out_at = time.monotonic() - checked_out_seconds_ago + return c + + +def _make_proc(returncode: int = 0, stdout: bytes = b"ok", stderr: bytes = b"") -> MagicMock: + proc = MagicMock() + proc.returncode = returncode + proc.communicate = AsyncMock(return_value=(stdout, stderr)) + return proc + + +def _mock_settings( + min_size: int = 1, + max_size: int = 2, + checkout_timeout: int = 5, + max_hold: int = 300, + reaper_interval: int = 60, + docker_cmd: str = "docker", +): + s = MagicMock() + s.sandbox_pool_min_size = min_size + s.sandbox_pool_max_size = max_size + s.sandbox_checkout_timeout_seconds = checkout_timeout + s.sandbox_max_hold_seconds = max_hold + s.sandbox_reaper_interval_seconds = reaper_interval + s.sandbox_docker_cmd = docker_cmd + return s + + +# ── wrap helpers ────────────────────────────────────────────────────────────── + + +class TestWrapHelpers: + def test_wrap_cmd_for_container(self): + result = wrap_cmd_for_container("ctr123", ["ruff", "check", "."], "/workspace") + assert result == ["docker", "exec", "-w", "/workspace", "ctr123", "ruff", "check", "."] + + def test_wrap_cmd_custom_docker_cmd(self): + result = wrap_cmd_for_container( + "ctr123", ["go", "test", "./..."], "/ws", docker_cmd="podman" + ) + assert result[0] == "podman" + assert "ctr123" in result + + def test_wrap_shell_cmd_for_container(self): + result = wrap_shell_cmd_for_container("ctr123", "ruff check .") + assert result == [ + "docker", + "exec", + "-w", + "/workspace", + "ctr123", + "sh", + "-c", + "ruff check .", + ] + + def test_wrap_shell_cmd_custom_docker(self): + result = wrap_shell_cmd_for_container("ctr456", "npm test", docker_cmd="podman") + assert result[0] == "podman" + assert "sh" in result + assert "npm test" in result + + +# ── PooledContainer ─────────────────────────────────────────────────────────── + + +class TestPooledContainer: + def test_defaults(self): + c = PooledContainer(container_id="abc", stack="python", image="img:latest") + assert c.healthy is True + assert c.container_id == "abc" + assert isinstance(c.checked_out_at, float) + + def test_fields(self): + c = _make_container(container_id="xyz", stack="go", image="golang:1.22-alpine") + assert c.stack == "go" + assert c.image == "golang:1.22-alpine" + + +# ── SandboxPool._warmup ─────────────────────────────────────────────────────── + + +class TestSandboxPoolWarmup: + @pytest.mark.asyncio + async def test_warmup_min_size_zero_skips(self): + """min_size=0 → no containers started, queues initialised empty.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + assert "python" in pool._queues + assert pool._queues["python"].qsize() == 0 + assert pool._reaper_task is None + + @pytest.mark.asyncio + async def test_warmup_starts_containers(self): + """min_size=1 → _start_and_enqueue called for each stack.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=1, max_size=2) + + start_calls = [] + + async def fake_start_and_enqueue(stack): + container = _make_container(container_id=f"ctr-{stack}", stack=stack) + await pool._queues[stack].put(container) + start_calls.append(stack) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_start_and_enqueue", side_effect=fake_start_and_enqueue): + with patch.object(pool, "_reaper_loop", new_callable=AsyncMock): + await pool._warmup() + + assert len(start_calls) >= 1 + # Reaper task should have been created + assert pool._reaper_task is not None + pool._reaper_task.cancel() + + @pytest.mark.asyncio + async def test_warmup_errors_swallowed(self): + """Errors during warmup don't raise — pool starts empty.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=1) + + async def failing_start(stack): + raise RuntimeError("docker not found") + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_start_and_enqueue", side_effect=failing_start): + with patch.object(pool, "_reaper_loop", new_callable=AsyncMock): + await pool._warmup() # should not raise + + # Queues exist but are empty + assert pool._queues["python"].qsize() == 0 + + +# ── SandboxPool.checkout ────────────────────────────────────────────────────── + + +class TestSandboxPoolCheckout: + @pytest.mark.asyncio + async def test_checkout_happy_path(self): + """Container in queue → returned immediately, removed from queue.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0, checkout_timeout=5) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + container = _make_container(container_id="ctr1", stack="python") + await pool._queues["python"].put(container) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_health_check", return_value=True): + with patch.object(pool, "_refill", new_callable=AsyncMock): + result = await pool.checkout("python", timeout=5) + + assert result.container_id == "ctr1" + assert "ctr1" in pool._checked_out + + @pytest.mark.asyncio + async def test_checkout_timeout_raises(self): + """Empty queue + short timeout → SandboxUnavailableError.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with pytest.raises(SandboxUnavailableError): + await pool.checkout("python", timeout=1) + + @pytest.mark.asyncio + async def test_checkout_unknown_stack_raises(self): + """Stack not in pool → SandboxUnavailableError immediately.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + with pytest.raises(SandboxUnavailableError, match="no pool"): + await pool.checkout("cobol", timeout=1) + + @pytest.mark.asyncio + async def test_checkout_unhealthy_container_triggers_retry(self): + """Unhealthy container is killed, fresh one started, retry succeeds.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0, checkout_timeout=5) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + bad = _make_container("bad-ctr", "python") + good = _make_container("good-ctr", "python") + await pool._queues["python"].put(bad) + + health_calls = [] + + async def fake_health(c): + health_calls.append(c.container_id) + return c.container_id == "good-ctr" + + async def fake_kill(cid): + pass + + async def fake_start_enqueue(stack): + await pool._queues[stack].put(good) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_health_check", side_effect=fake_health): + with patch.object(pool, "_kill_container", side_effect=fake_kill): + with patch.object(pool, "_start_and_enqueue", side_effect=fake_start_enqueue): + with patch.object(pool, "_refill", new_callable=AsyncMock): + result = await pool.checkout("python", timeout=5) + + assert result.container_id == "good-ctr" + assert "bad-ctr" in health_calls + + @pytest.mark.asyncio + async def test_checkout_refill_triggered(self): + """checkout() triggers _refill as a background task that eventually runs.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + container = _make_container("ctr1", "go") + await pool._queues["go"].put(container) + + refill_calls = [] + refill_event = asyncio.Event() + + async def fake_refill(stack): + refill_calls.append(stack) + refill_event.set() + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_health_check", return_value=True): + with patch.object(pool, "_refill", side_effect=fake_refill): + await pool.checkout("go", timeout=5) + # Give the background task a chance to run + await asyncio.wait_for(refill_event.wait(), timeout=2) + + assert "go" in refill_calls + + +# ── SandboxPool.checkin ─────────────────────────────────────────────────────── + + +class TestSandboxPoolCheckin: + @pytest.mark.asyncio + async def test_checkin_re_enqueues_after_reset(self): + """Reset succeeds + health ok → container back in queue.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + container = _make_container("ctr1", "python") + pool._checked_out["ctr1"] = container + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_reset_container", return_value=True): + with patch.object(pool, "_health_check", return_value=True): + await pool.checkin(container) + + assert pool._queues["python"].qsize() == 1 + assert "ctr1" not in pool._checked_out + + @pytest.mark.asyncio + async def test_checkin_reset_fails_replaces_container(self): + """Reset fails → container killed, new one started asynchronously.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + container = _make_container("bad-ctr", "python") + pool._checked_out["bad-ctr"] = container + + kill_calls = [] + start_calls = [] + start_event = asyncio.Event() + + async def fake_kill(cid): + kill_calls.append(cid) + + async def fake_start(stack): + start_calls.append(stack) + start_event.set() + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_reset_container", return_value=False): + with patch.object(pool, "_kill_container", side_effect=fake_kill): + with patch.object(pool, "_start_and_enqueue", side_effect=fake_start): + await pool.checkin(container) + await asyncio.wait_for(start_event.wait(), timeout=2) + + assert "bad-ctr" in kill_calls + assert "python" in start_calls + assert pool._queues["python"].qsize() == 0 # no re-enqueue + + @pytest.mark.asyncio + async def test_checkin_unhealthy_after_reset_replaces(self): + """Reset ok but health check fails → kill and replace.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + container = _make_container("sick-ctr", "go") + pool._checked_out["sick-ctr"] = container + + kill_calls = [] + start_calls = [] + start_event = asyncio.Event() + + async def fake_kill(cid): + kill_calls.append(cid) + + async def fake_start(stack): + start_calls.append(stack) + start_event.set() + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_reset_container", return_value=True): + with patch.object(pool, "_health_check", return_value=False): + with patch.object(pool, "_kill_container", side_effect=fake_kill): + with patch.object(pool, "_start_and_enqueue", side_effect=fake_start): + await pool.checkin(container) + await asyncio.wait_for(start_event.wait(), timeout=2) + + assert "sick-ctr" in kill_calls + assert pool._queues["go"].qsize() == 0 + + @pytest.mark.asyncio + async def test_checkin_during_shutdown_kills_container(self): + """When pool is shutting down, checked-in container is killed not re-queued.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + pool._shutdown = True + container = _make_container("ctr1", "python") + + kill_calls = [] + + async def fake_kill(cid): + kill_calls.append(cid) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_kill_container", side_effect=fake_kill): + await pool.checkin(container) + + assert "ctr1" in kill_calls + assert pool._queues["python"].qsize() == 0 + + +# ── SandboxPool.borrow ──────────────────────────────────────────────────────── + + +class TestSandboxPoolBorrow: + @pytest.mark.asyncio + async def test_borrow_checks_in_on_success(self): + """borrow() context manager checks container back in after normal exit.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + container = _make_container("ctr1", "python") + await pool._queues["python"].put(container) + + checkin_calls = [] + + async def fake_checkin(c): + checkin_calls.append(c.container_id) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_health_check", return_value=True): + with patch.object(pool, "_refill", new_callable=AsyncMock): + with patch.object(pool, "checkin", side_effect=fake_checkin): + async with pool.borrow("python", timeout=5) as borrowed: + assert borrowed.container_id == "ctr1" + + assert "ctr1" in checkin_calls + + @pytest.mark.asyncio + async def test_borrow_checks_in_on_exception(self): + """borrow() guarantees checkin even when the body raises.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + container = _make_container("ctr1", "python") + await pool._queues["python"].put(container) + + checkin_calls = [] + + async def fake_checkin(c): + checkin_calls.append(c.container_id) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_health_check", return_value=True): + with patch.object(pool, "_refill", new_callable=AsyncMock): + with patch.object(pool, "checkin", side_effect=fake_checkin): + with pytest.raises(ValueError): + async with pool.borrow("python", timeout=5): + raise ValueError("fix run crashed") + + assert "ctr1" in checkin_calls + + +# ── SandboxPool.shutdown ────────────────────────────────────────────────────── + + +class TestSandboxPoolShutdown: + @pytest.mark.asyncio + async def test_shutdown_kills_queued_containers(self): + """shutdown() kills all containers in queues.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + c1 = _make_container("ctr1", "python") + c2 = _make_container("ctr2", "go") + await pool._queues["python"].put(c1) + await pool._queues["go"].put(c2) + + kill_calls = [] + + async def fake_kill(cid): + kill_calls.append(cid) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_kill_container", side_effect=fake_kill): + await pool.shutdown() + + assert "ctr1" in kill_calls + assert "ctr2" in kill_calls + + @pytest.mark.asyncio + async def test_shutdown_kills_checked_out_containers(self): + """shutdown() also kills containers currently checked out.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + container = _make_container("live-ctr", "rust") + pool._checked_out["live-ctr"] = container + + kill_calls = [] + + async def fake_kill(cid): + kill_calls.append(cid) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_kill_container", side_effect=fake_kill): + await pool.shutdown() + + assert "live-ctr" in kill_calls + + +# ── SandboxPool._reaper_loop ────────────────────────────────────────────────── + + +class TestSandboxPoolReaper: + @pytest.mark.asyncio + async def test_reaper_kills_stale_container(self): + """Container checked out > max_hold_seconds → reaped and replaced.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0, max_hold=10, reaper_interval=1) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + stale = _make_container("stale-ctr", "python", checked_out_seconds_ago=20) + pool._checked_out["stale-ctr"] = stale + + kill_calls = [] + start_calls = [] + done_event = asyncio.Event() + + async def fake_sleep(secs): + pass # instant + + async def fake_kill(cid): + kill_calls.append(cid) + + async def fake_start(stack): + start_calls.append(stack) + pool._shutdown = True # stop loop after this iteration + done_event.set() + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_kill_container", side_effect=fake_kill): + with patch.object(pool, "_start_and_enqueue", side_effect=fake_start): + with patch("asyncio.sleep", side_effect=fake_sleep): + task = asyncio.create_task(pool._reaper_loop()) + await asyncio.wait_for(done_event.wait(), timeout=5) + await task + + assert "stale-ctr" in kill_calls + assert "python" in start_calls + + @pytest.mark.asyncio + async def test_reaper_leaves_fresh_container_alone(self): + """Container checked out recently → not reaped.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0, max_hold=300, reaper_interval=1) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + fresh = _make_container("fresh-ctr", "python", checked_out_seconds_ago=5) + pool._checked_out["fresh-ctr"] = fresh + + kill_calls = [] + slept = asyncio.Event() + + async def fake_sleep(secs): + slept.set() + pool._shutdown = True # stop after first iteration + + async def fake_kill(cid): + kill_calls.append(cid) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_kill_container", side_effect=fake_kill): + with patch("asyncio.sleep", side_effect=fake_sleep): + task = asyncio.create_task(pool._reaper_loop()) + await asyncio.wait_for(slept.wait(), timeout=5) + await task + + assert "fresh-ctr" not in kill_calls + + @pytest.mark.asyncio + async def test_reaper_stops_on_cancelled(self): + """CancelledError exits the loop cleanly.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0, reaper_interval=1) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + async def raise_cancel(secs): + raise asyncio.CancelledError() + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch("asyncio.sleep", side_effect=raise_cancel): + await pool._reaper_loop() # should return cleanly, not propagate + + +# ── SandboxPool._resolve_image ──────────────────────────────────────────────── + + +class TestResolveImage: + @pytest.mark.asyncio + async def test_preferred_image_present(self): + """docker image inspect returns 0 → preferred image used.""" + pool = SandboxPool() + mock_settings = _mock_settings() + proc = _make_proc(returncode=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await pool._resolve_image("python") + + assert result == "phalanx-sandbox-python:latest" + + @pytest.mark.asyncio + async def test_preferred_image_absent_uses_fallback(self): + """docker image inspect returns non-zero → fallback image used.""" + pool = SandboxPool() + mock_settings = _mock_settings() + proc = _make_proc(returncode=1) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await pool._resolve_image("python") + + assert result == "python:3.12-slim" + + @pytest.mark.asyncio + async def test_unknown_stack_returns_ubuntu(self): + """Unknown stack → ubuntu:22.04 fallback.""" + pool = SandboxPool() + mock_settings = _mock_settings() + proc = _make_proc(returncode=1) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await pool._resolve_image("unknown") + + assert result == "ubuntu:22.04" + + +# ── SandboxPool._start_and_enqueue ─────────────────────────────────────────── + + +class TestStartAndEnqueue: + @pytest.mark.asyncio + async def test_enqueues_when_pool_not_full(self): + """Container started + pool has room → added to queue.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0, max_size=2) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + async def fake_start(stack): + return "new-ctr" + + async def fake_resolve(stack): + return "phalanx-sandbox-python:latest" + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_start_container", side_effect=fake_start): + with patch.object(pool, "_resolve_image", side_effect=fake_resolve): + await pool._start_and_enqueue("python") + + assert pool._queues["python"].qsize() == 1 + + @pytest.mark.asyncio + async def test_kills_extra_when_pool_full(self): + """Container started but pool already at max_size → kill the extra.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0, max_size=1) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + # Pre-fill the queue to max_size + existing = _make_container("existing", "python") + await pool._queues["python"].put(existing) + + kill_calls = [] + + async def fake_start(stack): + return "overflow-ctr" + + async def fake_resolve(stack): + return "img:latest" + + async def fake_kill(cid): + kill_calls.append(cid) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_start_container", side_effect=fake_start): + with patch.object(pool, "_resolve_image", side_effect=fake_resolve): + with patch.object(pool, "_kill_container", side_effect=fake_kill): + await pool._start_and_enqueue("python") + + assert "overflow-ctr" in kill_calls + assert pool._queues["python"].qsize() == 1 # still just the existing one + + @pytest.mark.asyncio + async def test_start_failure_is_swallowed(self): + """_start_container raises → error logged, no exception propagated.""" + pool = SandboxPool() + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + await pool._warmup() + + async def fake_start(stack): + raise RuntimeError("docker daemon not found") + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch.object(pool, "_start_container", side_effect=fake_start): + await pool._start_and_enqueue("python") # must not raise + + assert pool._queues["python"].qsize() == 0 + + +# ── get_sandbox_pool singleton ──────────────────────────────────────────────── + + +class TestGetSandboxPool: + def setup_method(self): + reset_pool_for_testing() + + def teardown_method(self): + reset_pool_for_testing() + + @pytest.mark.asyncio + async def test_returns_pool_instance(self): + """get_sandbox_pool() returns a SandboxPool.""" + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch( + "phalanx.ci_fixer.sandbox_pool.SandboxPool._warmup", + new_callable=AsyncMock, + ): + pool = await get_sandbox_pool() + + assert isinstance(pool, SandboxPool) + + @pytest.mark.asyncio + async def test_returns_same_instance_on_repeat_calls(self): + """Second call returns the same singleton.""" + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch( + "phalanx.ci_fixer.sandbox_pool.SandboxPool._warmup", + new_callable=AsyncMock, + ): + p1 = await get_sandbox_pool() + p2 = await get_sandbox_pool() + + assert p1 is p2 + + @pytest.mark.asyncio + async def test_reset_allows_new_instance(self): + """reset_pool_for_testing() clears singleton → next call creates fresh pool.""" + mock_settings = _mock_settings(min_size=0) + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch( + "phalanx.ci_fixer.sandbox_pool.SandboxPool._warmup", + new_callable=AsyncMock, + ): + p1 = await get_sandbox_pool() + + reset_pool_for_testing() + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch( + "phalanx.ci_fixer.sandbox_pool.SandboxPool._warmup", + new_callable=AsyncMock, + ): + p2 = await get_sandbox_pool() + + assert p1 is not p2 + + +# ── SandboxPool._health_check ───────────────────────────────────────────────── + + +class TestHealthCheck: + @pytest.mark.asyncio + async def test_healthy_container(self): + pool = SandboxPool() + mock_settings = _mock_settings() + container = _make_container("ctr1") + proc = _make_proc(returncode=0, stdout=b"ok") + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await pool._health_check(container) + + assert result is True + + @pytest.mark.asyncio + async def test_unhealthy_container_nonzero_exit(self): + pool = SandboxPool() + mock_settings = _mock_settings() + container = _make_container("ctr1") + proc = _make_proc(returncode=1, stdout=b"") + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await pool._health_check(container) + + assert result is False + + @pytest.mark.asyncio + async def test_health_check_exception_returns_false(self): + pool = SandboxPool() + mock_settings = _mock_settings() + container = _make_container("ctr1") + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch("asyncio.create_subprocess_exec", side_effect=FileNotFoundError("docker")): + result = await pool._health_check(container) + + assert result is False + + +# ── SandboxPool._reset_container ───────────────────────────────────────────── + + +class TestResetContainer: + @pytest.mark.asyncio + async def test_reset_success(self): + pool = SandboxPool() + mock_settings = _mock_settings() + container = _make_container("ctr1") + proc = _make_proc(returncode=0, stdout=b"done") + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await pool._reset_container(container) + + assert result is True + + @pytest.mark.asyncio + async def test_reset_failure_nonzero(self): + pool = SandboxPool() + mock_settings = _mock_settings() + container = _make_container("ctr1") + proc = _make_proc(returncode=1, stdout=b"") + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch("asyncio.create_subprocess_exec", return_value=proc): + result = await pool._reset_container(container) + + assert result is False + + @pytest.mark.asyncio + async def test_reset_exception_returns_false(self): + pool = SandboxPool() + mock_settings = _mock_settings() + container = _make_container("ctr1") + + with patch("phalanx.ci_fixer.sandbox_pool.settings", mock_settings): + with patch("asyncio.create_subprocess_exec", side_effect=Exception("timeout")): + result = await pool._reset_container(container) + + assert result is False