diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c7b936da..ba39998f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -314,7 +314,7 @@ jobs: run: pip install -e ".[dev]" - name: Run migrations - run: alembic upgrade head + run: alembic -c alembic/alembic.ini upgrade head - name: Run unit tests run: | @@ -419,7 +419,7 @@ jobs: run: pip install -e ".[dev]" - name: Run migrations - run: alembic upgrade head + run: alembic -c alembic/alembic.ini upgrade head - name: Run integration tests env: @@ -491,6 +491,7 @@ jobs: context: . target: production push: false + load: true cache-from: type=gha cache-to: type=gha,mode=max tags: forge:prod-${{ github.sha }} diff --git a/Dockerfile b/Dockerfile index 62c66670..0d93d305 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,6 +66,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ git \ nodejs \ npm \ + ca-certificates \ + gnupg \ + && install -m 0755 -d /etc/apt/keyrings \ + && curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \ + && chmod a+r /etc/apt/keyrings/docker.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/debian $(. 
/etc/os-release && echo $VERSION_CODENAME) stable" > /etc/apt/sources.list.d/docker.list \ + && apt-get update && apt-get install -y --no-install-recommends docker-ce-cli \ && rm -rf /var/lib/apt/lists/* RUN groupadd --gid 1001 forge && \ diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index d5344b8a..daa339e0 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -113,6 +113,7 @@ services: --queues=default,commander,planner,reviewer,qa,release,security,builder,ingestion,skill_drills,ci_fixer --concurrency=4 --max-tasks-per-child=100 + user: root environment: PHALANX_WORKER: "1" # forces NullPool — prevents "Future attached to different loop" in forked workers OPENAI_MODEL_REASONING: "gpt-4.1" # reasoning agents: Commander, Planner, QA, Reviewer, Release @@ -120,6 +121,7 @@ services: - forge-repos:/tmp/forge-repos - ./configs:/app/configs:ro - ./skill-registry:/app/skill-registry:ro + - /var/run/docker.sock:/var/run/docker.sock # required: CI fixer spawns sandbox containers depends_on: postgres: condition: service_healthy diff --git a/docker/sandbox/go/Dockerfile b/docker/sandbox/go/Dockerfile index b93a6565..521bf9e2 100644 --- a/docker/sandbox/go/Dockerfile +++ b/docker/sandbox/go/Dockerfile @@ -1,16 +1,15 @@ FROM golang:1.22-alpine -# Install staticcheck and golangci-lint for broader Go lint coverage -RUN go install honnef.co/go/tools/cmd/staticcheck@2024.1.0 && \ - go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.59.1 +RUN apk add --no-cache git -# Create non-root user -RUN adduser -D -u 1000 phalanx +RUN go install honnef.co/go/tools/cmd/staticcheck@v0.5.1 -COPY ../reset.sh /phalanx/reset.sh -RUN chmod +x /phalanx/reset.sh +RUN adduser -D -u 1000 phalanx -RUN mkdir -p /workspace && chown phalanx:phalanx /workspace +COPY reset.sh /phalanx/reset.sh +RUN chmod +x /phalanx/reset.sh && \ + mkdir -p /workspace && \ + chown phalanx /workspace WORKDIR /workspace USER phalanx diff --git a/docker/sandbox/node/Dockerfile 
b/docker/sandbox/node/Dockerfile index 07a2e6ed..c2ff3ab4 100644 --- a/docker/sandbox/node/Dockerfile +++ b/docker/sandbox/node/Dockerfile @@ -1,19 +1,18 @@ FROM node:20-slim -# Create non-root user -RUN useradd -m -u 1000 -s /bin/bash phalanx 2>/dev/null || true +RUN usermod -l phalanx -d /home/phalanx -m node && \ + groupmod -n phalanx node 2>/dev/null || true -# Install common Node tooling used by the CI fixer. RUN npm install -g \ eslint@8.57.0 \ typescript@5.4.5 \ jest@29.7.0 \ --no-fund --no-audit -COPY ../reset.sh /phalanx/reset.sh -RUN chmod +x /phalanx/reset.sh - -RUN mkdir -p /workspace && chown phalanx:phalanx /workspace +COPY reset.sh /phalanx/reset.sh +RUN chmod +x /phalanx/reset.sh && \ + mkdir -p /workspace && \ + chown phalanx:phalanx /workspace 2>/dev/null || chown phalanx /workspace WORKDIR /workspace USER phalanx diff --git a/phalanx/agents/ci_fixer.py b/phalanx/agents/ci_fixer.py index aa6a5475..2e224392 100644 --- a/phalanx/agents/ci_fixer.py +++ b/phalanx/agents/ci_fixer.py @@ -1460,7 +1460,18 @@ async def _update_fingerprint_on_success( # ── Auth helpers ──────────────────────────────────────────────────────────── def _decrypt_key(self, encrypted_key: str) -> str: - return encrypted_key # Phase 2: KMS decrypt + if not encrypted_key: + return "" + enc_key = getattr(settings, "encryption_key", None) + if not enc_key: + return encrypted_key + try: + from cryptography.fernet import Fernet # noqa: PLC0415 + + f = Fernet(enc_key.encode()) + return f.decrypt(encrypted_key.encode()).decode() + except Exception: + return encrypted_key def _get_github_token(self, integration: CIIntegration) -> str: return integration.github_token or settings.github_token diff --git a/phalanx/ci_fixer/analyst.py b/phalanx/ci_fixer/analyst.py index 68f72c51..c568455a 100644 --- a/phalanx/ci_fixer/analyst.py +++ b/phalanx/ci_fixer/analyst.py @@ -123,8 +123,9 @@ def is_actionable(self) -> bool: "corrected_lines" (a JSON array of strings, one 
per line, each ending with \\n). 3. "corrected_lines" may differ from the original window by at most \ {max_line_delta} lines (adding or removing). Do NOT rewrite the whole file. -4. NEVER modify test files (paths containing /test or test_). -5. For unused imports (F401): delete the import line only. +4. NEVER rewrite test logic — only mechanical lint fixes (unused imports, line length) + in test files are allowed. Do NOT change assertions, test structure, or test data. +5. For unused imports (F401): delete the import line only. Set corrected_lines to []. 6. For line-too-long (E501): wrap or shorten the line only. 7. For future-import order (F404): move the __future__ import to line 1 only. 8. If you cannot produce a high or medium confidence fix, set \ @@ -205,11 +206,18 @@ def analyze( root_cause="Could not read any of the failing files from workspace", ) + lint_only = bool( + parsed_log.lint_errors + and not parsed_log.type_errors + and not parsed_log.test_failures + and not parsed_log.build_errors + ) + # ── Phase 2: history check ───────────────────────────────────────────── if fingerprint_hash and self._history_lookup is not None: cached = self._history_lookup(fingerprint_hash) if cached: - patches = self._parse_and_validate_patches(cached, windows) + patches = self._parse_and_validate_patches(cached, windows, lint_only=lint_only) if patches: log.info( "ci_analyst.history_hit", @@ -258,7 +266,9 @@ def analyze( log.warning("ci_analyst.json_parse_failed", error=str(exc), raw=raw[:500]) return FixPlan(confidence="low", root_cause="LLM returned non-JSON response") - patches = self._parse_and_validate_patches(data.get("patches", []), windows) + patches = self._parse_and_validate_patches( + data.get("patches", []), windows, lint_only=lint_only + ) confidence = data.get("confidence", "low") # If patch validation rejected everything, downgrade to low @@ -283,10 +293,10 @@ def _read_windows(self, workspace: Path, parsed_log: ParsedLog) -> list[FileWind """ # Build 
map: file_path → list of error line numbers error_lines_by_file: dict[str, list[int]] = {} - for e in parsed_log.lint_errors: - error_lines_by_file.setdefault(e.file, []).append(e.line) - for e in parsed_log.type_errors: - error_lines_by_file.setdefault(e.file, []).append(e.line) + for le in parsed_log.lint_errors: + error_lines_by_file.setdefault(le.file, []).append(le.line) + for te in parsed_log.type_errors: + error_lines_by_file.setdefault(te.file, []).append(te.line) # For test failures we have no line number — read top of file for f in parsed_log.test_failures: error_lines_by_file.setdefault(f.file, []).append(1) @@ -333,7 +343,7 @@ def _read_windows(self, workspace: Path, parsed_log: ParsedLog) -> list[FileWind # ── Patch validation ─────────────────────────────────────────────────────── def _parse_and_validate_patches( - self, raw_patches: list, windows: list[FileWindow] + self, raw_patches: list, windows: list[FileWindow], lint_only: bool = False ) -> list[FilePatch]: """ Parse LLM patch dicts, apply guard rails, return only safe patches. @@ -342,7 +352,8 @@ def _parse_and_validate_patches( - path not in the windows we sent (LLM invented a file) - start_line / end_line don't match the window we sent (off-by-more-than-2) - |delta| > _MAX_LINE_DELTA (LLM rewrote too much) - - path looks like a test file + - path looks like a test file (unless lint_only=True — lint fixes in test + files are valid, e.g. 
removing unused imports) """ window_by_path = {w.path: w for w in windows} safe: list[FilePatch] = [] @@ -358,14 +369,14 @@ def _parse_and_validate_patches( log.warning("ci_analyst.patch_unknown_file", path=path) continue - # Guard: never touch test files - if _is_test_file(path): + # Guard: never touch test files unless it's a lint-only fix + if _is_test_file(path) and not lint_only: log.warning("ci_analyst.patch_test_file_rejected", path=path) continue - # Guard: corrected_lines must be a non-empty list of strings - if not isinstance(corrected, list) or not corrected: - log.warning("ci_analyst.patch_empty_corrected_lines", path=path) + # Guard: corrected_lines must be a list (empty = delete the lines) + if not isinstance(corrected, list): + log.warning("ci_analyst.patch_invalid_corrected_lines", path=path) continue # Ensure every line ends with \n @@ -379,18 +390,23 @@ def _parse_and_validate_patches( log.warning("ci_analyst.patch_missing_line_range", path=path) continue - if abs(start - window.start_line) > 2 or abs(end - window.end_line) > 2: + # Accept sub-ranges (LLM targeting a specific line within the window is correct). + # Clamp to window bounds if the patch extends outside. 
+ if start < window.start_line or end > window.end_line: log.warning( - "ci_analyst.patch_line_range_mismatch", + "ci_analyst.patch_line_range_outside_window", path=path, - expected_start=window.start_line, - expected_end=window.end_line, + window_start=window.start_line, + window_end=window.end_line, got_start=start, got_end=end, ) - # Clamp to the window we actually sent — safer than rejecting - start = window.start_line - end = window.end_line + start = window.start_line if start < window.start_line else start + end = window.end_line if end > window.end_line else end + # If clamping made start > end, fall back to the full window + if start > end: + start = window.start_line + end = window.end_line original_size = end - start + 1 delta = len(corrected) - original_size diff --git a/phalanx/ci_fixer/log_parser.py b/phalanx/ci_fixer/log_parser.py index ac22e73a..87535fde 100644 --- a/phalanx/ci_fixer/log_parser.py +++ b/phalanx/ci_fixer/log_parser.py @@ -84,22 +84,22 @@ def all_files(self) -> list[str]: """All unique files mentioned across all error types.""" seen: set[str] = set() files: list[str] = [] - for e in self.lint_errors: - if e.file not in seen: - seen.add(e.file) - files.append(e.file) - for e in self.type_errors: - if e.file not in seen: - seen.add(e.file) - files.append(e.file) - for e in self.test_failures: - if e.file not in seen: - seen.add(e.file) - files.append(e.file) - for e in self.build_errors: - if e.file and e.file not in seen: - seen.add(e.file) - files.append(e.file) + for le in self.lint_errors: + if le.file not in seen: + seen.add(le.file) + files.append(le.file) + for te in self.type_errors: + if te.file not in seen: + seen.add(te.file) + files.append(te.file) + for tf in self.test_failures: + if tf.file not in seen: + seen.add(tf.file) + files.append(tf.file) + for be in self.build_errors: + if be.file and be.file not in seen: + seen.add(be.file) + files.append(be.file) return files def summary(self) -> str: @@ -128,24 +128,24 @@ def 
as_text(self) -> str: if self.type_errors: lines.append("TYPE ERRORS:") - for e in self.type_errors[:10]: - lines.append(f" {e.file}:{e.line}: {e.message}") + for te in self.type_errors[:10]: + lines.append(f" {te.file}:{te.line}: {te.message}") lines.append("") if self.test_failures: lines.append("TEST FAILURES:") - for f in self.test_failures[:10]: - lines.append(f" {f.test_id}") - if f.message: - for msg_line in f.message.splitlines()[:5]: + for tf in self.test_failures[:10]: + lines.append(f" {tf.test_id}") + if tf.message: + for msg_line in tf.message.splitlines()[:5]: lines.append(f" {msg_line}") lines.append("") if self.build_errors: lines.append("BUILD ERRORS:") - for e in self.build_errors[:5]: - prefix = f" {e.file}: " if e.file else " " - lines.append(f"{prefix}{e.message}") + for be in self.build_errors[:5]: + prefix = f" {be.file}: " if be.file else " " + lines.append(f"{prefix}{be.message}") lines.append("") return "\n".join(lines) @@ -153,13 +153,21 @@ def as_text(self) -> str: # ── Regex patterns ───────────────────────────────────────────────────────────── -# ruff: phalanx/agents/foo.py:1:10: F401 'os' imported but unused +# ruff standard format: phalanx/agents/foo.py:1:10: F401 'os' imported but unused _RUFF_RE = re.compile( r"^([\w./\-]+\.py):(\d+):(\d+):\s+([A-Z]\d+)\s+(.+)$", re.MULTILINE, ) -# mypy: phalanx/agents/foo.py:42: error: Incompatible return value +# ruff rich/diagnostic format (--output-format=full or terminal default): +# F401 [*] `sys` imported but unused +# --> tests/test_eval_outcome.py:259:8 +_RUFF_RICH_RE = re.compile( + r"^([A-Z]\d+)\s+(?:\[\*\]\s+)?(.+?)\n\s+-->\s+([\w./\-]+\.py):(\d+):(\d+)", + re.MULTILINE, +) + +# mypy output format: phalanx/agents/foo.py:42: error: Incompatible return value _MYPY_RE = re.compile( r"^([\w./\-]+\.py):(\d+):\s+error:\s+(.+)$", re.MULTILINE, @@ -248,7 +256,7 @@ def parse_log(raw: str) -> ParsedLog: # Determine primary tool if lint_errors: - tool = "ruff" if _RUFF_RE.search(text) else 
"eslint" + tool = "ruff" if (_RUFF_RE.search(text) or _RUFF_RICH_RE.search(text)) else "eslint" elif type_errors: tool = "mypy" if _MYPY_RE.search(text) else "tsc" elif test_failures: @@ -272,16 +280,39 @@ def parse_log(raw: str) -> ParsedLog: def _parse_ruff(text: str) -> list[LintError]: errors: list[LintError] = [] + seen: set[tuple] = set() + for m in _RUFF_RE.finditer(text): - errors.append( - LintError( - file=m.group(1), - line=int(m.group(2)), - col=int(m.group(3)), - code=m.group(4), - message=m.group(5).strip(), + key = (m.group(1), int(m.group(2)), m.group(4)) + if key not in seen: + seen.add(key) + errors.append( + LintError( + file=m.group(1), + line=int(m.group(2)), + col=int(m.group(3)), + code=m.group(4), + message=m.group(5).strip(), + ) ) - ) + + # Also parse rich/diagnostic format (--output-format=full or terminal default): + # F401 [*] `sys` imported but unused + # --> tests/test_eval_outcome.py:259:8 + for m in _RUFF_RICH_RE.finditer(text): + key = (m.group(3), int(m.group(4)), m.group(1)) + if key not in seen: + seen.add(key) + errors.append( + LintError( + file=m.group(3), + line=int(m.group(4)), + col=int(m.group(5)), + code=m.group(1), + message=m.group(2).strip(), + ) + ) + return errors diff --git a/phalanx/ci_fixer/outcome_tracker.py b/phalanx/ci_fixer/outcome_tracker.py index 022f63c1..a4b6a1fb 100644 --- a/phalanx/ci_fixer/outcome_tracker.py +++ b/phalanx/ci_fixer/outcome_tracker.py @@ -198,11 +198,12 @@ async def _get_github_token(run: CIFixRun) -> str | None: from phalanx.config.settings import get_settings # noqa: PLC0415 settings = get_settings() - if settings.encryption_key: + enc_key = getattr(settings, "encryption_key", None) + if enc_key: try: from cryptography.fernet import Fernet # noqa: PLC0415 - f = Fernet(settings.encryption_key.encode()) + f = Fernet(enc_key.encode()) return f.decrypt(integration.ci_api_key_enc.encode()).decode() except Exception: pass diff --git a/phalanx/ci_fixer/validator.py 
b/phalanx/ci_fixer/validator.py index b47f767d..1e9ad0c1 100644 --- a/phalanx/ci_fixer/validator.py +++ b/phalanx/ci_fixer/validator.py @@ -7,6 +7,10 @@ 2. Regression check — after the primary per-file check passes, the broader codebase is scanned for NEW errors introduced by the patch. A fix that breaks other files is treated as failed. + 3. CI-parity discovery — reads .github/workflows/*.yml in the workspace to + discover the exact commands the CI runs (e.g. ruff format --check, mypy flags, + pytest --cov-fail-under). Falls back to sensible defaults when no CI config + is found, so it works for any GitHub Actions repo generically. Supports: ruff, mypy, pytest, tsc, eslint. Unknown tools → skipped (passed=True, explicit log). @@ -14,16 +18,115 @@ from __future__ import annotations +import re import subprocess from dataclasses import dataclass, field +from pathlib import Path # noqa: TC003 from typing import TYPE_CHECKING import structlog if TYPE_CHECKING: - from pathlib import Path + from phalanx.ci_fixer.log_parser import LintError, ParsedLog + from phalanx.ci_fixer.log_parser import TypeError as TypeErr + +# ── CI config discovery ──────────────────────────────────────────────────────── + +# Regexes to extract tool commands from CI YAML step `run:` blocks +_YAML_RUN_RE = re.compile(r"^\s*run:\s*[|>]?\s*(.+)$", re.MULTILINE) +# Multi-line run blocks (|- or |) — capture everything indented under `run:` +_YAML_RUN_BLOCK_RE = re.compile(r"run:\s*\|[-]?\n((?:[ \t]+.+\n?)*)", re.MULTILINE) + + +def _discover_ci_commands(tool: str, workspace: Path) -> dict: + """ + Read .github/workflows/*.yml in the workspace and extract commands relevant + to the given tool. + + Returns a dict with tool-specific flags discovered from CI, e.g.: + ruff → {"run_format_check": True} + mypy → {"extra_flags": ["--ignore-missing-imports"]} + pytest → {"cov_fail_under": 70, "extra_flags": ["-x"]} + + Falls back to empty/False defaults when CI YAML is absent or tool not found. 
This ensures the validator stays generic across any GitHub Actions repo. + """ + workflows_dir = workspace / ".github" / "workflows" + if not workflows_dir.is_dir(): + return {} + + all_run_lines: list[str] = [] + for yml_file in workflows_dir.glob("*.yml"): + try: + text = yml_file.read_text(errors="replace") + # Extract inline run: value lines + for m in _YAML_RUN_RE.finditer(text): + all_run_lines.append(m.group(1).strip()) + # Extract multi-line run block lines + for m in _YAML_RUN_BLOCK_RE.finditer(text): + for line in m.group(1).splitlines(): + stripped = line.strip() + if stripped: + all_run_lines.append(stripped) + except Exception: + continue + + if tool == "ruff": + return _discover_ruff_config(all_run_lines) + if tool == "mypy": + return _discover_mypy_config(all_run_lines) + if tool == "pytest": + return _discover_pytest_config(all_run_lines) + return {} + + + def _discover_ruff_config(run_lines: list[str]) -> dict: + """Detect whether CI runs `ruff format --check` in addition to `ruff check`.""" + run_format_check = any( + "ruff" in line + and "format" in line + and "--check" in line + for line in run_lines + ) + return {"run_format_check": run_format_check} + + + def _discover_mypy_config(run_lines: list[str]) -> dict: + """Extract extra mypy flags used in CI (e.g. 
--ignore-missing-imports).""" + extra_flags: list[str] = [] + known_flags = [ + "--ignore-missing-imports", + "--strict", + "--disallow-untyped-defs", + "--no-implicit-optional", + "--warn-return-any", + "--warn-unused-ignores", + "--check-untyped-defs", + ] + for line in run_lines: + if "mypy" not in line: + continue + for flag in known_flags: + if flag in line and flag not in extra_flags: + extra_flags.append(flag) + return {"extra_flags": extra_flags} + + +def _discover_pytest_config(run_lines: list[str]) -> dict: + """Extract pytest --cov-fail-under threshold and common flags from CI.""" + cov_fail_under: int | None = None + extra_flags: list[str] = [] + for line in run_lines: + if "pytest" not in line: + continue + m = re.search(r"--cov-fail-under[=\s]+(\d+)", line) + if m: + cov_fail_under = int(m.group(1)) + for flag in ("-x", "--tb=short", "--tb=long", "--tb=no", "-q", "-v"): + if flag in line.split() and flag not in extra_flags: + extra_flags.append(flag) + return {"cov_fail_under": cov_fail_under, "extra_flags": extra_flags} - from phalanx.ci_fixer.log_parser import ParsedLog log = structlog.get_logger(__name__) @@ -63,12 +166,14 @@ def validate_fix( files = parsed_log.all_files[:6] tool_version = _get_tool_version(tool) + ci_config = _discover_ci_commands(tool, workspace) + if tool == "ruff": - result = _run_ruff(workspace, files, tool_version) + result = _run_ruff(workspace, files, tool_version, ci_config) elif tool == "mypy": - result = _run_mypy(workspace, files, tool_version) + result = _run_mypy(workspace, files, tool_version, ci_config) elif tool == "pytest": - result = _run_pytest(workspace, parsed_log, tool_version) + result = _run_pytest(workspace, parsed_log, tool_version, ci_config) elif tool in ("tsc", "eslint"): result = _run_node_linter(workspace, tool, files, tool_version) else: @@ -111,28 +216,75 @@ def validate_fix( # ── Tool runners ─────────────────────────────────────────────────────────────── -def _run_ruff(workspace: Path, files: 
list[str], tool_version: str) -> ValidationResult: +def _run_ruff( + workspace: Path, files: list[str], tool_version: str, ci_config: dict | None = None +) -> ValidationResult: targets = files if files else ["."] + ci_config = ci_config or {} + + # Step 1: ruff check (lint) code, output = _run(["ruff", "check"] + targets, workspace) - passed = code == 0 - log.info("ci_validator.ruff", passed=passed, files=files, version=tool_version) - return ValidationResult(passed=passed, tool="ruff", output=output, tool_version=tool_version) + if code != 0: + log.info("ci_validator.ruff_check", passed=False, files=files, version=tool_version) + return ValidationResult(passed=False, tool="ruff", output=output, tool_version=tool_version) + + # Step 2: ruff format --check — only if CI actually runs it + if ci_config.get("run_format_check"): + fmt_code, fmt_output = _run(["ruff", "format", "--check"] + targets, workspace) + combined = (output + "\n" + fmt_output).strip() + passed = fmt_code == 0 + log.info("ci_validator.ruff_format_check", passed=passed, files=files, version=tool_version) + return ValidationResult( + passed=passed, tool="ruff", output=combined, tool_version=tool_version + ) + + log.info("ci_validator.ruff", passed=True, files=files, version=tool_version) + return ValidationResult(passed=True, tool="ruff", output=output, tool_version=tool_version) -def _run_mypy(workspace: Path, files: list[str], tool_version: str) -> ValidationResult: +def _run_mypy( + workspace: Path, files: list[str], tool_version: str, ci_config: dict | None = None +) -> ValidationResult: targets = files if files else ["."] - code, output = _run(["mypy"] + targets, workspace) + ci_config = ci_config or {} + extra_flags: list[str] = ci_config.get("extra_flags", []) + code, output = _run(["mypy"] + extra_flags + targets, workspace) passed = code == 0 - log.info("ci_validator.mypy", passed=passed, files=files, version=tool_version) + log.info( + "ci_validator.mypy", passed=passed, files=files, 
flags=extra_flags, version=tool_version + ) return ValidationResult(passed=passed, tool="mypy", output=output, tool_version=tool_version) -def _run_pytest(workspace: Path, parsed_log: ParsedLog, tool_version: str) -> ValidationResult: +def _run_pytest( + workspace: Path, parsed_log: ParsedLog, tool_version: str, ci_config: dict | None = None +) -> ValidationResult: + ci_config = ci_config or {} test_files = list({f.file for f in parsed_log.test_failures}) targets = test_files if test_files else ["tests/"] - code, output = _run(["python", "-m", "pytest", "-x", "-q"] + targets, workspace) + + base_flags = ["-x", "-q"] + # Apply extra CI flags discovered (e.g. --tb=short), avoiding duplicates + for flag in ci_config.get("extra_flags", []): + if flag not in base_flags: + base_flags.append(flag) + + # Apply coverage threshold if CI enforces one and we're running the full suite + cov_fail_under: int | None = ci_config.get("cov_fail_under") + cov_flags: list[str] = [] + if cov_fail_under is not None and not test_files: + cov_flags = [f"--cov-fail-under={cov_fail_under}"] + + cmd = ["python", "-m", "pytest"] + base_flags + cov_flags + targets + code, output = _run(cmd, workspace) passed = code == 0 - log.info("ci_validator.pytest", passed=passed, files=targets, version=tool_version) + log.info( + "ci_validator.pytest", + passed=passed, + files=targets, + cov_threshold=cov_fail_under, + version=tool_version, + ) return ValidationResult(passed=passed, tool="pytest", output=output, tool_version=tool_version) @@ -180,19 +332,19 @@ def _regression_check( # Build set of pre-existing (file, code) pairs pre_existing: set[tuple[str, str]] = set() - for e in original_parsed.lint_errors: - pre_existing.add((e.file, e.code)) - for e in original_parsed.type_errors: - pre_existing.add((e.file, getattr(e, "code", e.message[:30]))) - - regressions = [] - for e in new_parsed.lint_errors: - if (e.file, e.code) not in pre_existing: - regressions.append(e) - for e in new_parsed.type_errors: - 
key = (e.file, getattr(e, "code", e.message[:30])) + for le in original_parsed.lint_errors: + pre_existing.add((le.file, le.code)) + for te in original_parsed.type_errors: + pre_existing.add((te.file, getattr(te, "code", te.message[:30]))) + + regressions: list[LintError | TypeErr] = [] + for le in new_parsed.lint_errors: + if (le.file, le.code) not in pre_existing: + regressions.append(le) + for te in new_parsed.type_errors: + key = (te.file, getattr(te, "code", te.message[:30])) if key not in pre_existing: - regressions.append(e) + regressions.append(te) return regressions diff --git a/phalanx/db/models.py b/phalanx/db/models.py index 218ce5ff..474065c3 100644 --- a/phalanx/db/models.py +++ b/phalanx/db/models.py @@ -454,7 +454,7 @@ class AgentTrace(Base): id: Mapped[str] = mapped_column(UUID(as_uuid=False), primary_key=True, default=_uuid) run_id: Mapped[str] = mapped_column(ForeignKey("runs.id", ondelete="CASCADE"), nullable=False) - task_id: Mapped[str | None] = mapped_column(String(36)) + task_id: Mapped[str | None] = mapped_column(UUID(as_uuid=False)) agent_role: Mapped[str] = mapped_column(String(100), nullable=False) agent_id: Mapped[str] = mapped_column(String(100), nullable=False) trace_type: Mapped[str] = mapped_column(String(50), nullable=False) diff --git a/phalanx/sim_test_scratch.py b/phalanx/sim_test_scratch.py new file mode 100644 index 00000000..1d23a35a --- /dev/null +++ b/phalanx/sim_test_scratch.py @@ -0,0 +1,3 @@ +"""Temporary scratch file for CI fixer simulation test — safe to delete.""" + +x = 1 diff --git a/skill-registry/index.yaml b/skill-registry/index.yaml index 06f6dac7..876d2bf3 100644 --- a/skill-registry/index.yaml +++ b/skill-registry/index.yaml @@ -11,3 +11,4 @@ skills: task-decomposition: "skills/task-decomposition.yaml" system-design: "skills/system-design.yaml" orchestration: "skills/orchestration.yaml" + risk-assessment: "skills/risk-assessment.yaml" diff --git a/skill-registry/skills/risk-assessment.yaml 
b/skill-registry/skills/risk-assessment.yaml new file mode 100644 index 00000000..18fca369 --- /dev/null +++ b/skill-registry/skills/risk-assessment.yaml @@ -0,0 +1,38 @@ +id: risk-assessment +name: "Risk Assessment" +version: "1.0.0" +domain: "engineering" +category: "quality" +stability: "stable" +applicable_roles: + - commander + - tech_lead +min_level: "ic5" + +prerequisites: + - system-design + - task-decomposition + +quality_criteria: + - "Risks are identified before work begins, not after failure" + - "Each risk has a severity (low/medium/high/critical) and a mitigation" + - "Blocking risks are escalated before task dispatch" + - "Risk assessment is recorded in the run's audit_log" + +principles: + - "Identify unknowns before committing to a plan." + - "High-severity risks require human acknowledgement before proceeding." + - "Mitigations are concrete actions, not vague reassurances." + +procedures: + proficient: + - "Review the work order for scope, dependencies, and external services." + - "Enumerate risks across: correctness, security, rollback-ability, and blast radius." + - "Assign severity and likelihood to each risk." + - "Define a concrete mitigation or fallback for each high/critical risk." + - "Surface critical risks to the human approver before plan approval." + +anti_patterns: + - "Treating risk assessment as a post-mortem step." + - "Marking all risks as 'low' to unblock dispatch." + - "Proceeding past a critical risk without explicit human sign-off."