From 1af4c92091f05b423625a48b81b0cc690f3b0314 Mon Sep 17 00:00:00 2001 From: valerii Date: Tue, 20 Jan 2026 14:07:13 +0100 Subject: [PATCH 01/17] modified gha check --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 862e04a..e9f1698 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,4 +9,4 @@ jobs: steps: - uses: actions/checkout@v4 - name: Say hi - run: echo "Hi from my self-hosted runner" \ No newline at end of file + run: echo "Hello from my self-hosted runner" \ No newline at end of file From 9083757412cc55258e28cbc1b0a3d1fa2c11eeb0 Mon Sep 17 00:00:00 2001 From: valerii Date: Wed, 21 Jan 2026 09:55:55 +0100 Subject: [PATCH 02/17] review action added --- .github/workflows/ci.yml | 12 ---------- .github/workflows/review.yml | 43 ++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 12 deletions(-) delete mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/review.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index e9f1698..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: code-review - -on: - pull_request: - -jobs: - code-review: - runs-on: [self-hosted, appendix-reviewer] - steps: - - uses: actions/checkout@v4 - - name: Say hi - run: echo "Hello from my self-hosted runner" \ No newline at end of file diff --git a/.github/workflows/review.yml b/.github/workflows/review.yml new file mode 100644 index 0000000..b86e7fc --- /dev/null +++ b/.github/workflows/review.yml @@ -0,0 +1,43 @@ +name: auto code review + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + review: + runs-on: [self-hosted, appendix-reviewer] + permissions: + pull-requests: write + contents: read + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Fetch PR base + shell: pwsh + run: | + git fetch origin ${{ github.base_ref }} --depth=1 + + - name: Build diff + shell: pwsh + run: | + git diff --unified=3 origin/${{ github.base_ref }}...HEAD > diff.txt + Get-Content diff.txt | Measure-Object -Line -Word -Character | Format-List + + - name: Run multi-agent local review + shell: pwsh + env: + MODEL: qwen2.5.:14b-instruct + run: | + python .github/scripts/multi_agent_review.py + + - name: Post comment to PR + shell: pwsh + env: + GH_TOKEN: ${{ github.token }} + run: | + gh pr comment ${{ github.event.pull_request.number }} --body-file review_comment.md \ No newline at end of file From ae87b2b26f9d72244fa4440988a73ca151f9cd2d Mon Sep 17 00:00:00 2001 From: valerii Date: Wed, 21 Jan 2026 10:50:59 +0100 Subject: [PATCH 03/17] actual review job added --- .github/scripts/multi_agent_review.py | 200 ++++++++++++++++++++++++++ docs/review_guidelines.md | 6 + 2 files changed, 206 insertions(+) create mode 100644 .github/scripts/multi_agent_review.py create mode 100644 docs/review_guidelines.md diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py new file mode 100644 index 0000000..0419ef6 --- /dev/null +++ b/.github/scripts/multi_agent_review.py @@ -0,0 +1,200 @@ +import json +import os +import subprocess +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +DIFF_PATH = Path("diff.txt") +GUIDELINES_PATH = Path("docs/review_guidelines.md") +OUT_MD = Path("review_comment.md") + +MODEL = os.environ.get("MODEL", "qwen2.5:14b-instruct") +MAX_DIFF_CHARS = int(os.environ.get("MAX_DIFF_CHARS", "80000")) + +# --- Utilities --- + +def read_text(path: Path) -> str: + if path.exists(): + return path.read_text(encoding="utf-8", errors="ignore") + return "" + +def run_ollama(model: str, prompt: str) -> str: + # Use stdin to avoid CLI arg limits on Windows. + p = subprocess.run( + ["ollama", "run", model], + input=prompt.encode("utf-8"), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + if p.returncode != 0: + err = p.stderr.decode("utf-8", errors="ignore") + raise RuntimeError(f"Ollama failed (code={p.returncode}): {err[:2000]}") + return p.stdout.decode("utf-8", errors="ignore").strip() + +def safe_json_loads(s: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: + try: + return json.loads(s), None + except Exception as e: + return None, str(e) + +def truncate(s: str, limit: int) -> str: + if len(s) <= limit: + return s + return s[:limit] + "\n\n[TRUNCATED]\n" + +def md_escape(s: str) -> str: + # Minimal escaping to avoid accidental markdown formatting explosions + return s.replace("\r", "").strip() + +# --- Agent definitions --- + +@dataclass +class Agent: + name: str + focus: str + +AGENTS: List[Agent] = [ + Agent( + name="Correctness & Reliability", + focus=( + "Find correctness risks: idempotency, retries/backoff, timeouts, transactional boundaries, " + "race conditions, partial failures, error handling, and data consistency." + ), + ), + Agent( + name="Architecture & Boundaries", + focus=( + "Review architecture: module boundaries, coupling, layering, dependency direction, " + "API contracts, naming of abstractions, and maintainability trade-offs." + ), + ), + Agent( + name="Tests & Observability", + focus=( + "Suggest tests and observability: unit/integration tests worth adding, edge cases, " + "logging/metrics/tracing, and how to reproduce failures." + ), + ), + Agent( + name="Cost & LLM Discipline", + focus=( + "Look for cost/perf traps: unnecessary LLM calls, large payloads, missing caching, " + "missing state+delta pattern, and risk of GPU contention." + ), + ), +] + +# JSON schema agent must output +JSON_SCHEMA = """ +Return ONLY valid JSON (no markdown, no extra text). +Schema: +{ + "summary": "string (1-3 sentences)", + "blocking": [{"issue":"string","evidence":"string","fix":"string"}], + "non_blocking": [{"issue":"string","evidence":"string","fix":"string"}], + "tests_to_add": ["string"], + "questions": ["string"] +} +Rules: +- Use only the provided DIFF and GUIDELINES. +- Do not guess. If uncertain, put it into "questions". +- Keep evidence concrete (file/line hints if visible in diff). +""".strip() + +def build_prompt(agent: Agent, guidelines: str, diff: str) -> str: + return f""" +SYSTEM: +You are a strict senior software engineer acting as a pull request reviewer. + +TASK: +{agent.focus} + +GUIDELINES: +{guidelines if guidelines else "(none provided)"} + +DIFF: +{diff} + +OUTPUT INSTRUCTIONS: +{JSON_SCHEMA} +""".strip() + +# --- Markdown report building --- + +def fmt_issue_list(title: str, issues: List[Dict[str, Any]]) -> str: + if not issues: + return f"**{title}:** None ✅\n" + out = f"**{title}:**\n" + for it in issues: + issue = md_escape(str(it.get("issue", ""))) + evidence = md_escape(str(it.get("evidence", ""))) + fix = md_escape(str(it.get("fix", ""))) + out += f"- **{issue}**\n - Evidence: {evidence}\n - Fix: {fix}\n" + return out + "\n" + +def fmt_list(title: str, items: List[str]) -> str: + if not items: + return f"**{title}:** None\n\n" + out = f"**{title}:**\n" + for x in items: + out += f"- {md_escape(str(x))}\n" + return out + "\n" + +def main() -> None: + diff = read_text(DIFF_PATH) + guidelines = read_text(GUIDELINES_PATH) + + if not diff.strip(): + OUT_MD.write_text("### Local Multi-Agent AI Review\n\nNo diff found.\n", encoding="utf-8") + return + + diff = truncate(diff, MAX_DIFF_CHARS) + + results: List[Tuple[Agent, Optional[Dict[str, Any]], Optional[str], str]] = [] + # tuple: (agent, json, json_error, raw) + + for agent in AGENTS: + prompt = build_prompt(agent, guidelines, diff) + raw = run_ollama(MODEL, prompt) + data, err = safe_json_loads(raw) + results.append((agent, data, err, raw)) + + # Build consolidated markdown + md = "### Local Multi-Agent AI Review\n\n" + md += f"Model: `{MODEL}`\n\n" + if guidelines.strip(): + md += "_Project guidelines were provided._\n\n" + + # High-level rollup + md += "## Rollup\n\n" + rollup_lines = [] + for agent, data, err, raw in results: + if data: + s = md_escape(str(data.get("summary", ""))) + rollup_lines.append(f"- **{agent.name}:** {s}") + else: + rollup_lines.append(f"- **{agent.name}:** ⚠️ invalid JSON output ({err})") + md += "\n".join(rollup_lines) + "\n\n" + + # Detailed sections + for agent, data, err, raw in results: + md += f"---\n\n## {agent.name}\n\n" + if not data: + md += "**⚠️ Agent returned invalid JSON.**\n\n" + md += "Raw output (truncated):\n\n```text\n" + md += truncate(raw, 3000) + md += "\n```\n\n" + continue + + md += f"**Summary:** {md_escape(str(data.get('summary','')))}\n\n" + md += fmt_issue_list("Blocking", data.get("blocking", []) or []) + md += fmt_issue_list("Non-blocking", data.get("non_blocking", []) or []) + md += fmt_list("Tests to add", data.get("tests_to_add", []) or []) + md += fmt_list("Questions", data.get("questions", []) or []) + + OUT_MD.write_text(md, encoding="utf-8") + +if __name__ == "__main__": + main() diff --git a/docs/review_guidelines.md b/docs/review_guidelines.md new file mode 100644 index 0000000..4c1fa73 --- /dev/null +++ b/docs/review_guidelines.md @@ -0,0 +1,6 @@ +# Review Guidelines + +- Kotlin + coroutines allowed. +- Processing is at-least-once; handlers must be idempotent. +- Postgres is source of truth; raw events are immutable; corrections create new versions. +- External calls must have timeouts + retries + backoff. \ No newline at end of file From 601617f249cf2ade9c058a26a8a87773dd322af5 Mon Sep 17 00:00:00 2001 From: Valerii Burmistrov Date: Wed, 21 Jan 2026 11:06:10 +0100 Subject: [PATCH 04/17] Add debug step for PATH and Python location Added a step to debug the PATH environment variable and locate Python. --- .github/workflows/review.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/review.yml b/.github/workflows/review.yml index b86e7fc..006bf8d 100644 --- a/.github/workflows/review.yml +++ b/.github/workflows/review.yml @@ -28,6 +28,12 @@ jobs: git diff --unified=3 origin/${{ github.base_ref }}...HEAD > diff.txt Get-Content diff.txt | Measure-Object -Line -Word -Character | Format-List + - name: Debug PATH + shell: pwsh + run: | + echo $env:PATH + where.exe python + - name: Run multi-agent local review shell: pwsh env: @@ -40,4 +46,4 @@ jobs: env: GH_TOKEN: ${{ github.token }} run: | - gh pr comment ${{ github.event.pull_request.number }} --body-file review_comment.md \ No newline at end of file + gh pr comment ${{ github.event.pull_request.number }} --body-file review_comment.md From 75e48ee0da93c93e0ed0499c67c3dcdcfa1b0130 Mon Sep 17 00:00:00 2001 From: Valerii Burmistrov Date: Wed, 21 Jan 2026 11:09:32 +0100 Subject: [PATCH 05/17] Update Python path command in review.yml --- .github/workflows/review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/review.yml b/.github/workflows/review.yml index 006bf8d..432eb63 100644 --- a/.github/workflows/review.yml +++ b/.github/workflows/review.yml @@ -32,7 +32,7 @@ jobs: shell: pwsh run: | echo $env:PATH - where.exe python + where python - name: Run multi-agent local review shell: pwsh From 777ecfdd556ba10bc68b1b6d2a73eeaa45660b28 Mon Sep 17 00:00:00 2001 From: Valerii Burmistrov Date: Wed, 21 Jan 2026 11:25:04 +0100 Subject: [PATCH 06/17] Enhance Python debugging in review workflow Updated the debug step to provide more detailed Python resolution checks. --- .github/workflows/review.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/review.yml b/.github/workflows/review.yml index 432eb63..d25253a 100644 --- a/.github/workflows/review.yml +++ b/.github/workflows/review.yml @@ -28,11 +28,15 @@ jobs: git diff --unified=3 origin/${{ github.base_ref }}...HEAD > diff.txt Get-Content diff.txt | Measure-Object -Line -Word -Character | Format-List - - name: Debug PATH + - name: Debug Python resolution shell: pwsh run: | - echo $env:PATH - where python + echo "PATH=$env:PATH" + where.exe python || echo "where.exe can't find python" + Get-Command python -All || echo "Get-Command can't find python" + Test-Path "C:\Users\zxc\AppData\Local\Programs\Python\Python314\python.exe" + Get-ChildItem "C:\Users\zxc\AppData\Local\Programs\Python\Python314\" -Filter python*.exe -ErrorAction SilentlyContinue + - name: Run multi-agent local review shell: pwsh From c45d18009844a367c4ef6e0499472195cefa6bae Mon Sep 17 00:00:00 2001 From: valerii Date: Wed, 21 Jan 2026 12:54:06 +0100 Subject: [PATCH 07/17] ollama rest api call instead of a subprocess --- .github/scripts/multi_agent_review.py | 43 +++++++++++++++++++-------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index 0419ef6..29d37ce 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional, Tuple +import urllib.request DIFF_PATH = Path("diff.txt") GUIDELINES_PATH = Path("docs/review_guidelines.md") @@ -11,6 +12,7 @@ MODEL = os.environ.get("MODEL", "qwen2.5:14b-instruct") MAX_DIFF_CHARS = int(os.environ.get("MAX_DIFF_CHARS", "80000")) +OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434") # --- Utilities --- @@ -20,18 +22,26 @@ def read_text(path: Path) -> str: return "" def run_ollama(model: str, prompt: str) -> str: - # Use stdin to avoid CLI arg limits on Windows. - p = subprocess.run( - ["ollama", "run", model], - input=prompt.encode("utf-8"), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=False, + payload = { + "model": model, + "prompt": prompt, + "stream": False, + } + + req = urllib.request.Request( + f"{OLLAMA_URL}/api/generate", + data=json.dumps(payload).encode("utf-8"), + headers={"Content-Type": "application/json"}, + method="POST", ) - if p.returncode != 0: - err = p.stderr.decode("utf-8", errors="ignore") - raise RuntimeError(f"Ollama failed (code={p.returncode}): {err[:2000]}") - return p.stdout.decode("utf-8", errors="ignore").strip() + + try: + with urllib.request.urlopen(req, timeout=300) as resp: + body = resp.read().decode("utf-8", errors="ignore") + data = json.loads(body) + return data.get("response", "").strip() + except Exception as e: + raise RuntimeError(f"Ollama HTTP call failed: {e}") def safe_json_loads(s: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: try: @@ -39,15 +49,18 @@ def safe_json_loads(s: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: except Exception as e: return None, str(e) + def truncate(s: str, limit: int) -> str: if len(s) <= limit: return s return s[:limit] + "\n\n[TRUNCATED]\n" + def md_escape(s: str) -> str: # Minimal escaping to avoid accidental markdown formatting explosions return s.replace("\r", "").strip() + # --- Agent definitions --- @dataclass @@ -55,6 +68,7 @@ class Agent: name: str focus: str + AGENTS: List[Agent] = [ Agent( name="Correctness & Reliability", @@ -103,6 +117,7 @@ class Agent: - Keep evidence concrete (file/line hints if visible in diff). """.strip() + def build_prompt(agent: Agent, guidelines: str, diff: str) -> str: return f""" SYSTEM: @@ -121,6 +136,7 @@ def build_prompt(agent: Agent, guidelines: str, diff: str) -> str: {JSON_SCHEMA} """.strip() + # --- Markdown report building --- def fmt_issue_list(title: str, issues: List[Dict[str, Any]]) -> str: @@ -134,6 +150,7 @@ def fmt_issue_list(title: str, issues: List[Dict[str, Any]]) -> str: out += f"- **{issue}**\n - Evidence: {evidence}\n - Fix: {fix}\n" return out + "\n" + def fmt_list(title: str, items: List[str]) -> str: if not items: return f"**{title}:** None\n\n" @@ -142,6 +159,7 @@ def fmt_list(title: str, items: List[str]) -> str: out += f"- {md_escape(str(x))}\n" return out + "\n" + def main() -> None: diff = read_text(DIFF_PATH) guidelines = read_text(GUIDELINES_PATH) @@ -188,7 +206,7 @@ def main() -> None: md += "\n```\n\n" continue - md += f"**Summary:** {md_escape(str(data.get('summary','')))}\n\n" + md += f"**Summary:** {md_escape(str(data.get('summary', '')))}\n\n" md += fmt_issue_list("Blocking", data.get("blocking", []) or []) md += fmt_issue_list("Non-blocking", data.get("non_blocking", []) or []) md += fmt_list("Tests to add", data.get("tests_to_add", []) or []) @@ -196,5 +214,6 @@ def main() -> None: OUT_MD.write_text(md, encoding="utf-8") + if __name__ == "__main__": main() From 294d3eb089064739fd6341dee877933c1c147f62 Mon Sep 17 00:00:00 2001 From: valerii Date: Wed, 21 Jan 2026 19:48:56 +0100 Subject: [PATCH 08/17] incremental review --- .github/scripts/multi_agent_review.py | 81 +++++++++++++++++++++++++-- .github/workflows/review.yml | 53 ++++++++++++++---- 2 files changed, 117 insertions(+), 17 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index 29d37ce..514c6ee 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -14,6 +14,11 @@ MAX_DIFF_CHARS = int(os.environ.get("MAX_DIFF_CHARS", "80000")) OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434") +# Set in workflow (optional) by parsing previous AI comment: +# PREV_SHA= +PREV_SHA = (os.environ.get("PREV_SHA") or "").strip() + + # --- Utilities --- def read_text(path: Path) -> str: @@ -21,11 +26,32 @@ def read_text(path: Path) -> str: return path.read_text(encoding="utf-8", errors="ignore") return "" + +def git(*args: str) -> str: + p = subprocess.run( + ["git", *args], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + if p.returncode != 0: + err = p.stderr.decode("utf-8", errors="ignore") + raise RuntimeError(f"git {' '.join(args)} failed: {err[:2000]}") + return p.stdout.decode("utf-8", errors="ignore").strip() + + +def get_head_sha() -> str: + # Works in GHA runner after checkout + return git("rev-parse", "HEAD") + + def run_ollama(model: str, prompt: str) -> str: payload = { "model": model, "prompt": prompt, "stream": False, + # If you want to aggressively unload after each call: + # "keep_alive": 0, } req = urllib.request.Request( @@ -43,6 +69,7 @@ def run_ollama(model: str, prompt: str) -> str: except Exception as e: raise RuntimeError(f"Ollama HTTP call failed: {e}") + def safe_json_loads(s: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: try: return json.loads(s), None @@ -102,7 +129,9 @@ class Agent: # JSON schema agent must output JSON_SCHEMA = """ -Return ONLY valid JSON (no markdown, no extra text). +Return ONLY valid JSON. No markdown. No explanations. No extra keys. No trailing comments. +Any output outside JSON is considered a failure. + Schema: { "summary": "string (1-3 sentences)", @@ -111,17 +140,32 @@ class Agent: "tests_to_add": ["string"], "questions": ["string"] } + Rules: - Use only the provided DIFF and GUIDELINES. -- Do not guess. If uncertain, put it into "questions". +- Do not guess. If uncertain, put it into "questions" and set evidence to "unknown". - Keep evidence concrete (file/line hints if visible in diff). """.strip() -def build_prompt(agent: Agent, guidelines: str, diff: str) -> str: +def build_prompt(agent: Agent, guidelines: str, diff: str, mode: str, prev_sha: str, head_sha: str) -> str: + followup_hint = "" + if mode == "FOLLOW_UP": + followup_hint = f""" +FOLLOW-UP CONTEXT: +- This is a follow-up review after previous AI feedback. +- Previous reviewed HEAD SHA was: {prev_sha} +- Current HEAD SHA is: {head_sha} +- The DIFF is expected to contain ONLY changes since the previous review. +- Focus on verifying whether prior likely issues are addressed, and whether new issues were introduced. +""".strip() + return f""" SYSTEM: -You are a strict senior software engineer acting as a pull request reviewer. +You are a strict senior/staff+ software engineer acting as a pull request reviewer. + +MODE: {mode} +{followup_hint} TASK: {agent.focus} @@ -170,18 +214,39 @@ def main() -> None: diff = truncate(diff, MAX_DIFF_CHARS) + head_sha = get_head_sha() + mode = "FOLLOW_UP" if PREV_SHA else "INITIAL" + results: List[Tuple[Agent, Optional[Dict[str, Any]], Optional[str], str]] = [] # tuple: (agent, json, json_error, raw) for agent in AGENTS: - prompt = build_prompt(agent, guidelines, diff) + prompt = build_prompt( + agent=agent, + guidelines=guidelines, + diff=diff, + mode=mode, + prev_sha=PREV_SHA, + head_sha=head_sha, + ) raw = run_ollama(MODEL, prompt) data, err = safe_json_loads(raw) results.append((agent, data, err, raw)) # Build consolidated markdown - md = "### Local Multi-Agent AI Review\n\n" + title = "Local Multi-Agent AI Review" + if mode == "FOLLOW_UP": + title += " (Follow-up)" + else: + title += " (Initial)" + + md = f"### {title}\n\n" md += f"Model: `{MODEL}`\n\n" + md += f"Current HEAD: `{head_sha}`\n" + if PREV_SHA: + md += f"Previous reviewed HEAD: `{PREV_SHA}`\n" + md += "\n" + if guidelines.strip(): md += "_Project guidelines were provided._\n\n" @@ -212,6 +277,10 @@ def main() -> None: md += fmt_list("Tests to add", data.get("tests_to_add", []) or []) md += fmt_list("Questions", data.get("questions", []) or []) + # Marker for next run to pick up + md += "\n---\n\n" + md += f"Reviewed-Head-SHA: {head_sha}\n" + OUT_MD.write_text(md, encoding="utf-8") diff --git a/.github/workflows/review.yml b/.github/workflows/review.yml index d25253a..f888539 100644 --- a/.github/workflows/review.yml +++ b/.github/workflows/review.yml @@ -22,26 +22,51 @@ jobs: run: | git fetch origin ${{ github.base_ref }} --depth=1 - - name: Build diff + - name: Get previous AI review SHA (if any) shell: pwsh + env: + GH_TOKEN: ${{ github.token }} run: | - git diff --unified=3 origin/${{ github.base_ref }}...HEAD > diff.txt - Get-Content diff.txt | Measure-Object -Line -Word -Character | Format-List + $pr = "${{ github.event.pull_request.number }}" + $comments = gh api repos/${{ github.repository }}/issues/$pr/comments | ConvertFrom-Json + $ai = $comments | Where-Object { $_.body -match "Reviewed-Head-SHA:" } | Select-Object -Last 1 + if ($null -ne $ai) { + $m = [regex]::Match($ai.body, "Reviewed-Head-SHA:\s*([0-9a-fA-F]+)") + if ($m.Success) { + "PREV_SHA=$($m.Groups[1].Value)" | Out-File -FilePath $env:GITHUB_ENV -Append + Write-Host "Found previous review SHA: $($m.Groups[1].Value)" + } + } else { + Write-Host "No previous AI review comment found." + } - - name: Debug Python resolution + - name: Build diff (incremental if possible) shell: pwsh run: | - echo "PATH=$env:PATH" - where.exe python || echo "where.exe can't find python" - Get-Command python -All || echo "Get-Command can't find python" - Test-Path "C:\Users\zxc\AppData\Local\Programs\Python\Python314\python.exe" - Get-ChildItem "C:\Users\zxc\AppData\Local\Programs\Python\Python314\" -Filter python*.exe -ErrorAction SilentlyContinue + git fetch origin ${{ github.base_ref }} --depth=1 + + if ($env:PREV_SHA) { + # Validate that PREV_SHA exists in this checkout (force-push can break it) + git cat-file -e "$env:PREV_SHA^{commit}" 2>$null + if ($LASTEXITCODE -eq 0) { + git diff --unified=3 $env:PREV_SHA...HEAD > diff.txt + Write-Host "Using incremental diff: $env:PREV_SHA...HEAD" + } else { + Write-Host "PREV_SHA not found locally (maybe force-push). Falling back to full diff." + git diff --unified=3 origin/${{ github.base_ref }}...HEAD > diff.txt + } + } else { + git diff --unified=3 origin/${{ github.base_ref }}...HEAD > diff.txt + Write-Host "Using full PR diff against base." + } + Get-Content diff.txt | Measure-Object -Line -Word -Character | Format-List - name: Run multi-agent local review shell: pwsh env: - MODEL: qwen2.5.:14b-instruct + MODEL: qwen2.5:14b-instruct + MAX_DIFF_CHARS: "80000" run: | python .github/scripts/multi_agent_review.py @@ -50,4 +75,10 @@ jobs: env: GH_TOKEN: ${{ github.token }} run: | - gh pr comment ${{ github.event.pull_request.number }} --body-file review_comment.md + $pr = "${{ github.event.pull_request.number }}" + # Try to edit last comment; if it fails, create a new one. + gh pr comment $pr --edit-last --body-file review_comment.md + if ($LASTEXITCODE -ne 0) { + Write-Host "Edit-last failed, creating a new comment." + gh pr comment $pr --body-file review_comment.md + } From 4ed668217a147fad7da3a8e74ffb2cb2f6fbf184 Mon Sep 17 00:00:00 2001 From: valerii Date: Wed, 21 Jan 2026 20:35:15 +0100 Subject: [PATCH 09/17] incremental review --- .github/scripts/multi_agent_review.py | 2 +- .github/workflows/review.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index 514c6ee..e2d19dc 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -279,7 +279,7 @@ def main() -> None: # Marker for next run to pick up md += "\n---\n\n" - md += f"Reviewed-Head-SHA: {head_sha}\n" + md += f"" OUT_MD.write_text(md, encoding="utf-8") diff --git a/.github/workflows/review.yml b/.github/workflows/review.yml index f888539..995ba6c 100644 --- a/.github/workflows/review.yml +++ b/.github/workflows/review.yml @@ -31,7 +31,7 @@ jobs: $comments = gh api repos/${{ github.repository }}/issues/$pr/comments | ConvertFrom-Json $ai = $comments | Where-Object { $_.body -match "Reviewed-Head-SHA:" } | Select-Object -Last 1 if ($null -ne $ai) { - $m = [regex]::Match($ai.body, "Reviewed-Head-SHA:\s*([0-9a-fA-F]+)") + $m = [regex]::Match($ai.body, "AI_REVIEW:HEAD_SHA=([0-9a-fA-F]{7,40})") if ($m.Success) { "PREV_SHA=$($m.Groups[1].Value)" | Out-File -FilePath $env:GITHUB_ENV -Append Write-Host "Found previous review SHA: $($m.Groups[1].Value)" From cce9b1676d8f60f4814dee63c9610b5aedf6f6da Mon Sep 17 00:00:00 2001 From: valerii Date: Wed, 21 Jan 2026 21:18:28 +0100 Subject: [PATCH 10/17] curator introduced --- .github/scripts/multi_agent_review.py | 210 +++++++++++++++++++++----- 1 file changed, 171 insertions(+), 39 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index e2d19dc..cd5c742 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -11,12 +11,15 @@ OUT_MD = Path("review_comment.md") MODEL = os.environ.get("MODEL", "qwen2.5:14b-instruct") +# Optional: use a cheaper/faster model for the curator step +CURATOR_MODEL = os.environ.get("CURATOR_MODEL", MODEL) + MAX_DIFF_CHARS = int(os.environ.get("MAX_DIFF_CHARS", "80000")) -OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434") +MAX_PREV_REVIEW_CHARS = int(os.environ.get("MAX_PREV_REVIEW_CHARS", "12000")) -# Set in workflow (optional) by parsing previous AI comment: -# PREV_SHA= +OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434") PREV_SHA = (os.environ.get("PREV_SHA") or "").strip() +PREV_REVIEW_PATH = Path(os.environ.get("PREV_REVIEW_PATH", "prev_review.txt")) # optional # --- Utilities --- @@ -41,11 +44,21 @@ def git(*args: str) -> str: def get_head_sha() -> str: - # Works in GHA runner after checkout return git("rev-parse", "HEAD") -def run_ollama(model: str, prompt: str) -> str: +def truncate(s: str, limit: int) -> str: + if len(s) <= limit: + return s + return s[:limit] + "\n\n[TRUNCATED]\n" + + +def md_escape(s: str) -> str: + # Minimal escaping to avoid accidental markdown formatting explosions + return s.replace("\r", "").strip() + + +def run_ollama(model: str, prompt: str, timeout_s: int = 300) -> str: payload = { "model": model, "prompt": prompt, @@ -62,7 +75,7 @@ def run_ollama(model: str, prompt: str) -> str: ) try: - with urllib.request.urlopen(req, timeout=300) as resp: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: body = resp.read().decode("utf-8", errors="ignore") data = json.loads(body) return data.get("response", "").strip() @@ -77,15 +90,20 @@ def safe_json_loads(s: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: return None, str(e) -def truncate(s: str, limit: int) -> str: - if len(s) <= limit: - return s - return s[:limit] + "\n\n[TRUNCATED]\n" +def cap_list(x: Any, max_items: int) -> List[Any]: + if not isinstance(x, list): + return [] + return x[:max_items] -def md_escape(s: str) -> str: - # Minimal escaping to avoid accidental markdown formatting explosions - return s.replace("\r", "").strip() +def cap_issues(x: Any, max_items: int) -> List[Dict[str, Any]]: + if not isinstance(x, list): + return [] + out = [] + for it in x[:max_items]: + if isinstance(it, dict): + out.append(it) + return out # --- Agent definitions --- @@ -127,20 +145,26 @@ class Agent: ), ] -# JSON schema agent must output + +# JSON schema for each agent (strict + short) JSON_SCHEMA = """ -Return ONLY valid JSON. No markdown. No explanations. No extra keys. No trailing comments. -Any output outside JSON is considered a failure. +Return ONLY valid JSON. No markdown. No explanations. No extra keys. Schema: { - "summary": "string (1-3 sentences)", + "summary": "string (max 2 sentences)", "blocking": [{"issue":"string","evidence":"string","fix":"string"}], "non_blocking": [{"issue":"string","evidence":"string","fix":"string"}], "tests_to_add": ["string"], "questions": ["string"] } +Hard limits: +- blocking: max 2 items +- non_blocking: max 3 items +- tests_to_add: max 6 items +- questions: max 5 items + Rules: - Use only the provided DIFF and GUIDELINES. - Do not guess. If uncertain, put it into "questions" and set evidence to "unknown". @@ -148,6 +172,27 @@ class Agent: """.strip() +CURATOR_SCHEMA = """ +Return ONLY valid JSON. No markdown. No explanations. No extra keys. + +Schema: +{ + "short_summary": ["string (max 3 bullets)"], + "top_actions": ["string (max 5 items)"], + "resolved": ["string (max 5 items)"], + "still_open": ["string (max 5 items)"], + "new_risks": ["string (max 5 items)"] +} + +Rules: +- Deduplicate and prioritize. +- Prefer concrete, actionable items. +- If evidence is weak/unknown, downgrade or omit it. +- If mode is INITIAL: resolved/still_open can be empty. +- If mode is FOLLOW_UP: classify items as resolved/still_open/new_risks when possible. +""".strip() + + def build_prompt(agent: Agent, guidelines: str, diff: str, mode: str, prev_sha: str, head_sha: str) -> str: followup_hint = "" if mode == "FOLLOW_UP": @@ -157,7 +202,7 @@ def build_prompt(agent: Agent, guidelines: str, diff: str, mode: str, prev_sha: - Previous reviewed HEAD SHA was: {prev_sha} - Current HEAD SHA is: {head_sha} - The DIFF is expected to contain ONLY changes since the previous review. -- Focus on verifying whether prior likely issues are addressed, and whether new issues were introduced. +- Focus on what changed, what got resolved, and what new issues were introduced. """.strip() return f""" @@ -181,7 +226,34 @@ def build_prompt(agent: Agent, guidelines: str, diff: str, mode: str, prev_sha: """.strip() -# --- Markdown report building --- +def build_curator_prompt( + mode: str, + prev_sha: str, + head_sha: str, + agent_json: List[Dict[str, Any]], + prev_review_text: str, +) -> str: + payload = { + "mode": mode, + "previous_reviewed_sha": prev_sha or None, + "current_sha": head_sha, + "agent_reviews": agent_json, + "previous_review_text": truncate(prev_review_text, MAX_PREV_REVIEW_CHARS) if prev_review_text else "", + } + + return f""" +SYSTEM: +You are a lead reviewer (staff+). Merge multiple agent reviews into a short, actionable PR comment. + +INPUT (JSON): +{json.dumps(payload, ensure_ascii=False)} + +OUTPUT INSTRUCTIONS: +{CURATOR_SCHEMA} +""".strip() + + +# --- Markdown building --- def fmt_issue_list(title: str, issues: List[Dict[str, Any]]) -> str: if not issues: @@ -204,9 +276,16 @@ def fmt_list(title: str, items: List[str]) -> str: return out + "\n" +def fmt_bullets(items: List[str]) -> str: + if not items: + return "- None\n" + return "".join([f"- {md_escape(str(x))}\n" for x in items]) + + def main() -> None: diff = read_text(DIFF_PATH) guidelines = read_text(GUIDELINES_PATH) + prev_review_text = read_text(PREV_REVIEW_PATH) if not diff.strip(): OUT_MD.write_text("### Local Multi-Agent AI Review\n\nNo diff found.\n", encoding="utf-8") @@ -217,31 +296,60 @@ def main() -> None: head_sha = get_head_sha() mode = "FOLLOW_UP" if PREV_SHA else "INITIAL" + # Run specialist agents results: List[Tuple[Agent, Optional[Dict[str, Any]], Optional[str], str]] = [] - # tuple: (agent, json, json_error, raw) + agent_payloads: List[Dict[str, Any]] = [] for agent in AGENTS: - prompt = build_prompt( - agent=agent, - guidelines=guidelines, - diff=diff, - mode=mode, - prev_sha=PREV_SHA, - head_sha=head_sha, - ) + prompt = build_prompt(agent, guidelines, diff, mode, PREV_SHA, head_sha) raw = run_ollama(MODEL, prompt) data, err = safe_json_loads(raw) + + # Normalize & cap to keep things tight even if model violates limits + if data: + data["summary"] = str(data.get("summary", ""))[:500] + data["blocking"] = cap_issues(data.get("blocking"), 2) + data["non_blocking"] = cap_issues(data.get("non_blocking"), 3) + data["tests_to_add"] = cap_list(data.get("tests_to_add"), 6) + data["questions"] = cap_list(data.get("questions"), 5) + + agent_payloads.append({ + "agent": agent.name, + "summary": data.get("summary", ""), + "blocking": data.get("blocking", []), + "non_blocking": data.get("non_blocking", []), + "tests_to_add": data.get("tests_to_add", []), + "questions": data.get("questions", []), + }) + results.append((agent, data, err, raw)) - # Build consolidated markdown - title = "Local Multi-Agent AI Review" - if mode == "FOLLOW_UP": - title += " (Follow-up)" + # Curator step + curator_prompt = build_curator_prompt(mode, PREV_SHA, head_sha, agent_payloads, prev_review_text) + curator_raw = run_ollama(CURATOR_MODEL, curator_prompt) + curator, curator_err = safe_json_loads(curator_raw) + + if curator: + curator["short_summary"] = cap_list(curator.get("short_summary"), 3) + curator["top_actions"] = cap_list(curator.get("top_actions"), 5) + curator["resolved"] = cap_list(curator.get("resolved"), 5) + curator["still_open"] = cap_list(curator.get("still_open"), 5) + curator["new_risks"] = cap_list(curator.get("new_risks"), 5) else: - title += " (Initial)" + curator = { + "short_summary": [f"⚠️ Curator returned invalid JSON: {curator_err}"], + "top_actions": [], + "resolved": [], + "still_open": [], + "new_risks": [], + } + + # Build markdown + title = "Local Multi-Agent AI Review" + title += " (Follow-up)" if mode == "FOLLOW_UP" else " (Initial)" md = f"### {title}\n\n" - md += f"Model: `{MODEL}`\n\n" + md += f"Models: specialists=`{MODEL}`, curator=`{CURATOR_MODEL}`\n\n" md += f"Current HEAD: `{head_sha}`\n" if PREV_SHA: md += f"Previous reviewed HEAD: `{PREV_SHA}`\n" @@ -250,7 +358,25 @@ def main() -> None: if guidelines.strip(): md += "_Project guidelines were provided._\n\n" - # High-level rollup + md += "## Curated summary\n\n" + md += fmt_bullets(curator.get("short_summary", [])) + "\n" + + md += "### Top actions\n\n" + md += fmt_bullets(curator.get("top_actions", [])) + "\n" + + if mode == "FOLLOW_UP": + md += "### Resolved\n\n" + md += fmt_bullets(curator.get("resolved", [])) + "\n" + + md += "### Still open\n\n" + md += fmt_bullets(curator.get("still_open", [])) + "\n" + + md += "### New risks\n\n" + md += fmt_bullets(curator.get("new_risks", [])) + "\n" + + # Keep details collapsible to reduce noise in PR + md += "
\nAgent details\n\n" + md += "## Rollup\n\n" rollup_lines = [] for agent, data, err, raw in results: @@ -261,7 +387,6 @@ def main() -> None: rollup_lines.append(f"- **{agent.name}:** ⚠️ invalid JSON output ({err})") md += "\n".join(rollup_lines) + "\n\n" - # Detailed sections for agent, data, err, raw in results: md += f"---\n\n## {agent.name}\n\n" if not data: @@ -277,9 +402,16 @@ def main() -> None: md += fmt_list("Tests to add", data.get("tests_to_add", []) or []) md += fmt_list("Questions", data.get("questions", []) or []) - # Marker for next run to pick up - md += "\n---\n\n" - md += f"" + # Curator raw (optional debug) + md += "\n---\n\n## Curator (debug)\n\n" + if curator_raw: + md += "```json\n" + truncate(curator_raw, 3000) + "\n```\n" + + md += "\n
\n\n" + + # Stable marker for next run to pick up + md += "---\n\n" + md += f"\n" OUT_MD.write_text(md, encoding="utf-8") From 41f6f3532beeba13c3f45874e10b38cb9cc049af Mon Sep 17 00:00:00 2001 From: valerii Date: Fri, 23 Jan 2026 14:50:26 +0100 Subject: [PATCH 11/17] review adjustments --- .github/scripts/multi_agent_review.py | 64 ++++++++++++++++----------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index cd5c742..0b7afae 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -11,7 +11,6 @@ OUT_MD = Path("review_comment.md") MODEL = os.environ.get("MODEL", "qwen2.5:14b-instruct") -# Optional: use a cheaper/faster model for the curator step CURATOR_MODEL = os.environ.get("CURATOR_MODEL", MODEL) MAX_DIFF_CHARS = int(os.environ.get("MAX_DIFF_CHARS", "80000")) @@ -20,6 +19,7 @@ OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434") PREV_SHA = (os.environ.get("PREV_SHA") or "").strip() PREV_REVIEW_PATH = Path(os.environ.get("PREV_REVIEW_PATH", "prev_review.txt")) # optional +SHOW_DEBUG = (os.environ.get("SHOW_DEBUG") or "").lower() in ("1", "true", "yes") # --- Utilities --- @@ -54,7 +54,6 @@ def truncate(s: str, limit: int) -> str: def md_escape(s: str) -> str: - # Minimal escaping to avoid accidental markdown formatting explosions return s.replace("\r", "").strip() @@ -63,8 +62,7 @@ def run_ollama(model: str, prompt: str, timeout_s: int = 300) -> str: "model": model, "prompt": prompt, "stream": False, - # If you want to aggressively unload after each call: - # "keep_alive": 0, + # "keep_alive": 0, # uncomment if you want to unload model after each call } req = urllib.request.Request( @@ -99,13 +97,26 @@ def cap_list(x: Any, max_items: int) -> List[Any]: def cap_issues(x: Any, max_items: int) -> List[Dict[str, Any]]: if not isinstance(x, list): return [] - out = [] + out: List[Dict[str, Any]] = [] for it in x[:max_items]: if isinstance(it, dict): out.append(it) return out +def drop_weak_issues(issues: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Remove issues with weak evidence. This is critical to reduce hallucinations. + """ + out: List[Dict[str, Any]] = [] + for it in issues: + ev = str(it.get("evidence", "")).strip().lower() + if not ev or ev == "unknown": + continue + out.append(it) + return out + + # --- Agent definitions --- @dataclass @@ -146,7 +157,6 @@ class Agent: ] -# JSON schema for each agent (strict + short) JSON_SCHEMA = """ Return ONLY valid JSON. No markdown. No explanations. No extra keys. @@ -169,6 +179,7 @@ class Agent: - Use only the provided DIFF and GUIDELINES. - Do not guess. If uncertain, put it into "questions" and set evidence to "unknown". - Keep evidence concrete (file/line hints if visible in diff). +- If you cannot cite evidence from the DIFF, set evidence to "unknown" AND prefer putting it into "questions" instead of issues. """.strip() @@ -177,8 +188,8 @@ class Agent: Schema: { - "short_summary": ["string (max 3 bullets)"], - "top_actions": ["string (max 5 items)"], + "short_summary": ["string (max 2 bullets)"], + "top_actions": ["string (max 3 items)"], "resolved": ["string (max 5 items)"], "still_open": ["string (max 5 items)"], "new_risks": ["string (max 5 items)"] @@ -198,7 +209,6 @@ def build_prompt(agent: Agent, guidelines: str, diff: str, mode: str, prev_sha: if mode == "FOLLOW_UP": followup_hint = f""" FOLLOW-UP CONTEXT: -- This is a follow-up review after previous AI feedback. - Previous reviewed HEAD SHA was: {prev_sha} - Current HEAD SHA is: {head_sha} - The DIFF is expected to contain ONLY changes since the previous review. @@ -243,7 +253,7 @@ def build_curator_prompt( return f""" SYSTEM: -You are a lead reviewer (staff+). Merge multiple agent reviews into a short, actionable PR comment. +You are a lead reviewer (staff+). Merge multiple agent reviews into a SHORT, actionable PR comment. INPUT (JSON): {json.dumps(payload, ensure_ascii=False)} @@ -305,19 +315,22 @@ def main() -> None: raw = run_ollama(MODEL, prompt) data, err = safe_json_loads(raw) - # Normalize & cap to keep things tight even if model violates limits if data: data["summary"] = str(data.get("summary", ""))[:500] - data["blocking"] = cap_issues(data.get("blocking"), 2) - data["non_blocking"] = cap_issues(data.get("non_blocking"), 3) + + blocking = drop_weak_issues(cap_issues(data.get("blocking"), 2)) + non_blocking = drop_weak_issues(cap_issues(data.get("non_blocking"), 3)) + + data["blocking"] = blocking + data["non_blocking"] = non_blocking data["tests_to_add"] = cap_list(data.get("tests_to_add"), 6) data["questions"] = cap_list(data.get("questions"), 5) agent_payloads.append({ "agent": agent.name, "summary": data.get("summary", ""), - "blocking": data.get("blocking", []), - "non_blocking": data.get("non_blocking", []), + "blocking": blocking, + "non_blocking": non_blocking, "tests_to_add": data.get("tests_to_add", []), "questions": data.get("questions", []), }) @@ -330,8 +343,8 @@ def main() -> None: curator, curator_err = safe_json_loads(curator_raw) if curator: - curator["short_summary"] = cap_list(curator.get("short_summary"), 3) - curator["top_actions"] = cap_list(curator.get("top_actions"), 5) + curator["short_summary"] = cap_list(curator.get("short_summary"), 2) + curator["top_actions"] = cap_list(curator.get("top_actions"), 3) curator["resolved"] = cap_list(curator.get("resolved"), 5) curator["still_open"] = cap_list(curator.get("still_open"), 5) curator["new_risks"] = cap_list(curator.get("new_risks"), 5) @@ -361,20 +374,20 @@ def main() -> None: md += "## Curated summary\n\n" md += fmt_bullets(curator.get("short_summary", [])) + "\n" - md += "### Top actions\n\n" + md += "## Top actions\n\n" md += fmt_bullets(curator.get("top_actions", [])) + "\n" if mode == "FOLLOW_UP": - md += "### Resolved\n\n" + md += "## Resolved\n\n" md += fmt_bullets(curator.get("resolved", [])) + "\n" - md += "### Still open\n\n" + md += "## Still open\n\n" md += fmt_bullets(curator.get("still_open", [])) + "\n" - md += "### New risks\n\n" + md += "## New risks\n\n" md += fmt_bullets(curator.get("new_risks", [])) + "\n" - # Keep details collapsible to reduce noise in PR + # Details md += "
\nAgent details\n\n" md += "## Rollup\n\n" @@ -402,14 +415,13 @@ def main() -> None: md += fmt_list("Tests to add", data.get("tests_to_add", []) or []) md += fmt_list("Questions", data.get("questions", []) or []) - # Curator raw (optional debug) - md += "\n---\n\n## Curator (debug)\n\n" - if curator_raw: + if SHOW_DEBUG: + md += "\n---\n\n## Curator (debug)\n\n" md += "```json\n" + truncate(curator_raw, 3000) + "\n```\n" md += "\n
\n\n" - # Stable marker for next run to pick up + # Stable marker for next run md += "---\n\n" md += f"\n" From 0d6d5786837f0eca990dbd97940a8356c10b3ecb Mon Sep 17 00:00:00 2001 From: valerii Date: Fri, 23 Jan 2026 15:02:49 +0100 Subject: [PATCH 12/17] review adjustments --- .github/scripts/multi_agent_review.py | 88 +++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index 0b7afae..6220ba5 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -62,7 +62,7 @@ def run_ollama(model: str, prompt: str, timeout_s: int = 300) -> str: "model": model, "prompt": prompt, "stream": False, - # "keep_alive": 0, # uncomment if you want to unload model after each call + # "keep_alive": 0, # uncomment to unload model after each call } req = urllib.request.Request( @@ -104,6 +104,31 @@ def cap_issues(x: Any, max_items: int) -> List[Dict[str, Any]]: return out +def extract_changed_files(diff: str) -> List[str]: + """ + Parse `git diff` output and extract changed file paths. We use these paths to: + - constrain model references + - filter hallucinated evidence + """ + files: List[str] = [] + for line in diff.splitlines(): + if line.startswith("diff --git "): + parts = line.split() + if len(parts) >= 4: + b = parts[3] # b/... + if b.startswith("b/"): + files.append(b[2:]) + + # dedupe, keep order + seen = set() + out: List[str] = [] + for f in files: + if f not in seen: + out.append(f) + seen.add(f) + return out + + def drop_weak_issues(issues: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Remove issues with weak evidence. This is critical to reduce hallucinations. @@ -117,6 +142,23 @@ def drop_weak_issues(issues: List[Dict[str, Any]]) -> List[Dict[str, Any]]: return out +def drop_issues_not_in_files(issues: List[Dict[str, Any]], allowed_files: List[str]) -> List[Dict[str, Any]]: + """ + If evidence doesn't mention any changed file, the model likely hallucinated the reference. + Drop it. + """ + if not allowed_files: + return issues + + allowed_lower = [f.lower() for f in allowed_files] + out: List[Dict[str, Any]] = [] + for it in issues: + ev = str(it.get("evidence", "")).lower() + if any(f in ev for f in allowed_lower): + out.append(it) + return out + + # --- Agent definitions --- @dataclass @@ -198,13 +240,22 @@ class Agent: Rules: - Deduplicate and prioritize. - Prefer concrete, actionable items. +- Each bullet/action should cite at least one file from `changed_files` OR be framed as a question. - If evidence is weak/unknown, downgrade or omit it. - If mode is INITIAL: resolved/still_open can be empty. - If mode is FOLLOW_UP: classify items as resolved/still_open/new_risks when possible. """.strip() -def build_prompt(agent: Agent, guidelines: str, diff: str, mode: str, prev_sha: str, head_sha: str) -> str: +def build_prompt( + agent: Agent, + guidelines: str, + diff: str, + mode: str, + prev_sha: str, + head_sha: str, + allowed_files: List[str], +) -> str: followup_hint = "" if mode == "FOLLOW_UP": followup_hint = f""" @@ -215,6 +266,8 @@ def build_prompt(agent: Agent, guidelines: str, diff: str, mode: str, prev_sha: - Focus on what changed, what got resolved, and what new issues were introduced. """.strip() + allowed_files_text = "\n".join(allowed_files) if allowed_files else "(none)" + return f""" SYSTEM: You are a strict senior/staff+ software engineer acting as a pull request reviewer. @@ -222,6 +275,13 @@ def build_prompt(agent: Agent, guidelines: str, diff: str, mode: str, prev_sha: MODE: {mode} {followup_hint} +ALLOWED_FILES (critical): +{allowed_files_text} + +Critical rules: +- You may ONLY reference file paths from ALLOWED_FILES. +- If you cannot point to evidence in the DIFF, DO NOT create an issue. Put it into "questions". + TASK: {agent.focus} @@ -242,11 +302,13 @@ def build_curator_prompt( head_sha: str, agent_json: List[Dict[str, Any]], prev_review_text: str, + changed_files: List[str], ) -> str: payload = { "mode": mode, "previous_reviewed_sha": prev_sha or None, "current_sha": head_sha, + "changed_files": changed_files, "agent_reviews": agent_json, "previous_review_text": truncate(prev_review_text, MAX_PREV_REVIEW_CHARS) if prev_review_text else "", } @@ -302,6 +364,7 @@ def main() -> None: return diff = truncate(diff, MAX_DIFF_CHARS) + changed_files = extract_changed_files(diff) head_sha = get_head_sha() mode = "FOLLOW_UP" if PREV_SHA else "INITIAL" @@ -311,15 +374,21 @@ def main() -> None: agent_payloads: List[Dict[str, Any]] = [] for agent in AGENTS: - prompt = build_prompt(agent, guidelines, diff, mode, PREV_SHA, head_sha) + prompt = build_prompt(agent, guidelines, diff, mode, PREV_SHA, head_sha, changed_files) raw = run_ollama(MODEL, prompt) data, err = safe_json_loads(raw) if data: data["summary"] = str(data.get("summary", ""))[:500] - blocking = drop_weak_issues(cap_issues(data.get("blocking"), 2)) - non_blocking = drop_weak_issues(cap_issues(data.get("non_blocking"), 3)) + blocking = drop_issues_not_in_files( + drop_weak_issues(cap_issues(data.get("blocking"), 2)), + changed_files + ) + non_blocking = drop_issues_not_in_files( + drop_weak_issues(cap_issues(data.get("non_blocking"), 3)), + changed_files + ) data["blocking"] = blocking data["non_blocking"] = non_blocking @@ -338,7 +407,14 @@ def main() -> None: results.append((agent, data, err, raw)) # Curator step - curator_prompt = build_curator_prompt(mode, PREV_SHA, head_sha, agent_payloads, prev_review_text) + curator_prompt = build_curator_prompt( + mode=mode, + prev_sha=PREV_SHA, + head_sha=head_sha, + agent_json=agent_payloads, + prev_review_text=prev_review_text, + changed_files=changed_files, + ) curator_raw = run_ollama(CURATOR_MODEL, curator_prompt) curator, curator_err = safe_json_loads(curator_raw) From db135c2fa6d963fcb76c2eb6bf25c3cdf313e7b9 Mon Sep 17 00:00:00 2001 From: valerii Date: Fri, 23 Jan 2026 15:08:40 +0100 Subject: [PATCH 13/17] review adjustments --- .github/scripts/multi_agent_review.py | 80 +++++++++++++++++++-------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index 6220ba5..d320dc0 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -19,7 +19,9 @@ OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434") PREV_SHA = (os.environ.get("PREV_SHA") or "").strip() PREV_REVIEW_PATH = Path(os.environ.get("PREV_REVIEW_PATH", "prev_review.txt")) # optional + SHOW_DEBUG = (os.environ.get("SHOW_DEBUG") or "").lower() in ("1", "true", "yes") +SHOW_AGENT_TESTS_QUESTIONS = (os.environ.get("SHOW_AGENT_TESTS_QUESTIONS") or "").lower() in ("1", "true", "yes") # --- Utilities --- @@ -106,9 +108,7 @@ def cap_issues(x: Any, max_items: int) -> List[Dict[str, Any]]: def extract_changed_files(diff: str) -> List[str]: """ - Parse `git diff` output and extract changed file paths. We use these paths to: - - constrain model references - - filter hallucinated evidence + Parse `git diff` output and extract changed file paths. """ files: List[str] = [] for line in diff.splitlines(): @@ -129,9 +129,19 @@ def extract_changed_files(diff: str) -> List[str]: return out +def filter_allowed_files(files: List[str]) -> List[str]: + """ + To prevent models from "reviewing guidelines" instead of code, + drop docs/ by default. If PR changes ONLY docs, allow them. + """ + dropped_prefixes = ("docs/",) + filtered = [f for f in files if not f.startswith(dropped_prefixes)] + return filtered if filtered else files + + def drop_weak_issues(issues: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ - Remove issues with weak evidence. This is critical to reduce hallucinations. + Remove issues with weak evidence. Critical to reduce hallucinations. """ out: List[Dict[str, Any]] = [] for it in issues: @@ -144,7 +154,7 @@ def drop_weak_issues(issues: List[Dict[str, Any]]) -> List[Dict[str, Any]]: def drop_issues_not_in_files(issues: List[Dict[str, Any]], allowed_files: List[str]) -> List[Dict[str, Any]]: """ - If evidence doesn't mention any changed file, the model likely hallucinated the reference. + If evidence doesn't mention any allowed file, the model likely hallucinated. Drop it. """ if not allowed_files: @@ -159,6 +169,15 @@ def drop_issues_not_in_files(issues: List[Dict[str, Any]], allowed_files: List[s return out +def must_mention_file(line: str, allowed_files: List[str]) -> bool: + """ + Helper for debugging curator output quality: + check whether a bullet mentions any allowed file. + """ + s = (line or "").lower() + return any(f.lower() in s for f in allowed_files) + + # --- Agent definitions --- @dataclass @@ -172,28 +191,32 @@ class Agent: name="Correctness & Reliability", focus=( "Find correctness risks: idempotency, retries/backoff, timeouts, transactional boundaries, " - "race conditions, partial failures, error handling, and data consistency." + "race conditions, partial failures, error handling, and data consistency. " + "Focus ONLY on code/workflow changes in ALLOWED_FILES." ), ), Agent( name="Architecture & Boundaries", focus=( "Review architecture: module boundaries, coupling, layering, dependency direction, " - "API contracts, naming of abstractions, and maintainability trade-offs." + "API contracts, naming of abstractions, and maintainability trade-offs. " + "Focus ONLY on code/workflow changes in ALLOWED_FILES." ), ), Agent( name="Tests & Observability", focus=( "Suggest tests and observability: unit/integration tests worth adding, edge cases, " - "logging/metrics/tracing, and how to reproduce failures." + "logging/metrics/tracing, and how to reproduce failures. " + "Do NOT review the guideline document itself; review only ALLOWED_FILES changes." ), ), Agent( name="Cost & LLM Discipline", focus=( "Look for cost/perf traps: unnecessary LLM calls, large payloads, missing caching, " - "missing state+delta pattern, and risk of GPU contention." + "missing state+delta pattern, and risk of GPU contention. " + "Focus ONLY on the actual automation code/workflow changes in ALLOWED_FILES." ), ), ] @@ -219,8 +242,7 @@ class Agent: Rules: - Use only the provided DIFF and GUIDELINES. -- Do not guess. If uncertain, put it into "questions" and set evidence to "unknown". -- Keep evidence concrete (file/line hints if visible in diff). +- You may ONLY reference file paths from ALLOWED_FILES. - If you cannot cite evidence from the DIFF, set evidence to "unknown" AND prefer putting it into "questions" instead of issues. """.strip() @@ -237,11 +259,11 @@ class Agent: "new_risks": ["string (max 5 items)"] } -Rules: +Hard rules: - Deduplicate and prioritize. -- Prefer concrete, actionable items. -- Each bullet/action should cite at least one file from `changed_files` OR be framed as a question. -- If evidence is weak/unknown, downgrade or omit it. +- Each short_summary/top_actions item MUST mention at least one file from `allowed_files` by name. + If you cannot tie it to a file, write it as a question (still mention a file that it relates to). +- If evidence is weak/unknown, omit it. - If mode is INITIAL: resolved/still_open can be empty. - If mode is FOLLOW_UP: classify items as resolved/still_open/new_risks when possible. """.strip() @@ -302,13 +324,13 @@ def build_curator_prompt( head_sha: str, agent_json: List[Dict[str, Any]], prev_review_text: str, - changed_files: List[str], + allowed_files: List[str], ) -> str: payload = { "mode": mode, "previous_reviewed_sha": prev_sha or None, "current_sha": head_sha, - "changed_files": changed_files, + "allowed_files": allowed_files, "agent_reviews": agent_json, "previous_review_text": truncate(prev_review_text, MAX_PREV_REVIEW_CHARS) if prev_review_text else "", } @@ -364,7 +386,9 @@ def main() -> None: return diff = truncate(diff, MAX_DIFF_CHARS) + changed_files = extract_changed_files(diff) + allowed_files = filter_allowed_files(changed_files) head_sha = get_head_sha() mode = "FOLLOW_UP" if PREV_SHA else "INITIAL" @@ -374,7 +398,7 @@ def main() -> None: agent_payloads: List[Dict[str, Any]] = [] for agent in AGENTS: - prompt = build_prompt(agent, guidelines, diff, mode, PREV_SHA, head_sha, changed_files) + prompt = build_prompt(agent, guidelines, diff, mode, PREV_SHA, head_sha, allowed_files) raw = run_ollama(MODEL, prompt) data, err = safe_json_loads(raw) @@ -383,11 +407,11 @@ def main() -> None: blocking = drop_issues_not_in_files( drop_weak_issues(cap_issues(data.get("blocking"), 2)), - changed_files + allowed_files, ) non_blocking = drop_issues_not_in_files( drop_weak_issues(cap_issues(data.get("non_blocking"), 3)), - changed_files + allowed_files, ) data["blocking"] = blocking @@ -413,7 +437,7 @@ def main() -> None: head_sha=head_sha, agent_json=agent_payloads, prev_review_text=prev_review_text, - changed_files=changed_files, + allowed_files=allowed_files, ) curator_raw = run_ollama(CURATOR_MODEL, curator_prompt) curator, curator_err = safe_json_loads(curator_raw) @@ -433,6 +457,13 @@ def main() -> None: "new_risks": [], } + # Optional: basic sanity check for curator outputs (debug only) + if SHOW_DEBUG and allowed_files: + for section in ("short_summary", "top_actions"): + for item in curator.get(section, []) or []: + if not must_mention_file(str(item), allowed_files): + print(f"[DEBUG] Curator {section} item missing file mention: {item}") + # Build markdown title = "Local Multi-Agent AI Review" title += " (Follow-up)" if mode == "FOLLOW_UP" else " (Initial)" @@ -463,7 +494,6 @@ def main() -> None: md += "## New risks\n\n" md += fmt_bullets(curator.get("new_risks", [])) + "\n" - # Details md += "
\nAgent details\n\n" md += "## Rollup\n\n" @@ -488,8 +518,10 @@ def main() -> None: md += f"**Summary:** {md_escape(str(data.get('summary', '')))}\n\n" md += fmt_issue_list("Blocking", data.get("blocking", []) or []) md += fmt_issue_list("Non-blocking", data.get("non_blocking", []) or []) - md += fmt_list("Tests to add", data.get("tests_to_add", []) or []) - md += fmt_list("Questions", data.get("questions", []) or []) + + if SHOW_AGENT_TESTS_QUESTIONS: + md += fmt_list("Tests to add", data.get("tests_to_add", []) or []) + md += fmt_list("Questions", data.get("questions", []) or []) if SHOW_DEBUG: md += "\n---\n\n## Curator (debug)\n\n" From 2008e73b2aea1c2b0dfcc1911d9346ec421b8d1c Mon Sep 17 00:00:00 2001 From: valerii Date: Fri, 23 Jan 2026 15:15:29 +0100 Subject: [PATCH 14/17] review adjustments --- .github/scripts/multi_agent_review.py | 84 +++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 12 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index d320dc0..f970ffc 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -23,6 +23,9 @@ SHOW_DEBUG = (os.environ.get("SHOW_DEBUG") or "").lower() in ("1", "true", "yes") SHOW_AGENT_TESTS_QUESTIONS = (os.environ.get("SHOW_AGENT_TESTS_QUESTIONS") or "").lower() in ("1", "true", "yes") +# Retry agent call once if we fail to parse JSON (common with local models) +RETRY_ON_BAD_JSON = (os.environ.get("RETRY_ON_BAD_JSON") or "").lower() in ("1", "true", "yes") + # --- Utilities --- @@ -83,9 +86,49 @@ def run_ollama(model: str, prompt: str, timeout_s: int = 300) -> str: raise RuntimeError(f"Ollama HTTP call failed: {e}") +def extract_first_json_object(s: str) -> Optional[str]: + """ + Extract the first top-level JSON object from a string. + Handles cases where the model wraps JSON in markdown fences or appends extra text. + """ + if not s: + return None + + stripped = s.strip() + + # Strip a leading markdown fence if present + if stripped.startswith("```"): + lines = stripped.splitlines() + # drop first fence line (``` or ```json) + lines = lines[1:] + # drop last fence line if present + if lines and lines[-1].strip().startswith("```"): + lines = lines[:-1] + stripped = "\n".join(lines).strip() + + start = stripped.find("{") + if start == -1: + return None + + depth = 0 + for i in range(start, len(stripped)): + ch = stripped[i] + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + return stripped[start:i + 1] + + return None + + def safe_json_loads(s: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: try: - return json.loads(s), None + js = extract_first_json_object(s) + if not js: + return None, "No JSON object found" + return json.loads(js), None except Exception as e: return None, str(e) @@ -170,10 +213,6 @@ def drop_issues_not_in_files(issues: List[Dict[str, Any]], allowed_files: List[s def must_mention_file(line: str, allowed_files: List[str]) -> bool: - """ - Helper for debugging curator output quality: - check whether a bullet mentions any allowed file. - """ s = (line or "").lower() return any(f.lower() in s for f in allowed_files) @@ -263,6 +302,7 @@ class Agent: - Deduplicate and prioritize. - Each short_summary/top_actions item MUST mention at least one file from `allowed_files` by name. If you cannot tie it to a file, write it as a question (still mention a file that it relates to). +- Do NOT recommend changing guidelines unless `docs/review_guidelines.md` is in allowed_files. - If evidence is weak/unknown, omit it. - If mode is INITIAL: resolved/still_open can be empty. - If mode is FOLLOW_UP: classify items as resolved/still_open/new_risks when possible. @@ -376,6 +416,28 @@ def fmt_bullets(items: List[str]) -> str: return "".join([f"- {md_escape(str(x))}\n" for x in items]) +def call_agent_with_optional_retry(model: str, prompt: str) -> Tuple[str, Optional[Dict[str, Any]], Optional[str]]: + """ + Returns: (raw, parsed_json_or_none, err_or_none) + If RETRY_ON_BAD_JSON is enabled, retries once with a stricter reminder. + """ + raw = run_ollama(model, prompt) + data, err = safe_json_loads(raw) + if data is not None: + return raw, data, None + + if not RETRY_ON_BAD_JSON: + return raw, None, err + + # retry once with a strict reminder (helps local models a lot) + retry_prompt = prompt + "\n\nREMINDER: OUTPUT ONLY VALID JSON. NO MARKDOWN. NO EXTRA TEXT." + raw2 = run_ollama(model, retry_prompt) + data2, err2 = safe_json_loads(raw2) + if data2 is not None: + return raw2, data2, None + return raw2, None, err2 + + def main() -> None: diff = read_text(DIFF_PATH) guidelines = read_text(GUIDELINES_PATH) @@ -399,8 +461,7 @@ def main() -> None: for agent in AGENTS: prompt = build_prompt(agent, guidelines, diff, mode, PREV_SHA, head_sha, allowed_files) - raw = run_ollama(MODEL, prompt) - data, err = safe_json_loads(raw) + raw, data, err = call_agent_with_optional_retry(MODEL, prompt) if data: data["summary"] = str(data.get("summary", ""))[:500] @@ -439,8 +500,7 @@ def main() -> None: prev_review_text=prev_review_text, allowed_files=allowed_files, ) - curator_raw = run_ollama(CURATOR_MODEL, curator_prompt) - curator, curator_err = safe_json_loads(curator_raw) + curator_raw, curator, curator_err = call_agent_with_optional_retry(CURATOR_MODEL, curator_prompt) if curator: curator["short_summary"] = cap_list(curator.get("short_summary"), 2) @@ -457,8 +517,8 @@ def main() -> None: "new_risks": [], } - # Optional: basic sanity check for curator outputs (debug only) - if SHOW_DEBUG and allowed_files: + # Optional sanity check (debug only) + if SHOW_DEBUG and allowed_files and curator and isinstance(curator, dict): for section in ("short_summary", "top_actions"): for item in curator.get(section, []) or []: if not must_mention_file(str(item), allowed_files): @@ -525,7 +585,7 @@ def main() -> None: if SHOW_DEBUG: md += "\n---\n\n## Curator (debug)\n\n" - md += "```json\n" + truncate(curator_raw, 3000) + "\n```\n" + md += "```text\n" + truncate(curator_raw, 3000) + "\n```\n" md += "\n
\n\n" From 11abec81539ed2d0d151facb2f1f1ef499406774 Mon Sep 17 00:00:00 2001 From: valerii Date: Fri, 23 Jan 2026 15:24:27 +0100 Subject: [PATCH 15/17] review adjustments --- .github/scripts/multi_agent_review.py | 96 ++++++++++++++------------- 1 file changed, 51 insertions(+), 45 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index f970ffc..abd2ff5 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -22,8 +22,6 @@ SHOW_DEBUG = (os.environ.get("SHOW_DEBUG") or "").lower() in ("1", "true", "yes") SHOW_AGENT_TESTS_QUESTIONS = (os.environ.get("SHOW_AGENT_TESTS_QUESTIONS") or "").lower() in ("1", "true", "yes") - -# Retry agent call once if we fail to parse JSON (common with local models) RETRY_ON_BAD_JSON = (os.environ.get("RETRY_ON_BAD_JSON") or "").lower() in ("1", "true", "yes") @@ -99,9 +97,7 @@ def extract_first_json_object(s: str) -> Optional[str]: # Strip a leading markdown fence if present if stripped.startswith("```"): lines = stripped.splitlines() - # drop first fence line (``` or ```json) - lines = lines[1:] - # drop last fence line if present + lines = lines[1:] # drop first fence line if lines and lines[-1].strip().startswith("```"): lines = lines[:-1] stripped = "\n".join(lines).strip() @@ -184,7 +180,7 @@ def filter_allowed_files(files: List[str]) -> List[str]: def drop_weak_issues(issues: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ - Remove issues with weak evidence. Critical to reduce hallucinations. + Remove issues with weak evidence. """ out: List[Dict[str, Any]] = [] for it in issues: @@ -198,7 +194,6 @@ def drop_weak_issues(issues: List[Dict[str, Any]]) -> List[Dict[str, Any]]: def drop_issues_not_in_files(issues: List[Dict[str, Any]], allowed_files: List[str]) -> List[Dict[str, Any]]: """ If evidence doesn't mention any allowed file, the model likely hallucinated. - Drop it. """ if not allowed_files: return issues @@ -217,6 +212,17 @@ def must_mention_file(line: str, allowed_files: List[str]) -> bool: return any(f.lower() in s for f in allowed_files) +def diff_facts(diff: str, allowed_files: List[str], max_lines: int = 60) -> str: + """ + Provide a minimal "facts" header to reduce model drift: + - list changed files + - show first N lines of diff + """ + files = "\n".join(allowed_files) if allowed_files else "(none)" + head = "\n".join(diff.splitlines()[:max_lines]) + return f"CHANGED_FILES:\n{files}\n\nDIFF_HEAD (first {max_lines} lines):\n{head}" + + # --- Agent definitions --- @dataclass @@ -279,10 +285,12 @@ class Agent: - tests_to_add: max 6 items - questions: max 5 items -Rules: -- Use only the provided DIFF and GUIDELINES. +Hard rules (facts): +- You MUST NOT mention technologies, files, functions, or problems that are not present in DIFF. +- If DIFF does not show Kotlin/coroutines/external calls/etc, do NOT mention them. +- GUIDELINES are for evaluation only; DIFF is the ONLY source of facts. - You may ONLY reference file paths from ALLOWED_FILES. -- If you cannot cite evidence from the DIFF, set evidence to "unknown" AND prefer putting it into "questions" instead of issues. +- If you cannot cite evidence from the DIFF, set evidence to "unknown" AND prefer putting it into "questions". """.strip() @@ -301,11 +309,10 @@ class Agent: Hard rules: - Deduplicate and prioritize. - Each short_summary/top_actions item MUST mention at least one file from `allowed_files` by name. - If you cannot tie it to a file, write it as a question (still mention a file that it relates to). + If you cannot tie it to a file, write it as a question (still mention a related file). - Do NOT recommend changing guidelines unless `docs/review_guidelines.md` is in allowed_files. -- If evidence is weak/unknown, omit it. -- If mode is INITIAL: resolved/still_open can be empty. -- If mode is FOLLOW_UP: classify items as resolved/still_open/new_risks when possible. +- Do NOT output "mega lists" of test categories. Each action should be a single concrete thing. +- Use DIFF as the only source of facts; do not invent Kotlin/coroutines/etc. """.strip() @@ -330,6 +337,8 @@ def build_prompt( allowed_files_text = "\n".join(allowed_files) if allowed_files else "(none)" + facts = diff_facts(diff, allowed_files, max_lines=60) + return f""" SYSTEM: You are a strict senior/staff+ software engineer acting as a pull request reviewer. @@ -340,14 +349,18 @@ def build_prompt( ALLOWED_FILES (critical): {allowed_files_text} +FACTS_FROM_DIFF (use for factual claims): +{facts} + Critical rules: +- DIFF is the ONLY source of facts. GUIDELINES are only for judging what you see in DIFF. - You may ONLY reference file paths from ALLOWED_FILES. -- If you cannot point to evidence in the DIFF, DO NOT create an issue. Put it into "questions". +- You MUST NOT mention technologies/files not present in DIFF. If not shown, don't say it. TASK: {agent.focus} -GUIDELINES: +GUIDELINES (evaluation only): {guidelines if guidelines else "(none provided)"} DIFF: @@ -365,6 +378,7 @@ def build_curator_prompt( agent_json: List[Dict[str, Any]], prev_review_text: str, allowed_files: List[str], + diff: str, ) -> str: payload = { "mode": mode, @@ -373,6 +387,7 @@ def build_curator_prompt( "allowed_files": allowed_files, "agent_reviews": agent_json, "previous_review_text": truncate(prev_review_text, MAX_PREV_REVIEW_CHARS) if prev_review_text else "", + "facts_from_diff": diff_facts(diff, allowed_files, max_lines=60), } return f""" @@ -387,6 +402,23 @@ def build_curator_prompt( """.strip() +def call_with_optional_retry(model: str, prompt: str) -> Tuple[str, Optional[Dict[str, Any]], Optional[str]]: + raw = run_ollama(model, prompt) + data, err = safe_json_loads(raw) + if data is not None: + return raw, data, None + + if not RETRY_ON_BAD_JSON: + return raw, None, err + + retry_prompt = prompt + "\n\nREMINDER: OUTPUT ONLY VALID JSON. NO MARKDOWN. NO EXTRA TEXT." + raw2 = run_ollama(model, retry_prompt) + data2, err2 = safe_json_loads(raw2) + if data2 is not None: + return raw2, data2, None + return raw2, None, err2 + + # --- Markdown building --- def fmt_issue_list(title: str, issues: List[Dict[str, Any]]) -> str: @@ -416,28 +448,6 @@ def fmt_bullets(items: List[str]) -> str: return "".join([f"- {md_escape(str(x))}\n" for x in items]) -def call_agent_with_optional_retry(model: str, prompt: str) -> Tuple[str, Optional[Dict[str, Any]], Optional[str]]: - """ - Returns: (raw, parsed_json_or_none, err_or_none) - If RETRY_ON_BAD_JSON is enabled, retries once with a stricter reminder. - """ - raw = run_ollama(model, prompt) - data, err = safe_json_loads(raw) - if data is not None: - return raw, data, None - - if not RETRY_ON_BAD_JSON: - return raw, None, err - - # retry once with a strict reminder (helps local models a lot) - retry_prompt = prompt + "\n\nREMINDER: OUTPUT ONLY VALID JSON. NO MARKDOWN. NO EXTRA TEXT." - raw2 = run_ollama(model, retry_prompt) - data2, err2 = safe_json_loads(raw2) - if data2 is not None: - return raw2, data2, None - return raw2, None, err2 - - def main() -> None: diff = read_text(DIFF_PATH) guidelines = read_text(GUIDELINES_PATH) @@ -455,13 +465,12 @@ def main() -> None: head_sha = get_head_sha() mode = "FOLLOW_UP" if PREV_SHA else "INITIAL" - # Run specialist agents results: List[Tuple[Agent, Optional[Dict[str, Any]], Optional[str], str]] = [] agent_payloads: List[Dict[str, Any]] = [] for agent in AGENTS: prompt = build_prompt(agent, guidelines, diff, mode, PREV_SHA, head_sha, allowed_files) - raw, data, err = call_agent_with_optional_retry(MODEL, prompt) + raw, data, err = call_with_optional_retry(MODEL, prompt) if data: data["summary"] = str(data.get("summary", ""))[:500] @@ -491,7 +500,6 @@ def main() -> None: results.append((agent, data, err, raw)) - # Curator step curator_prompt = build_curator_prompt( mode=mode, prev_sha=PREV_SHA, @@ -499,8 +507,9 @@ def main() -> None: agent_json=agent_payloads, prev_review_text=prev_review_text, allowed_files=allowed_files, + diff=diff, ) - curator_raw, curator, curator_err = call_agent_with_optional_retry(CURATOR_MODEL, curator_prompt) + curator_raw, curator, curator_err = call_with_optional_retry(CURATOR_MODEL, curator_prompt) if curator: curator["short_summary"] = cap_list(curator.get("short_summary"), 2) @@ -517,14 +526,12 @@ def main() -> None: "new_risks": [], } - # Optional sanity check (debug only) if SHOW_DEBUG and allowed_files and curator and isinstance(curator, dict): for section in ("short_summary", "top_actions"): for item in curator.get(section, []) or []: if not must_mention_file(str(item), allowed_files): print(f"[DEBUG] Curator {section} item missing file mention: {item}") - # Build markdown title = "Local Multi-Agent AI Review" title += " (Follow-up)" if mode == "FOLLOW_UP" else " (Initial)" @@ -589,7 +596,6 @@ def main() -> None: md += "\n\n\n" - # Stable marker for next run md += "---\n\n" md += f"\n" From d2954f0b07de2dd43fafd1399b6f21c37dbbf5f7 Mon Sep 17 00:00:00 2001 From: valerii Date: Fri, 23 Jan 2026 15:32:36 +0100 Subject: [PATCH 16/17] review adjustments --- .github/scripts/multi_agent_review.py | 138 +++++++++++++++++++------- 1 file changed, 104 insertions(+), 34 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index abd2ff5..25985ec 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -24,6 +24,9 @@ SHOW_AGENT_TESTS_QUESTIONS = (os.environ.get("SHOW_AGENT_TESTS_QUESTIONS") or "").lower() in ("1", "true", "yes") RETRY_ON_BAD_JSON = (os.environ.get("RETRY_ON_BAD_JSON") or "").lower() in ("1", "true", "yes") +# Optional: aggressively prune vague summaries/top_actions +STRICT_FACT_GATING = (os.environ.get("STRICT_FACT_GATING") or "").lower() in ("1", "true", "yes") + # --- Utilities --- @@ -85,10 +88,6 @@ def run_ollama(model: str, prompt: str, timeout_s: int = 300) -> str: def extract_first_json_object(s: str) -> Optional[str]: - """ - Extract the first top-level JSON object from a string. - Handles cases where the model wraps JSON in markdown fences or appends extra text. - """ if not s: return None @@ -146,9 +145,6 @@ def cap_issues(x: Any, max_items: int) -> List[Dict[str, Any]]: def extract_changed_files(diff: str) -> List[str]: - """ - Parse `git diff` output and extract changed file paths. - """ files: List[str] = [] for line in diff.splitlines(): if line.startswith("diff --git "): @@ -158,7 +154,6 @@ def extract_changed_files(diff: str) -> List[str]: if b.startswith("b/"): files.append(b[2:]) - # dedupe, keep order seen = set() out: List[str] = [] for f in files: @@ -169,19 +164,12 @@ def extract_changed_files(diff: str) -> List[str]: def filter_allowed_files(files: List[str]) -> List[str]: - """ - To prevent models from "reviewing guidelines" instead of code, - drop docs/ by default. If PR changes ONLY docs, allow them. - """ dropped_prefixes = ("docs/",) filtered = [f for f in files if not f.startswith(dropped_prefixes)] return filtered if filtered else files def drop_weak_issues(issues: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """ - Remove issues with weak evidence. - """ out: List[Dict[str, Any]] = [] for it in issues: ev = str(it.get("evidence", "")).strip().lower() @@ -192,9 +180,6 @@ def drop_weak_issues(issues: List[Dict[str, Any]]) -> List[Dict[str, Any]]: def drop_issues_not_in_files(issues: List[Dict[str, Any]], allowed_files: List[str]) -> List[Dict[str, Any]]: - """ - If evidence doesn't mention any allowed file, the model likely hallucinated. - """ if not allowed_files: return issues @@ -212,17 +197,76 @@ def must_mention_file(line: str, allowed_files: List[str]) -> bool: return any(f.lower() in s for f in allowed_files) -def diff_facts(diff: str, allowed_files: List[str], max_lines: int = 60) -> str: +def mentions_only_allowed_files(line: str, allowed_files: List[str]) -> bool: """ - Provide a minimal "facts" header to reduce model drift: - - list changed files - - show first N lines of diff + Conservative: if line mentions a '.py' / '.yml' / '.yaml' / '.kt' / '.java' / '.md' path, + ensure every mentioned path is within allowed_files. """ + import re + paths = re.findall(r"[\w./-]+\.(?:py|yml|yaml|kt|java|md)", line or "") + if not paths: + return True + allowed_set = set(allowed_files) + return all(p in allowed_set for p in paths) + + +def diff_facts(diff: str, allowed_files: List[str], max_lines: int = 60) -> str: files = "\n".join(allowed_files) if allowed_files else "(none)" head = "\n".join(diff.splitlines()[:max_lines]) return f"CHANGED_FILES:\n{files}\n\nDIFF_HEAD (first {max_lines} lines):\n{head}" +# --- Fact gating (reduce "smart generic" talk) --- + +KEYWORD_GATES: Dict[str, List[str]] = { + "idempot": ["idempot", "dedup", "at-least-once", "exactly-once"], + "retry": ["retry", "backoff", "exponential"], + "timeout": ["timeout", "time out", "deadline"], + "logging": ["logger", "log.", "logging", "trace", "span", "metric"], + "external": ["http", "request", "client", "api", "url", "fetch"], + "concurrency": ["mutex", "lock", "synchronized", "race", "concurrent", "coroutine", "thread"], +} + +def diff_contains_any(diff_lower: str, needles: List[str]) -> bool: + return any(n in diff_lower for n in needles) + +def gate_claims_to_diff(text: str, diff_lower: str) -> bool: + """ + If the text talks about a gated topic, require the diff to contain related lexemes. + This is intentionally strict to avoid hallucinated "idempotence/retry/etc" claims. + """ + t = (text or "").lower() + for topic, needles in KEYWORD_GATES.items(): + if topic in t or any(n in t for n in needles): + if not diff_contains_any(diff_lower, needles): + return False + return True + + +def filter_bullets_by_fact_gate(items: List[str], diff_lower: str, allowed_files: List[str]) -> List[str]: + out: List[str] = [] + for x in items: + s = str(x) + if not must_mention_file(s, allowed_files): + continue + if not mentions_only_allowed_files(s, allowed_files): + continue + if STRICT_FACT_GATING and not gate_claims_to_diff(s, diff_lower): + continue + out.append(s) + return out + + +def rewrite_agent_summary_if_vague(summary: str, diff_lower: str) -> str: + s = (summary or "").strip() + if not s: + return s + if STRICT_FACT_GATING and not gate_claims_to_diff(s, diff_lower): + # neutral fallback + return "No concrete issues were identified from the DIFF." + return s + + # --- Agent definitions --- @dataclass @@ -286,9 +330,8 @@ class Agent: - questions: max 5 items Hard rules (facts): -- You MUST NOT mention technologies, files, functions, or problems that are not present in DIFF. -- If DIFF does not show Kotlin/coroutines/external calls/etc, do NOT mention them. -- GUIDELINES are for evaluation only; DIFF is the ONLY source of facts. +- DIFF is the ONLY source of facts. GUIDELINES are for evaluation only. +- You MUST NOT mention technologies/files/functions/problems not present in DIFF. - You may ONLY reference file paths from ALLOWED_FILES. - If you cannot cite evidence from the DIFF, set evidence to "unknown" AND prefer putting it into "questions". """.strip() @@ -307,12 +350,12 @@ class Agent: } Hard rules: -- Deduplicate and prioritize. +- DIFF is the ONLY source of facts. +- You MUST NOT mention any file not present in `allowed_files`. - Each short_summary/top_actions item MUST mention at least one file from `allowed_files` by name. - If you cannot tie it to a file, write it as a question (still mention a related file). - Do NOT recommend changing guidelines unless `docs/review_guidelines.md` is in allowed_files. -- Do NOT output "mega lists" of test categories. Each action should be a single concrete thing. -- Use DIFF as the only source of facts; do not invent Kotlin/coroutines/etc. +- Do NOT output mega lists of test categories; each action must be one concrete thing. +- Prefer "Changed: ..." style over generic claims. """.strip() @@ -336,7 +379,6 @@ def build_prompt( """.strip() allowed_files_text = "\n".join(allowed_files) if allowed_files else "(none)" - facts = diff_facts(diff, allowed_files, max_lines=60) return f""" @@ -353,9 +395,9 @@ def build_prompt( {facts} Critical rules: -- DIFF is the ONLY source of facts. GUIDELINES are only for judging what you see in DIFF. +- DIFF is the ONLY source of facts. GUIDELINES are for judging what you see in DIFF. - You may ONLY reference file paths from ALLOWED_FILES. -- You MUST NOT mention technologies/files not present in DIFF. If not shown, don't say it. +- You MUST NOT mention technologies/files not present in DIFF. TASK: {agent.focus} @@ -458,6 +500,7 @@ def main() -> None: return diff = truncate(diff, MAX_DIFF_CHARS) + diff_lower = diff.lower() changed_files = extract_changed_files(diff) allowed_files = filter_allowed_files(changed_files) @@ -473,7 +516,8 @@ def main() -> None: raw, data, err = call_with_optional_retry(MODEL, prompt) if data: - data["summary"] = str(data.get("summary", ""))[:500] + # Summary gating to reduce generic claims + data["summary"] = rewrite_agent_summary_if_vague(str(data.get("summary", ""))[:500], diff_lower) blocking = drop_issues_not_in_files( drop_weak_issues(cap_issues(data.get("blocking"), 2)), @@ -517,6 +561,30 @@ def main() -> None: curator["resolved"] = cap_list(curator.get("resolved"), 5) curator["still_open"] = cap_list(curator.get("still_open"), 5) curator["new_risks"] = cap_list(curator.get("new_risks"), 5) + + # Fact-gate curator bullets/actions + curator["short_summary"] = filter_bullets_by_fact_gate(curator["short_summary"], diff_lower, allowed_files)[:2] + curator["top_actions"] = filter_bullets_by_fact_gate(curator["top_actions"], diff_lower, allowed_files)[:3] + + if mode == "FOLLOW_UP": + curator["resolved"] = filter_bullets_by_fact_gate(curator["resolved"], diff_lower, allowed_files)[:5] + curator["still_open"] = filter_bullets_by_fact_gate(curator["still_open"], diff_lower, allowed_files)[:5] + curator["new_risks"] = filter_bullets_by_fact_gate(curator["new_risks"], diff_lower, allowed_files)[:5] + else: + curator["resolved"] = cap_list(curator.get("resolved"), 5) + curator["still_open"] = cap_list(curator.get("still_open"), 5) + curator["new_risks"] = cap_list(curator.get("new_risks"), 5) + + # If gating nuked everything, keep a minimal, honest summary. + if not curator["short_summary"]: + if allowed_files: + curator["short_summary"] = [f"Changed: {', '.join(allowed_files[:3])}" + (" ..." if len(allowed_files) > 3 else "")] + else: + curator["short_summary"] = ["No concrete issues were identified from the DIFF."] + + if not curator["top_actions"]: + curator["top_actions"] = ["No high-priority actions identified from the DIFF."] + else: curator = { "short_summary": [f"⚠️ Curator returned invalid JSON: {curator_err}"], @@ -530,7 +598,9 @@ def main() -> None: for section in ("short_summary", "top_actions"): for item in curator.get(section, []) or []: if not must_mention_file(str(item), allowed_files): - print(f"[DEBUG] Curator {section} item missing file mention: {item}") + print(f"[DEBUG] Curator {section} item missing allowed file mention: {item}") + if not mentions_only_allowed_files(str(item), allowed_files): + print(f"[DEBUG] Curator {section} item mentions non-allowed file(s): {item}") title = "Local Multi-Agent AI Review" title += " (Follow-up)" if mode == "FOLLOW_UP" else " (Initial)" From e4336b0e25ff78be74cdb4707de67a9253f344a3 Mon Sep 17 00:00:00 2001 From: valerii Date: Fri, 23 Jan 2026 15:45:03 +0100 Subject: [PATCH 17/17] review adjustments --- .github/scripts/multi_agent_review.py | 84 ++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/.github/scripts/multi_agent_review.py b/.github/scripts/multi_agent_review.py index 25985ec..4d062a3 100644 --- a/.github/scripts/multi_agent_review.py +++ b/.github/scripts/multi_agent_review.py @@ -87,6 +87,19 @@ def run_ollama(model: str, prompt: str, timeout_s: int = 300) -> str: raise RuntimeError(f"Ollama HTTP call failed: {e}") +def sanitize_jsonish(s: str) -> str: + """ + Normalize common model output issues: + - smart quotes -> ascii quotes + - odd apostrophes -> ascii apostrophes + """ + if not s: + return s + s = s.replace("“", '"').replace("”", '"').replace("„", '"') + s = s.replace("’", "'").replace("‘", "'") + return s + + def extract_first_json_object(s: str) -> Optional[str]: if not s: return None @@ -120,9 +133,11 @@ def extract_first_json_object(s: str) -> Optional[str]: def safe_json_loads(s: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: try: + s = sanitize_jsonish(s) js = extract_first_json_object(s) if not js: return None, "No JSON object found" + js = sanitize_jsonish(js) return json.loads(js), None except Exception as e: return None, str(e) @@ -199,7 +214,7 @@ def must_mention_file(line: str, allowed_files: List[str]) -> bool: def mentions_only_allowed_files(line: str, allowed_files: List[str]) -> bool: """ - Conservative: if line mentions a '.py' / '.yml' / '.yaml' / '.kt' / '.java' / '.md' path, + Conservative: if line mentions a path with a known extension, ensure every mentioned path is within allowed_files. """ import re @@ -233,7 +248,6 @@ def diff_contains_any(diff_lower: str, needles: List[str]) -> bool: def gate_claims_to_diff(text: str, diff_lower: str) -> bool: """ If the text talks about a gated topic, require the diff to contain related lexemes. - This is intentionally strict to avoid hallucinated "idempotence/retry/etc" claims. """ t = (text or "").lower() for topic, needles in KEYWORD_GATES.items(): @@ -247,7 +261,7 @@ def filter_bullets_by_fact_gate(items: List[str], diff_lower: str, allowed_files out: List[str] = [] for x in items: s = str(x) - if not must_mention_file(s, allowed_files): + if allowed_files and not must_mention_file(s, allowed_files): continue if not mentions_only_allowed_files(s, allowed_files): continue @@ -257,16 +271,60 @@ def filter_bullets_by_fact_gate(items: List[str], diff_lower: str, allowed_files return out -def rewrite_agent_summary_if_vague(summary: str, diff_lower: str) -> str: +def agent_summary_ok(summary: str, diff_lower: str, allowed_files: List[str]) -> bool: s = (summary or "").strip() if not s: - return s + return False if STRICT_FACT_GATING and not gate_claims_to_diff(s, diff_lower): - # neutral fallback + return False + if allowed_files and not must_mention_file(s, allowed_files): + return False + if not mentions_only_allowed_files(s, allowed_files): + return False + return True + + +def rewrite_agent_summary_if_vague(summary: str, diff_lower: str, allowed_files: List[str]) -> str: + s = (summary or "").strip() + if not s: + return s + if not agent_summary_ok(s, diff_lower, allowed_files): return "No concrete issues were identified from the DIFF." return s +def extract_relevant_guidelines(guidelines: str, diff: str, max_lines: int = 80) -> str: + """ + Reduce guideline "prompt bleed" by only including relevant snippets. + - If diff doesn't trigger any keywords, include only a tiny header (first ~20 lines). + - Otherwise, include up to max_lines lines that match triggered keywords. + """ + if not guidelines.strip(): + return "" + diff_lower = diff.lower() + triggers: List[str] = [] + for needles in KEYWORD_GATES.values(): + for n in needles: + if n in diff_lower: + triggers.append(n) + triggers = list(dict.fromkeys(triggers)) + + lines = guidelines.splitlines() + + if not triggers: + return "\n".join(lines[:min(len(lines), 20)]) + + picked: List[str] = [] + for ln in lines: + l = ln.lower() + if any(k in l for k in triggers): + picked.append(ln) + if len(picked) >= max_lines: + break + + return "\n".join(picked) if picked else "\n".join(lines[:min(len(lines), 20)]) + + # --- Agent definitions --- @dataclass @@ -380,6 +438,7 @@ def build_prompt( allowed_files_text = "\n".join(allowed_files) if allowed_files else "(none)" facts = diff_facts(diff, allowed_files, max_lines=60) + relevant_guidelines = extract_relevant_guidelines(guidelines, diff) return f""" SYSTEM: @@ -402,8 +461,8 @@ def build_prompt( TASK: {agent.focus} -GUIDELINES (evaluation only): -{guidelines if guidelines else "(none provided)"} +GUIDELINES (evaluation only, trimmed): +{relevant_guidelines if relevant_guidelines else "(none provided)"} DIFF: {diff} @@ -516,8 +575,7 @@ def main() -> None: raw, data, err = call_with_optional_retry(MODEL, prompt) if data: - # Summary gating to reduce generic claims - data["summary"] = rewrite_agent_summary_if_vague(str(data.get("summary", ""))[:500], diff_lower) + data["summary"] = rewrite_agent_summary_if_vague(str(data.get("summary", ""))[:500], diff_lower, allowed_files) blocking = drop_issues_not_in_files( drop_weak_issues(cap_issues(data.get("blocking"), 2)), @@ -562,7 +620,6 @@ def main() -> None: curator["still_open"] = cap_list(curator.get("still_open"), 5) curator["new_risks"] = cap_list(curator.get("new_risks"), 5) - # Fact-gate curator bullets/actions curator["short_summary"] = filter_bullets_by_fact_gate(curator["short_summary"], diff_lower, allowed_files)[:2] curator["top_actions"] = filter_bullets_by_fact_gate(curator["top_actions"], diff_lower, allowed_files)[:3] @@ -570,12 +627,7 @@ def main() -> None: curator["resolved"] = filter_bullets_by_fact_gate(curator["resolved"], diff_lower, allowed_files)[:5] curator["still_open"] = filter_bullets_by_fact_gate(curator["still_open"], diff_lower, allowed_files)[:5] curator["new_risks"] = filter_bullets_by_fact_gate(curator["new_risks"], diff_lower, allowed_files)[:5] - else: - curator["resolved"] = cap_list(curator.get("resolved"), 5) - curator["still_open"] = cap_list(curator.get("still_open"), 5) - curator["new_risks"] = cap_list(curator.get("new_risks"), 5) - # If gating nuked everything, keep a minimal, honest summary. if not curator["short_summary"]: if allowed_files: curator["short_summary"] = [f"Changed: {', '.join(allowed_files[:3])}" + (" ..." if len(allowed_files) > 3 else "")]