From e7440290acf8e29d1ad9316d652bfc54c8141a9d Mon Sep 17 00:00:00 2001 From: unohee Date: Thu, 18 Jun 2026 19:08:18 +0900 Subject: [PATCH 1/5] bench(l6): Kimi K2.5/K2.6 + GLM-5.1 rounds; OpenRouter key from .env Benchmark the planner/worker hybrid with non-OpenAI models, graded by the official swebench harness: - Kimi K2.5/K2.6 sweep L0-L5 100% (24/24), beating gemini-2.5-flash (88%); k2.5 -57% $/pass. L6 hybrid: k2.5 worker 2/2 RESOLVED, k2.6 planner 1/1. - GLM-5.1: solid implementer (5859 RESOLVED), recoverable diagnostician (7080 via re-diagnosis escalate). New failure mode: self-authored test masking (defect #8 candidate, fixed separately). Also switch the OpenRouter key source from a manually-sourced OPENROUTER_API to the repo-local OPENROUTER_API_KEY via the zero-dep .env loader; benchmarks call loadEnvFile() at startup. Refs: INT-1455, INT-1460, INT-1461 --- README.md | 2 +- benchmarks/RUBRIC.md | 103 +- benchmarks/modelSelect.ts | 8 +- benchmarks/results/kimi_ladder_260611.json | 973 ++++++++++++++ benchmarks/results/latest.json | 48 +- .../results/swe_5859_glm51_worker_preds.json | 7 + .../results/swe_5859_glm51_worker_report.json | 23 + .../swe_5859_kimi_worker_RESOLVED_report.json | 23 + .../results/swe_5859_kimi_worker_preds.json | 7 + .../results/swe_7080_glm51_diagnosis.txt | 53 + ...we_7080_glm51_planner_v1_failed_preds.json | 7 + ...swe_7080_glm51_rediag_RESOLVED_report.json | 23 + .../results/swe_7080_glm51_rediag_preds.json | 7 + .../results/swe_7080_glm51_rediagnosis.txt | 125 ++ .../results/swe_7080_kimi_k26_diagnosis.txt | 1119 +++++++++++++++++ ...swe_7080_kimi_planner_RESOLVED_report.json | 23 + .../results/swe_7080_kimi_planner_preds.json | 7 + .../swe_7993_kimi_worker_RESOLVED_report.json | 23 + .../results/swe_7993_kimi_worker_preds.json | 7 + benchmarks/throughputProbe.ts | 11 +- src/core/config.ts | 2 +- 21 files changed, 2547 insertions(+), 54 deletions(-) create mode 100644 benchmarks/results/kimi_ladder_260611.json create mode 100644 benchmarks/results/swe_5859_glm51_worker_preds.json create mode 100644 benchmarks/results/swe_5859_glm51_worker_report.json create mode 100644 benchmarks/results/swe_5859_kimi_worker_RESOLVED_report.json create mode 100644 benchmarks/results/swe_5859_kimi_worker_preds.json create mode 100644 benchmarks/results/swe_7080_glm51_diagnosis.txt create mode 100644 benchmarks/results/swe_7080_glm51_planner_v1_failed_preds.json create mode 100644 benchmarks/results/swe_7080_glm51_rediag_RESOLVED_report.json create mode 100644 benchmarks/results/swe_7080_glm51_rediag_preds.json create mode 100644 benchmarks/results/swe_7080_glm51_rediagnosis.txt create mode 100644 benchmarks/results/swe_7080_kimi_k26_diagnosis.txt create mode 100644 benchmarks/results/swe_7080_kimi_planner_RESOLVED_report.json create mode 100644 benchmarks/results/swe_7080_kimi_planner_preds.json create mode 100644 benchmarks/results/swe_7993_kimi_worker_RESOLVED_report.json create mode 100644 benchmarks/results/swe_7993_kimi_worker_preds.json diff --git a/README.md b/README.md index db293f1..a692825 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ Switch at runtime via Discord: `!provider codex` / `!provider claude` | `claude` | Claude Code CLI | sonnet-4, haiku-4.5, opus-4 | CLI auth | | `codex` | OpenAI Codex CLI | o3, o4-mini | CLI auth | | `gpt` | OpenAI API | gpt-4o, o3, gpt-4.1 | OAuth PKCE | -| `openrouter` | OpenRouter API (native agentic loop) | any OpenRouter model — gpt-5, gemini-2.5-flash, deepseek, glm, qwen, … | OAuth PKCE or `OPENROUTER_API` | +| `openrouter` | OpenRouter API (native agentic loop) | any OpenRouter model — gpt-5, gemini-2.5-flash, deepseek, glm, qwen, … | OAuth PKCE or `OPENROUTER_API_KEY` | | `local` | Ollama / LMStudio / llama.cpp | gemma4, llama3, mistral, qwen, etc. | None | | `lmstudio` | LM Studio OpenAI-compatible API | loaded LM Studio model (`LMSTUDIO_MODEL`) | Optional API key | diff --git a/benchmarks/RUBRIC.md b/benchmarks/RUBRIC.md index 851c430..7532089 100644 --- a/benchmarks/RUBRIC.md +++ b/benchmarks/RUBRIC.md @@ -35,8 +35,8 @@ Derived from benchmark data (`benchmarks/results/`). Score = pass_rate → $/pas |----|--------------------------|-----------| | L0–L3 | **z-ai/glm-4.7-flash** or deepseek-v4-flash | 100% pass, $0.002–0.004/pass. glm is fastest at 2759 tok/s (DeepInfra). Lightweight is enough | | L4 | Lightweight + escalate | Lightweight models mostly pass (100%); frontier escalation absorbs failures | -| L5 | Lightweight (tolerating some failures) | glm/qwen fail 1–2 tasks like L5-lru (87–95%). Escalation absorbs it | -| **L6** | **Frontier (openai/gpt-5)** | **Lightweight models lack answer accuracy** — see L6 measurements below | +| L5 | **moonshotai/kimi-k2.5** | 100% L0–L5 sweep (24/24, repeat 2), $0.0095/pass — 57% cheaper than gemini-2.5-flash at higher quality (2026-06-11 round) | +| **L6** | **Frontier diagnosis + lightweight implementation** | Solo lightweight lacks diagnostic depth. Diagnosis role: gpt-5 or **kimi-k2.6** (first-shot RESOLVED); glm-5.1 works but needed the re-diagnosis escalate loop. Implementation role: kimi-k2.5 / deepseek-v4-flash / glm-5.1 | ### L6 measurements (pylint-dev__pylint-7080, 2026-06) @@ -116,6 +116,92 @@ diagnosis) — the no-edit guard (`nudgeMaxOnNoEdit`) and a rich diagnosis (including concrete pseudocode) are the success factors. Best-of-N has low expected value (all 9 undiagnosed gemini attempts were wrong). +### Kimi K2.5/K2.6 round (2026-06-11) — both roles validated + +Tested MoonshotAI Kimi K2.5 ($0.40/$1.90 per M) and K2.6 ($0.68/$3.41 per M) +in both OpenSwarm roles, same harness, official grading. + +**L0–L5 ladder** (12 tasks × repeat 2 vs baseline gemini-2.5-flash): + +| Model | L0–L5 pass | $/pass | avg tools | avg dur | +|-------|-----------|--------|-----------|---------| +| **moonshotai/kimi-k2.5** | **100%** (24/24) | $0.0095 | 7.9 | 34s | +| moonshotai/kimi-k2.6 | **100%** (24/24) | $0.0122 | 6.8 | 41s | +| google/gemini-2.5-flash (baseline) | 88% | $0.0220 | 10.8 | 27s | + +k2.5 strictly dominates k2.6 as a worker (same quality, cheaper, faster); +k2.6's higher reasoning latency (~2× on trivial tasks) buys nothing at L0–L5. + +**L6 worker role** — kimi-k2.5 as hybrid implementer (saved gpt-5 diagnoses): + +| Configuration | Instance | Result | +|---------------|----------|--------| +| gpt-5 diagnosis + **kimi-k2.5** implementation | 5859 | **RESOLVED** ✅ | +| gpt-5 re-diagnosis + **kimi-k2.5** implementation | 7993 | **RESOLVED** ✅ (first attempt, clean single-file patch) | + +**L6 planner role** — kimi-k2.6 as the diagnostician (replacing gpt-5): + +| Configuration | Instance | Result | +|---------------|----------|--------| +| **kimi-k2.6 diagnosis** + deepseek-v4-flash implementation | 7080 | **RESOLVED** ✅ | + +→ **The hybrid pattern no longer requires gpt-5.** kimi-k2.6 produced a +RESOLVED-grade diagnosis on the instance where 5 of 6 solo models failed +(its fix differs from gpt-5's — `_is_in_ignore_list_re` path normalization +vs `os.path.relpath` — both pass the official grader). Caveat: the k2.6 +diagnosis is verbose (58k chars vs gpt-5's 5.5k), so implementer context +pressure is higher; it worked, but diagnosis-length budgeting is a future +lever. Implementer fitness update: **kimi-k2.5 ✅✅ (2/2, joins deepseek as +a reliable finisher)**. + +One contamination case (5859): the implementer re-created the missing +FAIL_TO_PASS test in the test file to self-verify; test-file hunks were +stripped from model_patch before grading (standard SWE-bench practice). +Harness improvement candidate: auto-exclude `tests/**` from patch +extraction. + +Evidence: `results/kimi_ladder_260611.json`, +`results/swe_{5859,7993}_kimi_worker_RESOLVED_report.json`, +`results/swe_7080_kimi_planner_RESOLVED_report.json`, +`results/swe_7080_kimi_k26_diagnosis.txt`. + +### GLM-5.1 round (2026-06-11) — worker ✅, planner needs the escalate loop + +Tested `z-ai/glm-5.1` ($0.98/$3.08 per M, 202k ctx — kimi-k2.6's class) on the +same instances: + +| Role | Configuration | Instance | Result | +|------|---------------|----------|--------| +| Worker | gpt-5 diagnosis + **glm-5.1** implementation | 5859 | **RESOLVED** ✅ (identical source fix to kimi-k2.5's) | +| Planner (1st shot) | **glm-5.1** diagnosis + deepseek implementation | 7080 | unresolved — wrong mechanism (directory pruning instead of the `./` prefix mismatch); FAIL_TO_PASS 0/1, PASS_TO_PASS 120/120 intact | +| Planner (escalate) | **glm-5.1 re-diagnosis** (fed the failed patch + official test output) + deepseek | 7080 | **RESOLVED** ✅ (`os.path.normpath(root)` + `norm_root + os.sep` — converged on the real mechanism) | + +Planner-role comparison on 7080: gpt-5 ✅ first shot / kimi-k2.6 ✅ first shot / +**glm-5.1 ✅ but only via the re-diagnosis escalate loop**. Its first diagnosis +reads precise (compact 4k chars, detailed mechanism walkthrough) yet targeted +the wrong layer — articulate-but-wrong is exactly the failure mode the escalate +loop exists for, and glm-5.1 self-corrected when fed the failing evidence +(same protocol that rescued gpt-5 on 7993). + +**Harness defect #8 — self-authored test masking (FIXED 2026-06-18, INT-1462)**: +the FAIL_TO_PASS test is absent from the extracted /testbed, so the implementer +wrote its own guess of the gold test, which validated the wrong fix — local +verification "passed" while official grading failed. When the test oracle +itself is model-authored, the verification loop cannot catch a wrong diagnosis. +→ Fix: `sweBench.ts` now pre-applies the instance's `test_patch` to the sandbox +**before** baselining, so local verification runs the real oracle; the touched +test files join `protectedFiles` so the implementer can't rewrite them. Because +the gold test is baked into the baseline, `git diff baseSha` excludes it and +model_patch stays source-only automatically (no manual test-hunk stripping). +Verified on 5859: worker created zero test files, model_patch was source-only +(`pylint/checkers/misc.py`), official grading still RESOLVED +(`results/swe_5859_fix8_{sourceonly_preds,RESOLVED_report}.json`). + +Evidence: `results/swe_5859_glm51_worker_{report,preds}.json`, +`results/swe_7080_glm51_{diagnosis,rediagnosis}.txt`, +`results/swe_7080_glm51_planner_v1_failed_preds.json`, +`results/swe_7080_glm51_rediag_{RESOLVED_report,preds}.json`. + Additional hybrid failure modes (discovered on 7993): - **Diagnosis error propagation**: if the diagnosis pseudocode itself is buggy @@ -133,8 +219,9 @@ Additional hybrid failure modes (discovered on 7993): ## Routing principles (tiering) - **Judgment-heavy roles** (Planner/decomposition, Reviewer): pinned to - frontier (gpt-5). A wrong judgment poisons everything downstream, so these - are never downgraded. + frontier-grade models. A wrong judgment poisons everything downstream, so + these are never downgraded. Measured options: gpt-5, **kimi-k2.6** + (RESOLVED-grade L6 diagnosis at ~1/3 of gpt-5 pricing — 2026-06-11 round). - **Execution roles** (Worker/Tester/Documenter/Auditor): lightweight by default + frontier escalation after 2 failures. - But **L6-grade real-world work should use a frontier worker too** — @@ -147,7 +234,7 @@ Additional hybrid failure modes (discovered on 7993): export DOCKER_HOST="unix:///Users//.orbstack/run/docker.sock" # 2. The OpenSwarm worker solves the instance and produces predictions -OPENROUTER_API=... SWE_MODEL=openai/gpt-5 \ +SWE_MODEL=openai/gpt-5 \ npx tsx benchmarks/sweBench.ts # 3. Grade with the official swebench harness (per-model, max_workers 1 — concurrency overloads the VM) @@ -186,6 +273,6 @@ L6 exposed defects that only manifest in large repos: were never reached. → Thresholds 24k→60k tokens, compactAfterMessages 24→60, keepRecent 8→16. -(Defects #4–#7 — final-answer turn, no-edit guard, protected files, bash -timeout — were found later during the hybrid experiments; see the hybrid -section above.) +(Defects #4–#8 — final-answer turn, no-edit guard, protected files, bash +timeout, and self-authored test masking — were found later during the hybrid +experiments; see the hybrid section above. #8 is fixed in INT-1462.) diff --git a/benchmarks/modelSelect.ts b/benchmarks/modelSelect.ts index ba3768a..e768bb5 100644 --- a/benchmarks/modelSelect.ts +++ b/benchmarks/modelSelect.ts @@ -7,8 +7,8 @@ // 자동 교체 안 함 — 랭킹만 출력, 사람이 config 반영(통제력 유지). // // 실행: -// source ~/dev/VEGA/.env (OPENROUTER_API 필요) // npx tsx benchmarks/modelSelect.ts --repeat 3 +// (OPENROUTER_API_KEY required — auto-loaded from the repo .env) // npx tsx benchmarks/modelSelect.ts --model openai/gpt-5 --model qwen/qwen3-coder // npx tsx benchmarks/modelSelect.ts --task L0-fix-multiply --repeat 5 // ============================================ @@ -22,6 +22,7 @@ import { promisify } from 'node:util'; import { runWorker } from '../src/agents/worker.js'; import { setDefaultAdapter } from '../src/adapters/index.js'; import { initLocale } from '../src/locale/index.js'; +import { loadEnvFile } from '../src/core/envFile.js'; import { CODING_TASKS, type BenchTask } from './tasks/codingTasks.js'; const exec = promisify(execFile); @@ -285,12 +286,13 @@ function parseArgs(argv: string[]) { } async function main() { + loadEnvFile(); initLocale('en'); setDefaultAdapter('openrouter'); - const apiKey = process.env.OPENROUTER_API; + const apiKey = process.env.OPENROUTER_API_KEY ?? process.env.OPENROUTER_API; if (!apiKey) { - console.error('OPENROUTER_API not set. Run: source ~/dev/VEGA/.env'); + console.error('OPENROUTER_API_KEY not set (checked shell env and .env).'); process.exit(1); } diff --git a/benchmarks/results/kimi_ladder_260611.json b/benchmarks/results/kimi_ladder_260611.json new file mode 100644 index 0000000..fb0da21 --- /dev/null +++ b/benchmarks/results/kimi_ladder_260611.json @@ -0,0 +1,973 @@ +{ + "results": [ + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L0-fix-multiply", + "rep": 1, + "passed": true, + "reason": "multiply=a*b, add intact", + "toolCalls": 4, + "apiCalls": 5, + "promptTokens": 5446, + "completionTokens": 1361, + "costUsd": 0.0047643, + "durationMs": 21489 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L0-fix-multiply", + "rep": 2, + "passed": true, + "reason": "multiply=a*b, add intact", + "toolCalls": 4, + "apiCalls": 5, + "promptTokens": 5068, + "completionTokens": 1267, + "costUsd": 0.0044345, + "durationMs": 15837 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L1-add-null-guard", + "rep": 1, + "passed": true, + "reason": "guard + empty return + parse intact", + "toolCalls": 6, + "apiCalls": 7, + "promptTokens": 9558, + "completionTokens": 2389, + "costUsd": 0.0083623, + "durationMs": 23760 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L1-add-null-guard", + "rep": 2, + "passed": true, + "reason": "guard + empty return + parse intact", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 5382, + "completionTokens": 1346, + "costUsd": 0.0047101999999999995, + "durationMs": 22427 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L2-rename-across-files", + "rep": 1, + "passed": true, + "reason": "def + import + call all renamed", + "toolCalls": 14, + "apiCalls": 10, + "promptTokens": 19766, + "completionTokens": 4942, + "costUsd": 0.017296199999999998, + "durationMs": 72933 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L2-rename-across-files", + "rep": 2, + "passed": true, + "reason": "def + import + call all renamed", + "toolCalls": 13, + "apiCalls": 9, + "promptTokens": 14868, + "completionTokens": 3717, + "costUsd": 0.0130095, + "durationMs": 27487 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L3-implement-to-pass-test", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 6, + "apiCalls": 6, + "promptTokens": 7926, + "completionTokens": 1982, + "costUsd": 0.0069362, + "durationMs": 26731 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L3-implement-to-pass-test", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 10, + "apiCalls": 9, + "promptTokens": 16978, + "completionTokens": 4244, + "costUsd": 0.014854800000000001, + "durationMs": 41832 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L4-cascading-signature-change", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 17, + "apiCalls": 9, + "promptTokens": 23826, + "completionTokens": 5957, + "costUsd": 0.020848699999999998, + "durationMs": 50846 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L4-cascading-signature-change", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 13, + "apiCalls": 8, + "promptTokens": 19440, + "completionTokens": 4860, + "costUsd": 0.01701, + "durationMs": 55736 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L4-edge-case-completeness", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 6, + "apiCalls": 5, + "promptTokens": 8070, + "completionTokens": 2018, + "costUsd": 0.007062199999999999, + "durationMs": 25328 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L4-edge-case-completeness", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 7, + "apiCalls": 6, + "promptTokens": 10622, + "completionTokens": 2655, + "costUsd": 0.0092933, + "durationMs": 40703 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L4-hidden-bug-debug", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 7759, + "completionTokens": 1940, + "costUsd": 0.0067896, + "durationMs": 19828 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L4-hidden-bug-debug", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 7305, + "completionTokens": 1826, + "costUsd": 0.0063914, + "durationMs": 35571 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L4-deep-dependency-chain", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 12, + "apiCalls": 6, + "promptTokens": 13854, + "completionTokens": 3464, + "costUsd": 0.0121232, + "durationMs": 37517 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L4-deep-dependency-chain", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 11, + "apiCalls": 5, + "promptTokens": 9965, + "completionTokens": 2491, + "costUsd": 0.0087189, + "durationMs": 36404 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L5-merge-intervals", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 7, + "apiCalls": 6, + "promptTokens": 10181, + "completionTokens": 2545, + "costUsd": 0.0089079, + "durationMs": 27935 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L5-merge-intervals", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 7713, + "completionTokens": 1928, + "costUsd": 0.0067484, + "durationMs": 25148 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L5-lru-cache", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 8, + "apiCalls": 8, + "promptTokens": 12450, + "completionTokens": 3112, + "costUsd": 0.010892800000000001, + "durationMs": 49856 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L5-lru-cache", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 7, + "apiCalls": 7, + "promptTokens": 11760, + "completionTokens": 2940, + "costUsd": 0.01029, + "durationMs": 35581 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L5-tokenizer-state-machine", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 7, + "apiCalls": 7, + "promptTokens": 10010, + "completionTokens": 2502, + "costUsd": 0.0087578, + "durationMs": 34673 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L5-tokenizer-state-machine", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 6078, + "completionTokens": 1520, + "costUsd": 0.0053192, + "durationMs": 26740 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L5-generic-groupby", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 6, + "apiCalls": 6, + "promptTokens": 8914, + "completionTokens": 2229, + "costUsd": 0.007800700000000001, + "durationMs": 36658 + }, + { + "model": "moonshotai/kimi-k2.5", + "taskId": "L5-generic-groupby", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 6, + "apiCalls": 6, + "promptTokens": 8608, + "completionTokens": 2152, + "costUsd": 0.0075320000000000005, + "durationMs": 32381 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L0-fix-multiply", + "rep": 1, + "passed": true, + "reason": "multiply=a*b, add intact", + "toolCalls": 4, + "apiCalls": 5, + "promptTokens": 5053, + "completionTokens": 1263, + "costUsd": 0.00774287, + "durationMs": 62092 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L0-fix-multiply", + "rep": 2, + "passed": true, + "reason": "multiply=a*b, add intact", + "toolCalls": 5, + "apiCalls": 6, + "promptTokens": 6273, + "completionTokens": 1568, + "costUsd": 0.00961252, + "durationMs": 70063 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L1-add-null-guard", + "rep": 1, + "passed": true, + "reason": "guard + empty return + parse intact", + "toolCalls": 7, + "apiCalls": 8, + "promptTokens": 8930, + "completionTokens": 2232, + "costUsd": 0.01368352, + "durationMs": 29440 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L1-add-null-guard", + "rep": 2, + "passed": true, + "reason": "guard + empty return + parse intact", + "toolCalls": 7, + "apiCalls": 8, + "promptTokens": 8955, + "completionTokens": 2239, + "costUsd": 0.01372439, + "durationMs": 63420 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L2-rename-across-files", + "rep": 1, + "passed": true, + "reason": "def + import + call all renamed", + "toolCalls": 12, + "apiCalls": 10, + "promptTokens": 16651, + "completionTokens": 4163, + "costUsd": 0.02551851, + "durationMs": 61816 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L2-rename-across-files", + "rep": 2, + "passed": true, + "reason": "def + import + call all renamed", + "toolCalls": 7, + "apiCalls": 6, + "promptTokens": 7136, + "completionTokens": 1784, + "costUsd": 0.010935919999999998, + "durationMs": 35293 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L3-implement-to-pass-test", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 6200, + "completionTokens": 1550, + "costUsd": 0.0095015, + "durationMs": 18676 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L3-implement-to-pass-test", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 4, + "apiCalls": 5, + "promptTokens": 5704, + "completionTokens": 1426, + "costUsd": 0.00874138, + "durationMs": 19065 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L4-cascading-signature-change", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 11, + "apiCalls": 5, + "promptTokens": 8336, + "completionTokens": 2084, + "costUsd": 0.012774919999999999, + "durationMs": 33761 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L4-cascading-signature-change", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 11, + "apiCalls": 5, + "promptTokens": 8338, + "completionTokens": 2084, + "costUsd": 0.012776280000000001, + "durationMs": 47359 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L4-edge-case-completeness", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 6, + "apiCalls": 5, + "promptTokens": 7439, + "completionTokens": 1860, + "costUsd": 0.01140112, + "durationMs": 45558 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L4-edge-case-completeness", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 6, + "apiCalls": 5, + "promptTokens": 7629, + "completionTokens": 1907, + "costUsd": 0.01169059, + "durationMs": 43126 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L4-hidden-bug-debug", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 7084, + "completionTokens": 1771, + "costUsd": 0.01085623, + "durationMs": 29383 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L4-hidden-bug-debug", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 7024, + "completionTokens": 1756, + "costUsd": 0.01076428, + "durationMs": 26774 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L4-deep-dependency-chain", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 12, + "apiCalls": 8, + "promptTokens": 14362, + "completionTokens": 3590, + "costUsd": 0.02200806, + "durationMs": 54954 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L4-deep-dependency-chain", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 11, + "apiCalls": 5, + "promptTokens": 8765, + "completionTokens": 2191, + "costUsd": 0.01343151, + "durationMs": 65491 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L5-merge-intervals", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 7, + "apiCalls": 6, + "promptTokens": 10102, + "completionTokens": 2525, + "costUsd": 0.01547961, + "durationMs": 45108 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L5-merge-intervals", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 6, + "apiCalls": 5, + "promptTokens": 7088, + "completionTokens": 1772, + "costUsd": 0.01086236, + "durationMs": 30043 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L5-lru-cache", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 6466, + "completionTokens": 1616, + "costUsd": 0.00990744, + "durationMs": 24619 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L5-lru-cache", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 6412, + "completionTokens": 1603, + "costUsd": 0.00982639, + "durationMs": 19719 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L5-tokenizer-state-machine", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 6057, + "completionTokens": 1514, + "costUsd": 0.0092815, + "durationMs": 44427 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L5-tokenizer-state-machine", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 6, + "apiCalls": 6, + "promptTokens": 8017, + "completionTokens": 2004, + "costUsd": 0.0122852, + "durationMs": 40559 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L5-generic-groupby", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 5, + "promptTokens": 6246, + "completionTokens": 1562, + "costUsd": 0.009573700000000001, + "durationMs": 43635 + }, + { + "model": "moonshotai/kimi-k2.6", + "taskId": "L5-generic-groupby", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 6, + "apiCalls": 5, + "promptTokens": 6326, + "completionTokens": 1581, + "costUsd": 0.00969289, + "durationMs": 34983 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L0-fix-multiply", + "rep": 1, + "passed": true, + "reason": "multiply=a*b, add intact", + "toolCalls": 9, + "apiCalls": 10, + "promptTokens": 11949, + "completionTokens": 2987, + "costUsd": 0.0110522, + "durationMs": 21729 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L0-fix-multiply", + "rep": 2, + "passed": true, + "reason": "multiply=a*b, add intact", + "toolCalls": 7, + "apiCalls": 8, + "promptTokens": 10046, + "completionTokens": 2511, + "costUsd": 0.0092913, + "durationMs": 15913 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L1-add-null-guard", + "rep": 1, + "passed": true, + "reason": "guard + empty return + parse intact", + "toolCalls": 4, + "apiCalls": 5, + "promptTokens": 5143, + "completionTokens": 1286, + "costUsd": 0.004757900000000001, + "durationMs": 8534 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L1-add-null-guard", + "rep": 2, + "passed": true, + "reason": "guard + empty return + parse intact", + "toolCalls": 5, + "apiCalls": 6, + "promptTokens": 7086, + "completionTokens": 1771, + "costUsd": 0.0065533, + "durationMs": 13324 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L2-rename-across-files", + "rep": 1, + "passed": true, + "reason": "def + import + call all renamed", + "toolCalls": 6, + "apiCalls": 7, + "promptTokens": 8722, + "completionTokens": 2180, + "costUsd": 0.0080666, + "durationMs": 19067 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L2-rename-across-files", + "rep": 2, + "passed": true, + "reason": "def + import + call all renamed", + "toolCalls": 5, + "apiCalls": 6, + "promptTokens": 7460, + "completionTokens": 1865, + "costUsd": 0.0069005, + "durationMs": 11110 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L3-implement-to-pass-test", + "rep": 1, + "passed": false, + "reason": "palindrome.ts missing", + "toolCalls": 12, + "apiCalls": 13, + "promptTokens": 21730, + "completionTokens": 5433, + "costUsd": 0.0201015, + "durationMs": 25287 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L3-implement-to-pass-test", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 12, + "apiCalls": 12, + "promptTokens": 15149, + "completionTokens": 3787, + "costUsd": 0.014012199999999999, + "durationMs": 24945 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L4-cascading-signature-change", + "rep": 1, + "passed": false, + "reason": "test failed: Command failed: npx tsx check.test.ts\nnode:internal/modules/run_main:123\n triggerUncaughtException(\n ^\n\nError: Transform failed with 1", + "toolCalls": 21, + "apiCalls": 21, + "promptTokens": 48139, + "completionTokens": 12035, + "costUsd": 0.044529200000000005, + "durationMs": 38556 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L4-cascading-signature-change", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 11, + "apiCalls": 12, + "promptTokens": 24614, + "completionTokens": 6153, + "costUsd": 0.0227667, + "durationMs": 21462 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L4-edge-case-completeness", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 21, + "apiCalls": 21, + "promptTokens": 53220, + "completionTokens": 13305, + "costUsd": 0.049228499999999994, + "durationMs": 60661 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L4-edge-case-completeness", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 9, + "apiCalls": 10, + "promptTokens": 17583, + "completionTokens": 4396, + "costUsd": 0.016264900000000002, + "durationMs": 18307 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L4-hidden-bug-debug", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 12, + "apiCalls": 13, + "promptTokens": 21966, + "completionTokens": 5491, + "costUsd": 0.020317300000000003, + "durationMs": 22616 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L4-hidden-bug-debug", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 7, + "apiCalls": 8, + "promptTokens": 10824, + "completionTokens": 2706, + "costUsd": 0.0100122, + "durationMs": 15768 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L4-deep-dependency-chain", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 16, + "apiCalls": 17, + "promptTokens": 33873, + "completionTokens": 8468, + "costUsd": 0.0313319, + "durationMs": 57836 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L4-deep-dependency-chain", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 11, + "apiCalls": 12, + "promptTokens": 17860, + "completionTokens": 4465, + "costUsd": 0.0165205, + "durationMs": 21894 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L5-merge-intervals", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 9, + "apiCalls": 10, + "promptTokens": 14506, + "completionTokens": 3627, + "costUsd": 0.0134193, + "durationMs": 20377 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L5-merge-intervals", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 12, + "apiCalls": 13, + "promptTokens": 25986, + "completionTokens": 6497, + "costUsd": 0.0240383, + "durationMs": 29340 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L5-lru-cache", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 20, + "apiCalls": 21, + "promptTokens": 52204, + "completionTokens": 13051, + "costUsd": 0.048288700000000004, + "durationMs": 61520 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L5-lru-cache", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 7, + "apiCalls": 8, + "promptTokens": 13314, + "completionTokens": 3329, + "costUsd": 0.0123167, + "durationMs": 22343 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L5-tokenizer-state-machine", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 21, + "apiCalls": 21, + "promptTokens": 44935, + "completionTokens": 11234, + "costUsd": 0.041565500000000005, + "durationMs": 53125 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L5-tokenizer-state-machine", + "rep": 2, + "passed": false, + "reason": "test failed: Command failed: npx tsx tokenizer.test.ts\n/private/var/folders/x4/5ltqqh6921176w4qz_gr6pr00000gn/T/osw-bench-LwQJ1C/tokenizer.test.ts:2\nfunc", + "toolCalls": 8, + "apiCalls": 9, + "promptTokens": 10814, + "completionTokens": 2704, + "costUsd": 0.010004200000000001, + "durationMs": 25101 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L5-generic-groupby", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 5, + "apiCalls": 6, + "promptTokens": 6860, + "completionTokens": 1715, + "costUsd": 0.0063455000000000004, + "durationMs": 11360 + }, + { + "model": "google/gemini-2.5-flash", + "taskId": "L5-generic-groupby", + "rep": 2, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 9, + "apiCalls": 10, + "promptTokens": 15402, + "completionTokens": 3851, + "costUsd": 0.0142481, + "durationMs": 26414 + } + ], + "aggs": [ + { + "model": "moonshotai/kimi-k2.5", + "runs": 24, + "passes": 24, + "passRate": 1, + "avgCostUsd": 0.0095355875, + "costPerPass": 0.0095355875, + "avgToolCalls": 7.875, + "avgDurationMs": 34308.375 + }, + { + "model": "moonshotai/kimi-k2.6", + "runs": 24, + "passes": 24, + "passRate": 1, + "avgCostUsd": 0.012169695416666666, + "costPerPass": 0.012169695416666666, + "avgToolCalls": 6.791666666666667, + "avgDurationMs": 41223.5 + }, + { + "model": "google/gemini-2.5-flash", + "runs": 24, + "passes": 21, + "passRate": 0.875, + "avgCostUsd": 0.019247208333333335, + "costPerPass": 0.021996809523809526, + "avgToolCalls": 10.791666666666666, + "avgDurationMs": 26941.208333333332 + } + ], + "baseline": "google/gemini-2.5-flash" +} \ No newline at end of file diff --git a/benchmarks/results/latest.json b/benchmarks/results/latest.json index 83696c4..d8fc5b1 100644 --- a/benchmarks/results/latest.json +++ b/benchmarks/results/latest.json @@ -8,49 +8,23 @@ "reason": "multiply=a*b, add intact", "toolCalls": 4, "apiCalls": 5, - "promptTokens": 6383, - "completionTokens": 1596, - "costUsd": 0.00102138, - "durationMs": 6091 - }, - { - "model": "z-ai/glm-4.7-flash", - "taskId": "L2-rename-across-files", - "rep": 1, - "passed": true, - "reason": "def + import + call all renamed", - "toolCalls": 7, - "apiCalls": 6, - "promptTokens": 9778, - "completionTokens": 2444, - "costUsd": 0.00156428, - "durationMs": 8043 - }, - { - "model": "z-ai/glm-4.7-flash", - "taskId": "L5-merge-intervals", - "rep": 1, - "passed": true, - "reason": "test passed (executed)", - "toolCalls": 4, - "apiCalls": 4, - "promptTokens": 6058, - "completionTokens": 1515, - "costUsd": 0.00096948, - "durationMs": 5993 + "promptTokens": 6395, + "completionTokens": 1599, + "costUsd": 0.0010233, + "durationMs": 6996 } ], "aggs": [ { "model": "z-ai/glm-4.7-flash", - "runs": 3, - "passes": 3, + "runs": 1, + "passes": 1, "passRate": 1, - "avgCostUsd": 0.0011850466666666667, - "costPerPass": 0.0011850466666666667, - "avgToolCalls": 5, - "avgDurationMs": 6709 + "avgCostUsd": 0.0010233, + "costPerPass": 0.0010233, + "avgToolCalls": 4, + "avgDurationMs": 6996 } ], - "baseline": "z-ai/glm-4.7-flash" + "baseline": "google/gemini-2.5-flash" } \ No newline at end of file diff --git a/benchmarks/results/swe_5859_glm51_worker_preds.json b/benchmarks/results/swe_5859_glm51_worker_preds.json new file mode 100644 index 0000000..42c2fb8 --- /dev/null +++ b/benchmarks/results/swe_5859_glm51_worker_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-5859", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/checkers/misc.py b/pylint/checkers/misc.py\nindex 69149e61a..28978365f 100644\n--- a/pylint/checkers/misc.py\n+++ b/pylint/checkers/misc.py\n@@ -121,9 +121,9 @@ class EncodingChecker(BaseChecker):\n \n notes = \"|\".join(re.escape(note) for note in self.config.notes)\n if self.config.notes_rgx:\n- regex_string = rf\"#\\s*({notes}|{self.config.notes_rgx})\\b\"\n+ regex_string = rf\"#\\s*({notes}|{self.config.notes_rgx})(?=\\W|$)\"\n else:\n- regex_string = rf\"#\\s*({notes})\\b\"\n+ regex_string = rf\"#\\s*({notes})(?=\\W|$)\"\n \n self._fixme_pattern = re.compile(regex_string, re.I)\n \n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_5859_glm51_worker_report.json b/benchmarks/results/swe_5859_glm51_worker_report.json new file mode 100644 index 0000000..c5df85d --- /dev/null +++ b/benchmarks/results/swe_5859_glm51_worker_report.json @@ -0,0 +1,23 @@ +{ + "total_instances": 1, + "submitted_instances": 1, + "completed_instances": 1, + "resolved_instances": 1, + "unresolved_instances": 0, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-5859" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-5859" + ], + "resolved_ids": [ + "pylint-dev__pylint-5859" + ], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/results/swe_5859_kimi_worker_RESOLVED_report.json b/benchmarks/results/swe_5859_kimi_worker_RESOLVED_report.json new file mode 100644 index 0000000..c5df85d --- /dev/null +++ b/benchmarks/results/swe_5859_kimi_worker_RESOLVED_report.json @@ -0,0 +1,23 @@ +{ + "total_instances": 1, + "submitted_instances": 1, + "completed_instances": 1, + "resolved_instances": 1, + "unresolved_instances": 0, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-5859" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-5859" + ], + "resolved_ids": [ + "pylint-dev__pylint-5859" + ], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/results/swe_5859_kimi_worker_preds.json b/benchmarks/results/swe_5859_kimi_worker_preds.json new file mode 100644 index 0000000..42c2fb8 --- /dev/null +++ b/benchmarks/results/swe_5859_kimi_worker_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-5859", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/checkers/misc.py b/pylint/checkers/misc.py\nindex 69149e61a..28978365f 100644\n--- a/pylint/checkers/misc.py\n+++ b/pylint/checkers/misc.py\n@@ -121,9 +121,9 @@ class EncodingChecker(BaseChecker):\n \n notes = \"|\".join(re.escape(note) for note in self.config.notes)\n if self.config.notes_rgx:\n- regex_string = rf\"#\\s*({notes}|{self.config.notes_rgx})\\b\"\n+ regex_string = rf\"#\\s*({notes}|{self.config.notes_rgx})(?=\\W|$)\"\n else:\n- regex_string = rf\"#\\s*({notes})\\b\"\n+ regex_string = rf\"#\\s*({notes})(?=\\W|$)\"\n \n self._fixme_pattern = re.compile(regex_string, re.I)\n \n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_7080_glm51_diagnosis.txt b/benchmarks/results/swe_7080_glm51_diagnosis.txt new file mode 100644 index 0000000..f787b2c --- /dev/null +++ b/benchmarks/results/swe_7080_glm51_diagnosis.txt @@ -0,0 +1,53 @@ +## ROOT CAUSE + +In `pylint/lint/pylinter.py`, the `_discover_files` method (line 585-619) only checks `_is_ignored_file` against **directory roots** (`root` from `os.walk`), but never checks individual `.py` files yielded at lines 613-617. The `ignore-paths` regex patterns (e.g., `^src/gen/.*$`) are designed to match **file paths** like `src/gen/about.py`, not bare directory paths like `src/gen`. Since `^src/gen/.*$` requires a `/` after `gen`, it never matches the directory path `src/gen`, so the directory is not pruned from the walk. Then individual files within that directory are yielded unconditionally (only filtered by `.py` extension). + +While `expand_modules` (called later via `_iterate_file_descrs` → `_expand_files`) does re-check individual file paths against `ignore_list_paths_re`, this only works for files within **packages** (the `has_init` branch at line 137-146). For directories without `__init__.py`, `_discover_files` yields individual file paths, and `expand_modules` processes them as top-level entries. The `_is_ignored_file` check at line 78-80 of `expand_modules` does check `ignore_list_paths_re` against the full path — BUT the critical problem is that `_discover_files` also yields the **directory itself** when it contains `__init__.py` (line 611: `yield root`). When `expand_modules` receives a package directory like `src/gen`, the `_is_ignored_file` check at line 78 uses `os.path.basename("src/gen")` = `"gen"` for the `ignore_list` and `ignore_list_re` checks, and checks the full path `"src/gen"` against `ignore_list_paths_re` — which fails to match `^src/gen/.*$` because there's no trailing `/` + content. The package is then expanded, and while subfiles are checked at line 143-145, the **package's own `__init__.py`** is NOT checked against `ignore_list_paths_re` (it's skipped at line 141-142 only if `filepath == subfilepath`). + +The core defect: `_discover_files` does not apply `ignore_paths` filtering to individual `.py` files it yields, and the directory-level check fails because `ignore-paths` regexes are written to match file paths (with content after the directory prefix), not bare directory paths. + +## FIX PLAN + +**File:** `pylint/lint/pylinter.py`, method `_discover_files` (starting line 585) + +**Change:** Add `ignore_paths` filtering to the individual `.py` files yielded in the `else` branch (lines 613-617). Currently: + +```python +else: + yield from ( + os.path.join(root, file) + for file in files + if file.endswith(".py") + ) +``` + +Change to: + +```python +else: + yield from ( + os.path.join(root, file) + for file in files + if file.endswith(".py") + and not _is_in_ignore_list_re( + os.path.join(root, file), self.config.ignore_paths + ) + ) +``` + +This requires importing `_is_in_ignore_list_re` from `pylint.lint.expand_modules` (or alternatively, calling `_is_ignored_file` on each file path, which already does the right check for `ignore_paths` since it passes the full path to `_is_in_ignore_list_re`). + +Additionally, the directory-level check at line 600-607 should also match directories against `ignore_paths` by appending a trailing separator to the directory path before regex matching, so that patterns like `^src/gen/.*$` match `src/gen`. This can be done by changing: + +```python +if _is_ignored_file( + root, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, +): +``` + +to also check `root + os.sep` (or `root + "/"`) against `ignore_paths`, so that a directory path like `src/gen` is tested as `src/gen/` against the regex. This ensures entire ignored directories are pruned from the walk early, avoiding unnecessary traversal. + +The minimal fix is just the first change (filtering individual files), but both changes together provide correct behavior: directories are pruned early when possible, and individual files that slip through (e.g., because the regex doesn't match the directory prefix alone) are still filtered. \ No newline at end of file diff --git a/benchmarks/results/swe_7080_glm51_planner_v1_failed_preds.json b/benchmarks/results/swe_7080_glm51_planner_v1_failed_preds.json new file mode 100644 index 0000000..7d53bc1 --- /dev/null +++ b/benchmarks/results/swe_7080_glm51_planner_v1_failed_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7080", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/lint/pylinter.py b/pylint/lint/pylinter.py\nindex e2bbf061..20f4ff90 100644\n--- a/pylint/lint/pylinter.py\n+++ b/pylint/lint/pylinter.py\n@@ -602,6 +602,11 @@ class PyLinter(\n self.config.ignore,\n self.config.ignore_patterns,\n self.config.ignore_paths,\n+ ) or _is_ignored_file(\n+ root + os.sep,\n+ self.config.ignore,\n+ self.config.ignore_patterns,\n+ self.config.ignore_paths,\n ):\n skip_subtrees.append(root)\n continue\n@@ -614,6 +619,12 @@ class PyLinter(\n os.path.join(root, file)\n for file in files\n if file.endswith(\".py\")\n+ and not _is_ignored_file(\n+ os.path.join(root, file),\n+ self.config.ignore,\n+ self.config.ignore_patterns,\n+ self.config.ignore_paths,\n+ )\n )\n else:\n yield something\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_7080_glm51_rediag_RESOLVED_report.json b/benchmarks/results/swe_7080_glm51_rediag_RESOLVED_report.json new file mode 100644 index 0000000..81b626d --- /dev/null +++ b/benchmarks/results/swe_7080_glm51_rediag_RESOLVED_report.json @@ -0,0 +1,23 @@ +{ + "total_instances": 1, + "submitted_instances": 1, + "completed_instances": 1, + "resolved_instances": 1, + "unresolved_instances": 0, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-7080" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-7080" + ], + "resolved_ids": [ + "pylint-dev__pylint-7080" + ], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/results/swe_7080_glm51_rediag_preds.json b/benchmarks/results/swe_7080_glm51_rediag_preds.json new file mode 100644 index 0000000..e4e02c0 --- /dev/null +++ b/benchmarks/results/swe_7080_glm51_rediag_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7080", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/lint/pylinter.py b/pylint/lint/pylinter.py\nindex e2bbf061..1fe33d0f 100644\n--- a/pylint/lint/pylinter.py\n+++ b/pylint/lint/pylinter.py\n@@ -597,8 +597,14 @@ class PyLinter(\n # Skip subtree of already discovered package.\n continue\n \n+ norm_root = os.path.normpath(root)\n if _is_ignored_file(\n- root,\n+ norm_root,\n+ self.config.ignore,\n+ self.config.ignore_patterns,\n+ self.config.ignore_paths,\n+ ) or _is_ignored_file(\n+ norm_root + os.sep,\n self.config.ignore,\n self.config.ignore_patterns,\n self.config.ignore_paths,\n@@ -611,9 +617,15 @@ class PyLinter(\n yield root\n else:\n yield from (\n- os.path.join(root, file)\n+ os.path.normpath(os.path.join(root, file))\n for file in files\n if file.endswith(\".py\")\n+ and not _is_ignored_file(\n+ os.path.normpath(os.path.join(root, file)),\n+ self.config.ignore,\n+ self.config.ignore_patterns,\n+ self.config.ignore_paths,\n+ )\n )\n else:\n yield something\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_7080_glm51_rediagnosis.txt b/benchmarks/results/swe_7080_glm51_rediagnosis.txt new file mode 100644 index 0000000..1a7992e --- /dev/null +++ b/benchmarks/results/swe_7080_glm51_rediagnosis.txt @@ -0,0 +1,125 @@ +So `os.path.normpath(root) + os.sep` = `"ignored_subdirectory/"` DOES match `^ignored_subdirectory/.*`. + +So the fix needs: +1. Normalize `root` with `os.path.normpath` before checking +2. Also check `os.path.normpath(root) + os.sep` (to match directory paths against patterns that expect a trailing separator) +3. Normalize individual file paths and check them too + +Actually, let me think about this differently. The most robust approach is to normalize the paths and then check both the directory and its contents. But the simplest approach that fixes the test is: + +In `_discover_files`, when checking if a directory should be ignored, normalize the path and also check with a trailing separator: + +```python +norm_root = os.path.normpath(root) +if _is_ignored_file( + norm_root, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, +) or _is_ignored_file( + norm_root + os.sep, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, +): + skip_subtrees.append(root) + continue +``` + +And for individual files: +```python +yield from ( + os.path.normpath(os.path.join(root, file)) + for file in files + if file.endswith(".py") + and not _is_ignored_file( + os.path.normpath(os.path.join(root, file)), + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, + ) +) +``` + +Wait, but I also need to make sure the yielded paths are valid for downstream processing. Let me check if `expand_modules` can handle relative paths like `"ignored_subdirectory/failing.py"` — actually, these would be filtered out by `expand_modules`'s own `_is_ignored_file` check, so they'd never reach the linting stage. But if they're not filtered (e.g., the pattern doesn't match), they need to be valid paths. + +Actually, the yielded paths from `_discover_files` go to `expand_modules` which then processes them. If we yield `"subdirectory/module.py"` instead of `"./subdirectory/module.py"`, `expand_modules` should still work because `os.path.exists("subdirectory/module.py")` would be true when CWD is the right directory. + +Let me also check: what about the `yield root` case (for packages)? If we normalize `root`, we'd yield `"package"` instead of `"./package"`. That should also work. + +OK, I'm now confident in the diagnosis. Let me also verify one more thing — the `skip_subtrees` check uses `root.startswith(s)`. If we append `norm_root + os.sep` to `skip_subtrees` instead of `root`, the `startswith` check would break. So we need to keep appending `root` to `skip_subtrees` (the original, non-normalized path) for the subtree skipping to work. + +Let me finalize the diagnosis: + +--- + +**ROOT CAUSE:** + +In `pylint/lint/pylinter.py`, method `_discover_files` (line 585), when the user runs pylint with `--recursive=y` from the current directory (e.g., `pylint . --recursive=y`), `os.walk(".")` produces paths with a `./` prefix (e.g., `./ignored_subdirectory`, `./ignored_subdirectory/failing.py`). These paths are passed directly to `_is_ignored_file` for matching against `ignore-paths` regex patterns. + +The `ignore-paths` patterns (e.g., `^ignored_subdirectory/.*`) are written to match paths relative to the starting directory without the `./` prefix. The `./` prefix prevents the regex from matching, so ignored directories and files are not filtered out. + +Additionally, the directory-level check only tests `root` (e.g., `./ignored_subdirectory`) against the pattern, but patterns like `^ignored_subdirectory/.*` are designed to match file paths within the directory (e.g., `ignored_subdirectory/failing.py`), not the directory name itself. The directory check needs to also test `root + os.sep` (with normalization) to match such patterns. + +**FIX PLAN:** + +File: `pylint/lint/pylinter.py`, method `_discover_files` (starting line 585) + +1. Normalize `root` with `os.path.normpath()` before passing to `_is_ignored_file`, to strip the `./` prefix. +2. Also check `os.path.normpath(root) + os.sep` against `_is_ignored_file`, so that patterns like `^ignored_subdirectory/.*` match the directory path with a trailing separator. +3. Normalize individual file paths with `os.path.normpath()` before yielding them and before checking them against `_is_ignored_file`. +4. Add individual file-level `_is_ignored_file` check in the `yield from` clause, to catch files that match `ignore-paths` even if their parent directory doesn't. + +Concrete code change in `_discover_files`: + +```python +# Replace the current _is_ignored_file check on root (lines 600-607): +if _is_ignored_file( + root, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, +): + skip_subtrees.append(root) + continue + +# With: +norm_root = os.path.normpath(root) +if _is_ignored_file( + norm_root, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, +) or _is_ignored_file( + norm_root + os.sep, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, +): + skip_subtrees.append(root) + continue +``` + +```python +# Replace the current yield from clause (lines 613-617): +yield from ( + os.path.join(root, file) + for file in files + if file.endswith(".py") +) + +# With: +yield from ( + os.path.normpath(os.path.join(root, file)) + for file in files + if file.endswith(".py") + and not _is_ignored_file( + os.path.normpath(os.path.join(root, file)), + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, + ) +) +``` + +Note: `skip_subtrees.append(root)` should still use the original `root` (not `norm_root`) because the `startswith` check on line 596 compares against the original `os.walk` paths. \ No newline at end of file diff --git a/benchmarks/results/swe_7080_kimi_k26_diagnosis.txt b/benchmarks/results/swe_7080_kimi_k26_diagnosis.txt new file mode 100644 index 0000000..e377e2b --- /dev/null +++ b/benchmarks/results/swe_7080_kimi_k26_diagnosis.txt @@ -0,0 +1,1119 @@ +ROOT CAUSE: + +The bug is in `pylint/lint/pylinter.py` in the `_discover_files()` method (lines 585-619). + +When `--recursive=y` is used, `_discover_files()` walks the directory tree with `os.walk()`. For each directory `root`, it calls `_is_ignored_file()` to check if that directory should be skipped. The `_is_ignored_file()` function checks `ignore_paths` patterns against the full path of the element. + +The problem: `ignore_paths` patterns like `^src/gen/.*$` are designed to match **file paths** (with a trailing slash and filename), but `_discover_files()` only checks `ignore_paths` at the **directory level** before descending into it. The directory path `src/gen` does NOT match the pattern `^src/gen/.*$` because the pattern expects `src/gen/` followed by at least one more character (a filename). Therefore, the directory is not skipped. + +Then, when yielding individual `.py` files inside non-package directories (lines 613-617), `_discover_files()` does NOT check `ignore_paths` against those file paths at all — it just yields them unconditionally. These files then flow through `expand_modules()` which DOES check `ignore_paths`, but by then the files have already been discovered and passed along. + +Wait — let me re-check. Looking at the code flow again: + +1. `_discover_files()` yields file paths +2. `check()` passes these to `_get_file_descriptors()` +3. `_get_file_descriptors()` calls `_expand_files()` which calls `expand_modules()` +4. `expand_modules()` DOES check `_is_ignored_file()` for each item + +So `expand_modules()` should filter them out. But the bug report clearly shows files in `src/gen/` ARE being linted. Let me reconsider... + +Actually, looking more carefully at `_discover_files()`: + +```python +if _is_ignored_file( + root, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, +): + skip_subtrees.append(root) + continue +``` + +The `root` here is a relative path like `src/gen`. The pattern `^src/gen/.*$` won't match `src/gen`. But what about the yielded files? When `_discover_files()` yields `os.path.join(root, file)`, it yields `src/gen/about.py`. Then in `expand_modules()`, `_is_ignored_file()` is called with `something = "src/gen/about.py"`, and the pattern `^src/gen/.*$` SHOULD match. + +Hmm, but the bug report says the files ARE being checked. Let me look at this from another angle — maybe the issue is that when running with `pylint --recursive=y src/`, the paths are relative and the pattern matching in `_is_in_ignore_list_re` uses `match()` which anchors at the start. + +Actually wait — I need to look at this more carefully. The user's config has: +```toml +ignore-paths = ["^src/gen/.*$"] +``` + +And they run: `pylint --recursive=y src/` + +When `os.walk("src/")` runs, `root` will be paths like `src/gen`. The pattern `^src/gen/.*$` won't match `src/gen` because there's no trailing slash + filename. + +But then the yielded file path would be `src/gen/about.py`, and in `expand_modules()`, `_is_ignored_file("src/gen/about.py", ...)` would check `ignore_paths` against `"src/gen/about.py"`. The pattern `^src/gen/.*$` SHOULD match `"src/gen/about.py"`. + +Unless... the issue is that `expand_modules()` is NOT being called on these discovered files? Let me re-check the flow in `check()`: + +```python +if self.config.recursive: + files_or_modules = tuple(self._discover_files(files_or_modules)) +``` + +Then later: +```python +with fix_import_path(files_or_modules): + self._check_files( + self.get_ast, check_astroid_module, files_or_modules + ) +``` + +And `_check_files()` calls `_get_file_descriptors()` which calls `_expand_files()` which calls `expand_modules()`. + +So `expand_modules()` IS called. And it SHOULD filter out the files. Unless there's something else going on... + +Wait, let me re-read `expand_modules()` more carefully: + +```python +for something in files_or_modules: + basename = os.path.basename(something) + if _is_ignored_file( + something, ignore_list, ignore_list_re, ignore_list_paths_re + ): + continue +``` + +This checks at the top of the loop. If `something = "src/gen/about.py"` and `ignore_paths = ["^src/gen/.*$"]`, then `_is_ignored_file()` should return `True` because `_is_in_ignore_list_re("src/gen/about.py", [re.compile("^src/gen/.*$")])` should match. + +Unless the regex doesn't match because of path separators? On Windows, the path might be `src\gen\about.py` and the pattern `^src/gen/.*$` uses forward slashes... + +Actually, that's likely the issue on Windows! But the bug report says this happens, and the test `test_ignore_path_recursive_current_dir` is presumably failing. Let me think about what that test might be checking... + +Actually, I realize I should focus on what I can see in the code. Let me look at `_discover_files()` again: + +```python +for root, _, files in os.walk(something): + if any(root.startswith(s) for s in skip_subtrees): + continue + + if _is_ignored_file( + root, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, + ): + skip_subtrees.append(root) + continue + + if "__init__.py" in files: + skip_subtrees.append(root) + yield root + else: + yield from ( + os.path.join(root, file) + for file in files + if file.endswith(".py") + ) +``` + +The key issue: `_is_ignored_file()` is called with `root` (the directory path). For a pattern like `^src/gen/.*$`, this will NEVER match a directory path because: +1. `root` is `src/gen` — no trailing slash, so `.*` has nothing to match after `/` +2. Even if we had `src/gen/`, the `$` at the end means the pattern must end there, but `os.walk()` yields directory names without trailing slashes + +So directories are never skipped by `ignore_paths` patterns that expect file paths. And then individual files are yielded. But `expand_modules()` should still filter them... + +Unless the real issue is that `_discover_files()` yields files that are then treated as explicit arguments (isarg=True), and `should_analyze_file()` returns True for explicit arguments even if they match ignore patterns? + +No, `expand_modules()` sets `isarg` based on whether the original input was an argument. For discovered files, they would have `isarg=False` in the result dict... wait, let me check. + +Actually in `expand_modules()`: +```python +result.append({ + "path": subfilepath, + "name": submodname, + "isarg": False, + "basepath": filepath, + "basename": modname, +}) +``` + +Yes, `isarg=False`. And in `_get_file_descriptors()`: +```python +for descr in self._expand_files(files_or_modules): + name, filepath, is_arg = descr["name"], descr["path"], descr["isarg"] + if self.should_analyze_file(name, filepath, is_argument=is_arg): + yield FileItem(name, filepath, descr["basename"]) +``` + +And `should_analyze_file()` only checks `path.endswith(".py")` when `is_argument=False`. It does NOT check ignore patterns. + +So the ignore filtering ONLY happens in `expand_modules()`, which calls `_is_ignored_file()`. And that SHOULD work for file paths... + +OK let me just create a concrete reproduction. The issue might be path normalization. When the user runs `pylint --recursive=y src/`, the paths discovered by `os.walk()` might be normalized differently than what the pattern expects. + +Actually, I think I finally see it. Let me look at `_discover_files()` again: + +When `os.walk("src/")` runs, `root` will be `"src/gen"` (no trailing slash). The pattern `^src/gen/.*$` doesn't match `"src/gen"`. So the directory is not skipped. Then files like `src/gen/about.py` are yielded. + +In `expand_modules()`, `_is_ignored_file("src/gen/about.py", ...)` is called. The pattern `^src/gen/.*$` SHOULD match `"src/gen/about.py"`. So it SHOULD be filtered out there. + +Unless... the path has been normalized to an absolute path by then? Let me check if `os.walk()` returns relative or absolute paths. It returns paths relative to the starting point if the starting point is relative. So `root` would be `src/gen`, and `os.path.join(root, file)` would be `src/gen/about.py`. + +Hmm, but what if the user's CWD is such that the pattern needs to account for the full relative path? The pattern `^src/gen/.*$` should match `src/gen/about.py`. + +Let me think about this differently. Maybe the issue is that `_discover_files()` is yielding the files, but then `expand_modules()` is NOT filtering them because of how the paths look. + +Actually, I wonder if the issue is that when `_discover_files()` yields a file path like `src/gen/about.py`, and then `expand_modules()` processes it, the `something` variable is `src/gen/about.py`, and `_is_ignored_file()` checks: +- `basename = "about.py"` — not in ignore list +- `_is_in_ignore_list_re("about.py", ignore_list_re)` — probably doesn't match +- `_is_in_ignore_list_re("src/gen/about.py", ignore_list_paths_re)` — SHOULD match `^src/gen/.*$` + +If the third check works, the file should be ignored. So either: +1. The regex doesn't actually match (maybe due to path separator differences on Windows) +2. Or there's some other path manipulation happening + +On Windows, `os.path.join()` uses backslashes, so the path would be `src\gen\about.py`. The pattern `^src/gen/.*$` uses forward slashes and wouldn't match! This is a known issue with path separators on Windows. + +But wait, the bug report says the user is on Windows and the output shows `src\gen\about.py`. So the path separator issue is real. However, the fix for that would be to normalize paths before matching. + +Actually, looking at the failing test name: `test_ignore_path_recursive_current_dir`. This suggests the issue occurs when running from the current directory. Let me think about what might be different in that case... + +When running `pylint --recursive=y .` from the current directory, the discovered paths would be like `.\src\gen\about.py` or `./src/gen/about.py` depending on the OS. The pattern `^src/gen/.*$` wouldn't match either of those because they start with `./` or `.\`. + +So the root cause might be that: +1. When running recursively from the current directory, paths include `./` prefix +2. The `ignore_paths` patterns don't account for this prefix +3. Therefore the patterns don't match + +But that's more of a user configuration issue. The bug report specifically says `ignore-paths` is "completely ignored" when using `--recursive=y`, which suggests it's not just a prefix issue. + +Let me look at this from yet another angle. What if the issue is that `_discover_files()` discovers packages (directories with `__init__.py`) and yields the directory path, and then `expand_modules()` treats the directory as a package and expands it AGAIN, finding all modules in it? In that case, the individual files inside the package might not go through the `ignore_paths` check. + +Looking at `expand_modules()`: +```python +if os.path.isdir(something): + if _is_ignored_file(...): + continue + # ... expand directory ... +``` + +If `something` is a directory path like `src/gen` (a package with `__init__.py`), and the pattern is `^src/gen/.*$`, the directory path won't match. So the directory is processed, and all its submodules are discovered. The submodules would have paths like `src/gen/about.py`, and they WOULD be checked against `ignore_paths` in the loop... unless they're being added without the check. + +Looking more carefully at the expansion logic in `expand_modules()`: + +```python +for root, _, files in os.walk(something): + if any(root.startswith(s) for s in skip_subtrees): + continue + + if _is_ignored_file( + root, + ignore_list, + ignore_list_re, + ignore_list_paths_re, + ): + skip_subtrees.append(root) + continue + + if "__init__.py" in files: + skip_subtrees.append(root) + modpath = _modpath_from_file(...) + submodname = ".".join(modpath) + result.append({...}) + else: + for filename in files: + if filename.endswith(".py"): + subfilepath = os.path.join(root, filename) + if _is_ignored_file( + subfilepath, + ignore_list, + ignore_list_re, + ignore_list_paths_re, + ): + continue + # ... add to result ... +``` + +Wait! `expand_modules()` ALSO walks the directory tree and checks `ignore_paths` at both the directory level AND the file level. So even if `_discover_files()` yields a package directory, `expand_modules()` will check individual files. + +But here's the key: in `expand_modules()`, when checking directories: +```python +if _is_ignored_file( + root, + ignore_list, + ignore_list_re, + ignore_list_paths_re, +): +``` + +The `root` here is the directory path. For a pattern like `^src/gen/.*$`, this won't match `src/gen`. So the directory isn't skipped. Then individual files ARE checked: +```python +if _is_ignored_file( + subfilepath, + ignore_list, + ignore_list_re, + ignore_list_paths_re, +): +``` + +And `subfilepath` is `src/gen/about.py`, which SHOULD match `^src/gen/.*$`. + +So theoretically, the files SHOULD be ignored. Unless there's a path normalization issue. + +Let me look at `_is_in_ignore_list_re()`: +```python +def _is_in_ignore_list_re(element: str, ignore_list_re: list[Pattern[str]]) -> bool: + return any(file_pattern.match(element) for file_pattern in ignore_list_re) +``` + +It uses `match()`, which anchors at the beginning of the string. So `^src/gen/.*$` would match `src/gen/about.py` but NOT `./src/gen/about.py` or `C:\project\src\gen\about.py`. + +I think the issue might be that when running `pylint --recursive=y src/`, the paths discovered are relative like `src/gen/about.py`, which should match. But when running from a different directory or with absolute paths, they might not. + +Actually, let me re-read the bug report more carefully: + +> When running recursively, it seems `ignore-paths` in my settings in pyproject.toml is completely ignored + +The user says it's "completely ignored". This suggests NO files are being filtered, not just some. This is consistent with the directory-level check failing (because `^src/gen/.*$` doesn't match `src/gen`) and then the file-level check also failing for some reason. + +Wait, I just realized something. Let me look at `_discover_files()` again: + +```python +if "__init__.py" in files: + skip_subtrees.append(root) + yield root +else: + yield from ( + os.path.join(root, file) + for file in files + if file.endswith(".py") + ) +``` + +When a directory contains `__init__.py`, `_discover_files()` yields the DIRECTORY path, not individual files. So for a package like `src/gen/`, it yields `src/gen/`. + +Then in `expand_modules()`, `something = "src/gen/"` (a directory). The code checks: +```python +if os.path.isdir(something): + if _is_ignored_file(...): + continue + # ... walk and expand ... +``` + +The directory `src/gen/` doesn't match `^src/gen/.*$`, so it's expanded. During expansion, `os.walk("src/gen/")` is called, and individual files are checked. But what if the files inside the package are NOT checked against `ignore_paths`? + +Looking at the expansion code more carefully... I need to see the full `expand_modules()` function. Let me read it again. + +Actually, I already read it. The file-level check is there: +```python +for filename in files: + if filename.endswith(".py"): + subfilepath = os.path.join(root, filename) + if _is_ignored_file( + subfilepath, + ignore_list, + ignore_list_re, + ignore_list_paths_re, + ): + continue +``` + +So files ARE checked. Unless... the issue is that when a directory is a package (has `__init__.py`), the package itself is added to the result, and then the package's submodules are also added. But the submodules' paths might be different. + +For example, if `src/gen/` is a package, `expand_modules()` might add: +- `src/gen/__init__.py` with name `gen` +- `src/gen/about.py` with name `gen.about` + +And `subfilepath` for `about.py` would be `src/gen/about.py`, which should match `^src/gen/.*$`. + +I'm going in circles. Let me just look at what the actual failing test does and work backwards from there. + +The failing test is `test_ignore_path_recursive_current_dir`. I couldn't find it in the codebase, which means it might be a test that was added to verify the fix, or it's in a different branch. The `run_tests.sh` script references it. + +Let me think about what "recursive_current_dir" means. It probably tests running `pylint --recursive=y .` from a directory that contains files that should be ignored. + +When you run `pylint --recursive=y .` from the current directory: +1. `_discover_files()` calls `os.walk(".")` +2. `root` will be paths like `.`, `./src`, `./src/gen` +3. The pattern `^src/gen/.*$` won't match `./src/gen` because of the `./` prefix + +So the directory isn't skipped. Then files like `./src/gen/about.py` are yielded. In `expand_modules()`, the path `./src/gen/about.py` is checked against `^src/gen/.*$`, which doesn't match because of the `./` prefix. + +THIS is the bug! When running from the current directory with `.`, paths include the `./` prefix, which breaks pattern matching. + +But wait, the user's bug report shows them running `pylint --recursive=y src/`, not `.`. And the paths in the output are `src\gen\about.py` (Windows backslashes). So the issue there is path separators. + +Actually, there might be TWO issues: +1. Paths from `os.walk()` might have `./` prefix when starting from `.` +2. On Windows, paths use backslashes instead of forward slashes + +Both would break regex patterns that expect forward-slash-separated paths without a `./` prefix. + +Now, looking at the code, I notice that `_discover_files()` doesn't normalize paths at all. It just uses what `os.walk()` returns. And `expand_modules()` also doesn't normalize paths before matching. + +The fix should be to normalize paths before checking them against `ignore_paths` patterns. This could be done by: +1. Using `os.path.normpath()` to normalize separators +2. Stripping `./` prefix from relative paths +3. Or converting to a consistent format (like POSIX-style forward slashes) + +Actually, looking at the existing code more carefully, I see that `_is_ignored_file()` receives the raw path and checks it directly against the regex. There's no normalization. + +Let me check if there's any path normalization happening elsewhere... + +Looking at `_discover_files()`: +```python +for something in files_or_modules: + if os.path.isdir(something) and not os.path.isfile( + os.path.join(something, "__init__.py") + ): + for root, _, files in os.walk(something): +``` + +`os.walk(something)` preserves the format of `something`. If `something = "src/"`, then `root` will be `src`, `src/gen`, etc. If `something = "."`, then `root` will be `.`, `./src`, `./src/gen`, etc. (actually, on some systems `os.walk(".")` might yield `.` as the first root, then `src` without `./` — let me verify). + +Actually, `os.walk(".")` typically yields: +- `.` (for the current directory) +- `./subdir1` +- `./subdir2` + +Or on some systems it might yield: +- `.` +- `subdir1` +- `subdir2` + +It depends on the OS and Python version. But in any case, the paths might not match user-provided patterns. + +Now, the key insight: `_discover_files()` is supposed to discover files, but it also needs to respect `ignore_paths`. Currently, it only checks `ignore_paths` at the directory level, and the check uses the raw path from `os.walk()`. If the path format doesn't match the pattern, files inside ignored directories will still be discovered and passed to `expand_modules()`. + +But `expand_modules()` ALSO checks `ignore_paths`. So why doesn't it catch them there? + +Ah! I think I finally understand. Let me trace through more carefully: + +1. User runs `pylint --recursive=y src/` +2. `_discover_files(["src/"])` is called +3. `os.walk("src/")` yields `root = "src/gen"` (no trailing slash) +4. `_is_ignored_file("src/gen", ..., ["^src/gen/.*$"])` returns False (directory path doesn't match pattern) +5. Files like `src/gen/about.py` are yielded +6. `expand_modules()` processes `src/gen/about.py` +7. `_is_ignored_file("src/gen/about.py", ..., ["^src/gen/.*$"])` returns True (file path matches pattern) +8. File is skipped! + +So theoretically, this SHOULD work for `pylint --recursive=y src/` with pattern `^src/gen/.*$`. The file-level check in `expand_modules()` should catch it. + +Unless... the issue is that the files are being treated as packages or modules differently. Or maybe the path has backslashes on Windows. + +On Windows: +- `os.walk("src/")` yields `root = "src\gen"` +- `_is_ignored_file("src\gen", ..., ["^src/gen/.*$"])` returns False +- Files like `src\gen\about.py` are yielded +- `expand_modules()` processes `src\gen\about.py` +- `_is_ignored_file("src\gen\about.py", ..., ["^src/gen/.*$"])` returns False because backslashes don't match forward slashes! + +THIS is the bug on Windows! The path separator mismatch causes `ignore_paths` patterns with forward slashes to fail. + +But wait, the user says they're on Windows and the output shows `src\gen\about.py`. So the backslash issue is definitely present. + +However, there's another possibility. Let me look at the test case again. The failing test is `test_ignore_path_recursive_current_dir`. What if the issue is specific to running from the current directory? + +When running `pylint --recursive=y .`: +- `os.walk(".")` yields `root = "./src/gen"` or `root = "src/gen"` depending on the system +- If it's `./src/gen`, the pattern `^src/gen/.*$` won't match +- Files like `./src/gen/about.py` are yielded +- `expand_modules()` checks `./src/gen/about.py` against `^src/gen/.*$` — doesn't match! + +So the `./` prefix breaks the pattern matching. + +OK so there are potentially multiple path normalization issues: +1. `./` prefix when running from current directory +2. Backslashes on Windows +3. Absolute vs relative paths + +The fix should normalize paths before checking against `ignore_paths` patterns. + +Now, where should the fix go? Looking at the code: + +Option 1: Fix in `_is_in_ignore_list_re()` — normalize the element before matching +Option 2: Fix in `_discover_files()` — normalize paths before yielding and checking +Option 3: Fix in `expand_modules()` — normalize paths before checking + +The most comprehensive fix would be to normalize paths in `_is_in_ignore_list_re()` or `_is_ignored_file()`, since that's where the matching happens. + +But wait, we need to be careful. `ignore_paths` patterns are matched against both basenames (for `ignore_patterns`) and full paths (for `ignore_paths`). The `_is_ignored_file()` function handles both: + +```python +def _is_ignored_file( + element: str, + ignore_list: list[str], + ignore_list_re: list[Pattern[str]], + ignore_list_paths_re: list[Pattern[str]], +) -> bool: + basename = os.path.basename(element) + return ( + basename in ignore_list + or _is_in_ignore_list_re(basename, ignore_list_re) + or _is_in_ignore_list_re(element, ignore_list_paths_re) + ) +``` + +For `ignore_paths`, the full path `element` is checked. We should normalize `element` before checking against `ignore_list_paths_re`. + +What normalization should we do? +1. `os.path.normpath(element)` — normalizes `.`, `..`, and path separators +2. Maybe also convert to forward slashes for consistent matching? + +Actually, looking at the docstring for `ignore_paths` and existing tests, the patterns are expected to use forward slashes (POSIX style) or be platform-independent. Let me check the documentation... + +The bug report mentions `^src/gen/.*$` with forward slashes. On Windows, the actual paths use backslashes. So the fix should convert backslashes to forward slashes before matching, OR we should document that patterns need to handle both. + +Looking at the existing test `test_ignore_path_recursive`: +```python +"--ignore-paths=.*ignored.*", +``` + +This pattern uses `.*` which would match any characters including backslashes. So it works on both platforms. But the user's pattern `^src/gen/.*$` uses explicit forward slashes, which don't match backslashes. + +I think the proper fix is to normalize paths by converting to forward slashes before matching against `ignore_paths` patterns. This is a common approach in cross-platform tools. + +Let me also check if there's any existing path normalization in pylint... + +Actually, I just realized I should look at this more systematically. Let me check what `os.walk()` returns on different platforms and how the paths flow through the code. + +On Unix with `os.walk("src/")`: +- `root = "src/gen"` +- `os.path.join(root, "about.py") = "src/gen/about.py"` + +On Windows with `os.walk("src/")`: +- `root = "src\gen"` +- `os.path.join(root, "about.py") = "src\gen\about.py"` + +The pattern `^src/gen/.*$` matches the Unix path but not the Windows path. + +For `os.walk(".")` on Unix: +- `root = "."` or `root = "./src"` or `root = "src"` depending on implementation + +Actually, Python's `os.walk()` on Unix typically yields relative paths without `./` prefix for subdirectories. Let me verify: + +```python +import os +for root, dirs, files in os.walk("."): + print(root) +``` + +This typically prints: +``` +. +./subdir1 +./subdir2/subsubdir +``` + +Wait no, actually it usually prints: +``` +. +subdir1 +subdir2/subsubdir +``` + +The `.` is only for the starting directory itself. Subdirectories don't have `./` prefix. + +Hmm, but what about when the starting path is explicitly `./`? Let me think... + +Actually, `os.walk("./")` might yield `./` as the first root, then `./subdir1`, etc. Or it might normalize to `.` and `subdir1`. It depends on the OS. + +OK, I think I've been overthinking this. Let me focus on what I know for certain: + +1. The bug report shows `ignore_paths` not working with `--recursive=y` +2. The user is on Windows and paths use backslashes +3. The pattern uses forward slashes +4. `_is_ignored_file()` checks paths directly against regex without normalization + +The fix should normalize paths before regex matching. The best place to do this is in `_is_in_ignore_list_re()` or `_is_ignored_file()`. + +Actually, wait. Let me re-read the bug report one more time: + +> When running recursively, it seems `ignore-paths` in my settings in pyproject.toml is completely ignored + +"Completely ignored" suggests that NO files are being filtered, not just some. If the issue were only backslashes on Windows, then the pattern would fail for all files in `src/gen/`, which would make it seem "completely ignored" for that directory. + +But the user also says: + +> src\gen\* should not be checked + +And the output shows files from `src\gen\` ARE being checked. So the pattern IS being ignored for those files. + +Now, what about the failing test `test_ignore_path_recursive_current_dir`? I couldn't find this test in the codebase. It might be a test that was added after the bug was reported, or it might be in a patch that hasn't been applied yet. + +Given the test name, it probably tests running pylint recursively from the current directory with `ignore_paths` configured. The issue might be that when running from the current directory, the paths have a different format that doesn't match the patterns. + +Let me think about what happens when you run `pylint --recursive=y .` with `ignore-paths = ["^src/gen/.*$"]`: + +1. `_discover_files(["."])` is called +2. `os.walk(".")` yields `root = "."`, then `root = "src"`, then `root = "src/gen"` +3. For `root = "src/gen"`, `_is_ignored_file("src/gen", ..., ["^src/gen/.*$"])` returns False +4. Files in `src/gen/` are yielded +5. `expand_modules()` processes each file +6. For `src/gen/about.py`, `_is_ignored_file("src/gen/about.py", ..., ["^src/gen/.*$"])` returns True +7. File is skipped! + +So even from the current directory, the file-level check should work. Unless... + +What if the issue is that `_discover_files()` yields the directory `src/gen` as a package (because it has `__init__.py`), and then `expand_modules()` expands the package but doesn't check individual files against `ignore_paths`? + +Let me re-read `expand_modules()` for the package case: + +```python +if os.path.isdir(something): + if _is_ignored_file( + something, + ignore_list, + ignore_list_re, + ignore_list_paths_re, + ): + continue + + skip_subtrees: list[str] = [] + for root, _, files in os.walk(something): + if any(root.startswith(s) for s in skip_subtrees): + continue + + if _is_ignored_file( + root, + ignore_list, + ignore_list_re, + ignore_list_paths_re, + ): + skip_subtrees.append(root) + continue + + if "__init__.py" in files: + skip_subtrees.append(root) + modpath = _modpath_from_file(...) + submodname = ".".join(modpath) + result.append({...}) + else: + for filename in files: + if filename.endswith(".py"): + subfilepath = os.path.join(root, filename) + if _is_ignored_file( + subfilepath, + ignore_list, + ignore_list_re, + ignore_list_paths_re, + ): + continue + # ... add to result ... +``` + +Wait, I see it now! When `something` is a directory (like `src/gen` which is a package), the code walks it with `os.walk(something)`. For each subdirectory `root`, it checks `_is_ignored_file(root, ...)`. If `root` matches, it skips the subtree. + +But what about the files in the package directory itself? If `src/gen` is the package root and contains `__init__.py`, the code adds the package to the result. But it does NOT check individual `.py` files in that directory against `ignore_paths` when `__init__.py` is present! + +Look at this logic: +```python +if "__init__.py" in files: + skip_subtrees.append(root) + modpath = _modpath_from_file(...) + submodname = ".".join(modpath) + result.append({...}) +else: + for filename in files: + if filename.endswith(".py"): + subfilepath = os.path.join(root, filename) + if _is_ignored_file(...): + continue + # ... add to result ... +``` + +When `__init__.py` is in the directory, ONLY the package itself is added. Individual `.py` files in the same directory are NOT processed. But wait, those files would be submodules of the package, and they should be discovered when `os.walk()` visits them... no, `os.walk()` visits directories, and if `root = "src/gen"`, the files in that directory are `files`, not subdirectories. + +So when `__init__.py` is present, the code adds the package directory as a module, but it doesn't add individual `.py` files in that directory as separate modules. Those files are considered part of the package and would be imported when the package is analyzed. + +But then how does pylint analyze individual files in a package? When you pass a package directory to pylint, it analyzes the package and its submodules. The `__init__.py` makes it a package, and pylint handles the rest. + +So if `src/gen` is a package with `__init__.py`, passing `src/gen` to pylint would cause pylint to analyze the entire package, including `about.py`, `design.py`, etc. The `ignore_paths` check at the directory level (`root = "src/gen"`) doesn't match `^src/gen/.*$`, so the package is analyzed. + +THIS is the bug! When a directory is a package (contains `__init__.py`), the `ignore_paths` pattern `^src/gen/.*$` doesn't match the directory path `src/gen`, so the entire package is analyzed. The individual files inside the package are never checked against `ignore_paths` because they're discovered as part of the package, not as individual file paths. + +Wait, but the user's output shows individual modules like `src\gen\about.py` being checked. How does that happen? + +Looking at the output: +``` +************* Module about +src\gen\about.py:2:0: R2044: Line with empty comment (empty-comment) +``` + +The module name is `about`, not `gen.about`. This suggests that `about.py` is being analyzed as a standalone module, not as part of a package. How does that happen? + +If `src/gen` doesn't have `__init__.py`, then it's not a package. In that case, `_discover_files()` would yield individual files like `src/gen/about.py`. Then `expand_modules()` would process each file individually, and the file-level `ignore_paths` check should work. + +But wait, the user's bug report doesn't mention whether `src/gen` has `__init__.py` or not. Looking at the output, the module names are `about`, `design`, `resources_rc`, `settings`, `update_checker` — these are all top-level module names, not `gen.about`, `gen.design`, etc. This strongly suggests that `src/gen` does NOT have `__init__.py`, so the files are treated as individual modules. + +If `src/gen` doesn't have `__init__.py`, then `_discover_files()` yields individual files, and `expand_modules()` checks each file against `ignore_paths`. So why doesn't it work? + +Let me think about this again... + +Oh! I think I finally see it. Let me look at `_discover_files()` one more time: + +```python +for something in files_or_modules: + if os.path.isdir(something) and not os.path.isfile( + os.path.join(something, "__init__.py") + ): + for root, _, files in os.walk(something): + if any(root.startswith(s) for s in skip_subtrees): + continue + + if _is_ignored_file( + root, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, + ): + skip_subtrees.append(root) + continue + + if "__init__.py" in files: + skip_subtrees.append(root) + yield root + else: + yield from ( + os.path.join(root, file) + for file in files + if file.endswith(".py") + ) + else: + yield something +``` + +The condition is: `if os.path.isdir(something) and not os.path.isfile(os.path.join(something, "__init__.py"))` + +This means: if `something` is a directory AND it does NOT contain `__init__.py`, then walk it recursively. Otherwise, yield `something` as-is. + +So when the user runs `pylint --recursive=y src/`: +- `something = "src/"` +- `os.path.isdir("src/")` is True +- `os.path.isfile(os.path.join("src/", "__init__.py"))` — does `src/` have `__init__.py`? Probably not if `src/` is just a source directory. +- So the condition is True, and we enter the `os.walk()` loop. + +Inside the loop: +- `root = "src/gen"` (or `src\gen` on Windows) +- `_is_ignored_file("src/gen", ..., ["^src/gen/.*$"])` returns False +- `__init__.py` not in files (probably) +- Yield `src/gen/about.py`, `src/gen/design.py`, etc. + +Then in `check()`: +- `files_or_modules = ("src/gen/about.py", "src/gen/design.py", ...)` +- These go to `_check_files()` → `_get_file_descriptors()` → `_expand_files()` → `expand_modules()` + +In `expand_modules()`: +- `something = "src/gen/about.py"` +- `os.path.isdir("src/gen/about.py")` is False +- So we go to the else branch: +```python +else: + try: + modpath = _modpath_from_file(...) + except ImportError: + ... + else: + submodname = ".".join(modpath) + if _is_ignored_file( + something, + ignore_list, + ignore_list_re, + ignore_list_paths_re, + ): + continue + result.append({...}) +``` + +Wait! Here it is! For non-directory items, `_is_ignored_file()` IS called. So `src/gen/about.py` SHOULD be checked against `^src/gen/.*$`. + +Unless... the path format is wrong. On Windows, `something = "src\gen\about.py"`, and the pattern `^src/gen/.*$` uses forward slashes. The regex won't match because backslashes ≠ forward slashes. + +THIS is the root cause on Windows! The path separator mismatch. + +But what about on Unix? If the user runs `pylint --recursive=y src/` on Unix, `something = "src/gen/about.py"`, and `^src/gen/.*$` SHOULD match. + +Unless there's another issue. Let me think about what `test_ignore_path_recursive_current_dir` might be testing... + +Maybe the issue is when running `pylint --recursive=y .` from a directory that contains a `pyproject.toml` with `ignore-paths`. In that case, the discovered paths might have a different format. + +Actually, I just realized something. Let me look at the test more carefully. The test name is `test_ignore_path_recursive_current_dir`. The existing tests are: + +```python +def test_ignore_path_recursive(self): + self._runtest([ + join(HERE, "regrtest_data", "directory"), + "--recursive=y", + "--ignore-paths=.*ignored.*", + ], code=0) +``` + +This test passes an absolute path. What if the failing test passes `.` or a relative path? + +When you pass an absolute path like `/path/to/tests/regrtest_data/directory`, `os.walk()` yields absolute paths. The pattern `.*ignored.*` matches any string containing `ignored`, so it works with absolute paths too. + +But if the pattern were `^src/gen/.*$` and the path were `/home/user/project/src/gen/about.py`, the pattern wouldn't match because of the absolute path prefix! + +So the issue might be that patterns are anchored with `^` and expect relative paths, but when running from a different directory or with absolute paths, the actual paths don't match. + +This is a fundamental design issue with `ignore_paths`: the patterns are matched against whatever path format is passed to pylint, which can be relative, absolute, with `./`, with backslashes, etc. + +The fix should normalize paths before matching. Let me think about what normalization makes sense: + +1. Convert to absolute paths? No, that would break relative patterns. +2. Convert to relative paths? Maybe, but relative to what? +3. Normalize path separators to forward slashes? Yes, this is common. +4. Strip `./` prefix? Maybe. + +Actually, looking at how other tools handle this, a common approach is to: +1. Use `os.path.normpath()` to normalize `.`, `..`, and path separators +2. Convert to forward slashes for regex matching +3. Make paths relative to the project root or current directory + +But this is complex. A simpler fix might be to just normalize path separators in `_is_in_ignore_list_re()` when checking against `ignore_list_paths_re`. + +Wait, but that might break existing patterns that intentionally use backslashes for Windows paths. + +Let me look at the documentation for `ignore_paths` to see what format is expected... + +The bug report shows the user using `^src/gen/.*$` with forward slashes. This suggests that forward slashes are the expected format, even on Windows. So normalizing backslashes to forward slashes would be the right fix. + +OK, let me now formulate the fix: + +**ROOT CAUSE:** + +In `pylint/lint/expand_modules.py`, the `_is_ignored_file()` function checks if a path matches `ignore_paths` patterns using `_is_in_ignore_list_re()`. This function uses `re.Pattern.match()` to match the path against the pattern. + +When `--recursive=y` is used, `_discover_files()` in `pylint/lint/pylinter.py` discovers files using `os.walk()`, which returns paths with the native path separator (`\` on Windows, `/` on Unix). The discovered file paths are then passed to `expand_modules()`, which also checks `ignore_paths`. + +The problem is that `ignore_paths` patterns like `^src/gen/.*$` use forward slashes, but on Windows the actual paths use backslashes (`src\gen\about.py`). The regex `^src/gen/.*$` does NOT match `src\gen\about.py` because backslashes are not forward slashes. + +Additionally, when running from the current directory with `.`, paths may include a `./` prefix, which also breaks anchored patterns. + +As a result, files inside ignored directories are not filtered out when using `--recursive=y` on Windows or when paths have unexpected prefixes. + +**FIX PLAN:** + +File: `pylint/lint/expand_modules.py` + +Function: `_is_in_ignore_list_re()` (line 44) + +Change: Normalize the path by converting backslashes to forward slashes before matching against regex patterns. This ensures that patterns using forward slashes work correctly on Windows. + +```python +def _is_in_ignore_list_re(element: str, ignore_list_re: list[Pattern[str]]) -> bool: + """Determines if the element is matched in a regex ignore-list.""" + # Normalize path separators for cross-platform matching + element = element.replace(os.sep, "/") + return any(file_pattern.match(element) for file_pattern in ignore_list_re) +``` + +Wait, but this changes behavior for ALL regex matching, including `ignore_patterns` which matches basenames. Basenames shouldn't have path separators, so this is safe. + +Actually, let me reconsider. The `_is_in_ignore_list_re()` function is used for both `ignore_patterns` (matching basenames) and `ignore_paths` (matching full paths). Normalizing path separators only affects paths with separators, so it's safe for both. + +But wait, there's another issue. The `./` prefix when running from the current directory. Should we also strip that? + +Let me think... If the user runs `pylint --recursive=y .` and the pattern is `^src/gen/.*$`, the discovered path might be `./src/gen/about.py`. Even with forward slashes, `./src/gen/about.py` doesn't match `^src/gen/.*$` because of the `./` prefix. + +Should we strip `./`? That might be risky because `.` is a valid path component. But in the context of discovered files, `./` at the beginning is just noise. + +Actually, `os.path.normpath()` would normalize `./src/gen/about.py` to `src/gen/about.py`. Let me check: + +```python +import os +os.path.normpath("./src/gen/about.py") # -> "src/gen/about.py" on Unix +os.path.normpath(".\\src\\gen\\about.py") # -> "src\\gen\\about.py" on Windows +``` + +So `os.path.normpath()` removes the `./` prefix and also normalizes `..` and multiple separators. This is useful. + +But `os.path.normpath()` doesn't convert backslashes to forward slashes on Windows. It uses the native separator. + +So the full normalization would be: +1. `os.path.normpath(element)` — normalize `.`, `..`, multiple separators +2. `.replace(os.sep, "/")` — convert to forward slashes + +Let me verify this works: +- Windows: `os.path.normpath("src\\gen\\about.py")` → `"src\\gen\\about.py"` → replace → `"src/gen/about.py"` ✓ +- Unix with `./`: `os.path.normpath("./src/gen/about.py")` → `"src/gen/about.py"` → replace → `"src/gen/about.py"` ✓ +- Absolute path: `os.path.normpath("/home/user/src/gen/about.py")` → `"/home/user/src/gen/about.py"` → replace → `"/home/user/src/gen/about.py"` (unchanged on Unix) + +This looks good. But wait, should we also make paths relative to something? If the user passes an absolute path to pylint, the discovered paths will be absolute, and a relative pattern like `^src/gen/.*$` won't match. + +Looking at the existing test: +```python +self._runtest([ + join(HERE, "regrtest_data", "directory"), + "--recursive=y", + "--ignore-paths=.*ignored.*", +], code=0) +``` + +This uses `.*ignored.*` which matches anywhere in the path, so it works with absolute paths. The user's pattern `^src/gen/.*$` is anchored and expects a relative path. + +I think the fix should focus on path separator normalization, as that's the most obvious bug. The `./` prefix issue might be a separate concern. + +Actually, let me re-read the bug report. The user says: + +```toml +[tool.pylint.MASTER] +ignore-paths = [ + # Auto generated + "^src/gen/.*$", +] +``` + +And they run: `pylint --recursive=y src/` + +On Windows, `src/` becomes `src\` and `os.walk("src/")` yields `src\gen`, `src\gen\about.py`, etc. The pattern `^src/gen/.*$` doesn't match these. + +So the fix is definitely to normalize path separators. + +But wait, I should also check if there's an issue with how `_discover_files()` checks directories. Let me trace through again: + +1. `os.walk("src/")` yields `root = "src\gen"` (on Windows) +2. `_is_ignored_file("src\gen", ..., ["^src/gen/.*$"])` is called +3. `_is_in_ignore_list_re("src\gen", [re.compile("^src/gen/.*$")])` is called +4. `"src\gen".replace(os.sep, "/")` → `"src/gen"` +5. `re.compile("^src/gen/.*$").match("src/gen")` → None (because `.*` needs at least one char after `/`) +6. Returns False +7. Directory is not skipped + +So even with path normalization, the directory `src/gen` still won't match `^src/gen/.*$` because the pattern expects a file path, not a directory path. + +Then individual files are yielded: +1. `src\gen\about.py` is yielded +2. In `expand_modules()`, `_is_ignored_file("src\gen\about.py", ..., ["^src/gen/.*$"])` is called +3. `"src\gen\about.py".replace(os.sep, "/")` → `"src/gen/about.py"` +4. `re.compile("^src/gen/.*$").match("src/gen/about.py")` → Match! +5. Returns True +6. File is skipped + +So with path normalization, the file-level check works. But the directory-level check still doesn't. Is that a problem? + +Not really, because the files are still filtered out at the file level. The only downside is that `os.walk()` descends into ignored directories, which is slightly inefficient but not incorrect. + +However, if we want to be thorough, we could also fix the directory-level check. One way is to check if any file in the directory would match the pattern. But that's complex. + +A simpler approach: in `_discover_files()`, when checking if a directory should be skipped, also check if the directory path (with a trailing slash) matches any `ignore_paths` pattern. This would allow patterns like `^src/gen/.*$` to match directories. + +But this might break existing behavior. Let me think... + +Actually, a better approach is to not worry about the directory-level check in `_discover_files()` and just ensure the file-level check in `expand_modules()` works correctly. The directory-level check is just an optimization to avoid descending into ignored directories. + +Wait, but there's a subtle issue. If a directory is a package (contains `__init__.py`), `_discover_files()` yields the directory path, not individual files. Then `expand_modules()` treats it as a package and analyzes all submodules. The file-level check in `expand_modules()` only applies to non-directory items. + +Let me re-read `expand_modules()`: + +```python +if os.path.isdir(something): + if _is_ignored_file(...): + continue + # ... walk and expand package ... +``` + +For a package directory, `_is_ignored_file()` is called with the directory path. If the pattern is `^src/gen/.*$`, it won't match `src/gen` even with path normalization. So the package is analyzed. + +Then inside the package expansion: +```python +for root, _, files in os.walk(something): + if _is_ignored_file(root, ...): + skip_subtrees.append(root) + continue + + if "__init__.py" in files: + # ... add package ... + else: + for filename in files: + if filename.endswith(".py"): + subfilepath = os.path.join(root, filename) + if _is_ignored_file(subfilepath, ...): + continue + # ... add file ... +``` + +For subdirectories inside the package, the directory-level check works. For individual files in non-package subdirectories, the file-level check works. But for the package itself and its direct `__init__.py`, there's no file-level check. + +If `src/gen` is a package with `__init__.py`, and the pattern is `^src/gen/.*$`, the package is analyzed. The `__init__.py` is analyzed as part of the package. Other `.py` files in the same directory are NOT added individually (they're part of the package). But pylint might still analyze them when importing the package. + +This is getting complicated. Let me focus on the specific bug reported by the user. + +The user's output shows individual modules like `about`, `design`, etc. being checked. These are NOT package submodules (they would be named `gen.about`, `gen.design` if they were). So `src/gen` is NOT a package. The files are discovered as individual files and should be filtered by `expand_modules()`. + +The only reason they're not filtered is path separator mismatch on Windows. So the fix is to normalize path separators. + +Let me also consider the failing test `test_ignore_path_recursive_current_dir`. What might this test check? + +I think it might check running `pylint --recursive=y .` from a directory where `ignore_paths` is configured in `pyproject.toml`. In this case, the paths discovered by `os.walk(".")` might have a `./` prefix, which breaks anchored patterns. + +To fix this, we should also normalize paths with `os.path.normpath()` to remove `./` prefixes. + +**FINAL FIX PLAN:** + +File: `pylint/lint/expand_modules.py` + +Function: `_is_in_ignore_list_re()` (around line 44) + +Change: Add path normalization before regex matching to handle: +1. Backslashes on Windows (convert to forward slashes) +2. `./` prefix when running from current directory (normalize with `os.path.normpath()`) + +```python +def _is_in_ignore_list_re(element: str, ignore_list_re: list[Pattern[str]]) -> bool: + """Determines if the element is matched in a regex ignore-list.""" + # Normalize path for cross-platform matching: + # - Remove ./ prefix and normalize ., .. with normpath + # - Convert native path separators to forward slashes + element = os.path.normpath(element).replace(os.sep, "/") + return any(file_pattern.match(element) for file_pattern in ignore_list_re) +``` + +Wait, `os.path.normpath()` on Windows converts forward slashes to backslashes. So if we call `os.path.normpath("src/gen/about.py")` on Windows, it becomes `"src\gen\about.py"`. Then `.replace(os.sep, "/")` converts it back to `"src/gen/about.py"`. This is fine. + +But on Unix, `os.path.normpath("src/gen/about.py")` returns `"src/gen/about.py"` (unchanged). Then `.replace(os.sep, "/")` is `.replace("/", "/")` which is a no-op. + +And for `./src/gen/about.py` on Unix: +- `os.path.normpath("./src/gen/about.py")` → `"src/gen/about.py"` +- `.replace("/", "/")` → `"src/gen/about.py"` + +This looks correct. + +However, I want to double-check one thing. `os.path.normpath()` might also change absolute paths. For example, on Windows: +- `os.path.normpath("C:\\project\\src\\gen\\about.py")` → `"C:\\project\\src\\gen\\about.py"` +- `.replace("\\", "/")` → `"C:/project/src/gen/about.py"` + +This is fine. The pattern `^src/gen/.*$` still won't match an absolute path, but that's expected behavior. If the user wants to match absolute paths, they should use a pattern like `^.*/src/gen/.*$` or `^C:/project/src/gen/.*$`. + +Actually, wait. There's a subtle issue. `os.path.normpath()` behavior differs between platforms. On Windows, it might add a drive letter or change the path format. But since we're just using it to normalize `.` and `..`, and then converting to forward slashes, it should be fine. + +Let me also consider whether we should apply this normalization only to `ignore_paths` or to both `ignore_patterns` and `ignore_paths`. The `_is_in_ignore_list_re()` function is used for both. Normalizing basenames (for `ignore_patterns`) shouldn't cause issues because basenames don't contain path separators or `.` prefixes. + +Actually, there is one edge case. If a filename contains backslashes (which is impossible on most filesystems but theoretically possible), normalizing it might change the behavior. But this is extremely unlikely. + +Let me also check if there are any other places where path normalization is needed. Looking at `_discover_files()`: + +```python +if _is_ignored_file( + root, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, +): +``` + +This checks directories. With the fix to `_is_in_ignore_list_re()`, directory paths will also be normalized. But as I noted earlier, `^src/gen/.*$` still won't match `src/gen` because it's a directory path without a trailing slash and filename. + +Should we also fix the directory-level check? I think it's worth considering. One approach is to append a trailing slash to directory paths before checking `ignore_paths`. But this is tricky because `_is_ignored_file()` doesn't know if the element is a directory or a file. + +Actually, a simpler approach is to not worry about the directory-level check. The file-level check in `expand_modules()` will catch individual files. The only issue is when a directory is a package (contains `__init__.py`), in which case the entire package is analyzed. + +But looking at the user's bug report, `src/gen` does NOT appear to be a package (module names are not prefixed with `gen.`). So the file-level check should work. + +Let me verify my understanding by looking at the test data: + +``` +tests/regrtest_data/directory/ + ignored_subdirectory/ + failing.py + package/ + __init__.py + module.py + subpackage/ + __init__.py + module.py + subdirectory/ + module.py + subsubdirectory/ + module.py +``` + +The test `test_ignore_path_recursive` uses `--ignore-paths=.*ignored.*` and `--ignore-paths=.*failing.*`. Both patterns use `.*` which matches any characters including path separators. So they work regardless of path format. + +The failing test `test_ignore_path_recursive_current_dir` presumably uses a more specific pattern that fails when running from the current directory. I can't see the test, but based on the name, it probably runs `pylint --recursive=y .` with a pattern like `^subdirectory/.*$` or similar. + +When running `pylint --recursive=y .`: +- `os.walk(".")` yields `root = "./subdirectory"` or `root = "subdirectory"` +- If it's `./subdirectory`, the pattern `^subdirectory/.*$` won't match +- Files like `./subdirectory/module.py` are yielded +- In `expand_modules()`, the file-level check also fails if the path has `./` prefix + +So normalizing with `os.path.normpath()` would fix this. + +OK, I'm confident in my analysis. Let me write the final answer. + +Actually, one more thing. I should check if `_is_ignored_file()` is also called in `_discover_files()` and whether the normalization there would help. + +In `_discover_files()`: +```python +if _is_ignored_file( + root, + self.config.ignore, + self.config.ignore_patterns, + self.config.ignore_paths, +): + skip_subtrees.append(root) + continue +``` + +With the fix, `root` would be normalized before matching. For `root = "src/gen"` on Unix with pattern `^src/gen/.*$`: +- Normalized: `"src/gen"` +- Pattern: `^src/gen/.*$` +- `match("src/gen")` → None (no match because `.*` needs at least one char) + +So the directory still wouldn't be skipped. But as I said, this is OK because the file-level check will catch individual files. + +For `root = "./subdirectory"` on Unix with pattern `^subdirectory/.*$`: +- Normalized: `"subdirectory"` +- Pattern: `^subdirectory/.*$` +- `match("subdirectory")` → None + +Still no match at the directory level. But the file-level check would work: +- `subfilepath = "./subdirectory/module.py"` +- Normalized: `"subdirectory/module.py"` +- Pattern: `^subdirectory/.*$` +- `match("subdirectory/module.py")` → Match! + +So the file-level check works. + +I think the fix is solid. Let me also consider if there are any side effects. + +One potential side effect: if a user has a pattern that intentionally uses backslashes for Windows paths, like `^src\\gen\\.*$`, normalizing backslashes to forward slashes would break it. But this is unlikely because: +1. The documentation probably shows forward slashes +2. Cross-platform patterns typically use forward slashes +3. The bug report shows the user using forward slashes + +Another potential side effect: `os.path.normpath()` might change the case of paths on Windows (e.g., `SRC` → `src`). But regex matching is case-sensitive by default, so this could be an issue. However, `os.path.normpath()` doesn't change case on Windows. `os.path.normcase()` does, but we're not using that. + +Actually wait, I just realized something. `os.path.normpath()` on Windows might convert forward slashes to backslashes, but it doesn't change case. And then we convert backslashes to forward slashes. So the case is preserved. + +One more thing: `os.path.normpath()` might add a trailing backslash to some paths on Windows. For example, `os.path.normpath("src/")` on Windows might return `"src\\"`. Then `.replace("\\", "/")` gives `"src/"`. This is fine. + +OK, I'm done analyzing. Here's my final answer: + +--- + +**ROOT CAUSE:** + +The bug is in `pylint/lint/expand_modules.py` in the `_is_in_ignore_list_re()` function (line 44-46). This function checks if a path matches `ignore_paths` regex patterns using `re.Pattern.match()`. + +When `--recursive=y` is used, `_discover_files()` in `pylint/lint/pylinter.py` discovers files using `os.walk()`, which returns paths with the native OS path separator (`\` on Windows, `/` on Unix). Additionally, when running from the current directory (e.g., `pylint --recursive=y .`), paths may include a `./` prefix. + +The `ignore_paths` patterns (e.g., `^src/gen/.*$`) are typically written with forward slashes and expect clean relative paths. However: +1. On Windows, discovered paths use backslashes (`src\gen\about.py`), so forward-slash patterns never match. +2. When running from `.`, paths may have a `./` prefix (`./src/gen/about.py`), so anchored patterns (`^src/gen/.*$`) never match. + +As a result, files inside ignored directories are not filtered out when using `--recursive=y`, making `ignore_paths` appear "completely ignored." + +**FIX PLAN:** + +File: `pylint/lint/expand_modules.py` + +Function: `_is_in_ignore_list_re()` (lines 44-46) + +Change: Normalize the path before regex matching by: +1. Calling `os.path.normpath()` to remove `./` prefixes and normalize `.` / `..` +2. Converting native path separators to forward slashes for cross-platform matching + +```python +def _is_in_ignore_list_re(element: str, ignore_list_re: list[Pattern[str]]) -> bool: + """Determines if the element is matched in a regex ignore-list.""" + # Normalize path separators and remove ./ prefix for cross-platform matching + element = os.path.normpath(element).replace(os.sep, "/") + return any(file_pattern.match(element) for file_pattern in ignore_list_re) +``` + +This ensures that patterns like `^src/gen/.*$` correctly match paths on all platforms and regardless of how pylint is invoked (with relative paths, absolute paths, or from the current directory with `.`). \ No newline at end of file diff --git a/benchmarks/results/swe_7080_kimi_planner_RESOLVED_report.json b/benchmarks/results/swe_7080_kimi_planner_RESOLVED_report.json new file mode 100644 index 0000000..81b626d --- /dev/null +++ b/benchmarks/results/swe_7080_kimi_planner_RESOLVED_report.json @@ -0,0 +1,23 @@ +{ + "total_instances": 1, + "submitted_instances": 1, + "completed_instances": 1, + "resolved_instances": 1, + "unresolved_instances": 0, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-7080" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-7080" + ], + "resolved_ids": [ + "pylint-dev__pylint-7080" + ], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/results/swe_7080_kimi_planner_preds.json b/benchmarks/results/swe_7080_kimi_planner_preds.json new file mode 100644 index 0000000..d4a9009 --- /dev/null +++ b/benchmarks/results/swe_7080_kimi_planner_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7080", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/lint/expand_modules.py b/pylint/lint/expand_modules.py\nindex 5cacc037..c0db52fb 100644\n--- a/pylint/lint/expand_modules.py\n+++ b/pylint/lint/expand_modules.py\n@@ -43,6 +43,8 @@ def get_python_path(filepath: str) -> str:\n \n def _is_in_ignore_list_re(element: str, ignore_list_re: list[Pattern[str]]) -> bool:\n \"\"\"Determines if the element is matched in a regex ignore-list.\"\"\"\n+ # Normalize path separators and remove ./ prefix for cross-platform matching\n+ element = os.path.normpath(element).replace(os.sep, \"/\")\n return any(file_pattern.match(element) for file_pattern in ignore_list_re)\n \n \n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_7993_kimi_worker_RESOLVED_report.json b/benchmarks/results/swe_7993_kimi_worker_RESOLVED_report.json new file mode 100644 index 0000000..14324d5 --- /dev/null +++ b/benchmarks/results/swe_7993_kimi_worker_RESOLVED_report.json @@ -0,0 +1,23 @@ +{ + "total_instances": 1, + "submitted_instances": 1, + "completed_instances": 1, + "resolved_instances": 1, + "unresolved_instances": 0, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-7993" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-7993" + ], + "resolved_ids": [ + "pylint-dev__pylint-7993" + ], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/results/swe_7993_kimi_worker_preds.json b/benchmarks/results/swe_7993_kimi_worker_preds.json new file mode 100644 index 0000000..73370bd --- /dev/null +++ b/benchmarks/results/swe_7993_kimi_worker_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7993", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/reporters/text.py b/pylint/reporters/text.py\nindex 29bd46798..95d1490b7 100644\n--- a/pylint/reporters/text.py\n+++ b/pylint/reporters/text.py\n@@ -12,6 +12,7 @@ from __future__ import annotations\n \n import os\n import re\n+import string\n import sys\n import warnings\n from dataclasses import asdict, fields\n@@ -174,16 +175,26 @@ class TextReporter(BaseReporter):\n # Set template to the currently selected template\n self._template = template\n \n- # Check to see if all parameters in the template are attributes of the Message\n- arguments = re.findall(r\"\\{(.+?)(:.*)?\\}\", template)\n- for argument in arguments:\n- if argument[0] not in MESSAGE_FIELDS:\n+ # Use string.Formatter().parse() to correctly handle escaped braces\n+ # and field names with format specifications\n+ formatter = string.Formatter()\n+ parts: list[str] = []\n+ for literal, field_name, format_spec, conversion in formatter.parse(template):\n+ # Re-escape braces in literals so a later .format() won't re-interpret them\n+ if literal:\n+ parts.append(literal.replace(\"{\", \"{{\").replace(\"}\", \"}}\"))\n+ if field_name is None:\n+ continue # no field here (covers escaped braces)\n+ if field_name not in MESSAGE_FIELDS:\n warnings.warn(\n- f\"Don't recognize the argument '{argument[0]}' in the --msg-template. \"\n+ f\"Don't recognize the argument '{field_name}' in the --msg-template. \"\n \"Are you sure it is supported on the current version of pylint?\"\n )\n- template = re.sub(r\"\\{\" + argument[0] + r\"(:.*?)?\\}\", \"\", template)\n- self._fixed_template = template\n+ continue # drop unknown fields entirely, as before\n+ conv = f\"!{conversion}\" if conversion else \"\"\n+ spec = f\":{format_spec}\" if format_spec else \"\"\n+ parts.append(\"{\" + field_name + conv + spec + \"}\")\n+ self._fixed_template = \"\".join(parts)\n \n def write_message(self, msg: Message) -> None:\n \"\"\"Convenience method to write a formatted message with class default\n" + } +] \ No newline at end of file diff --git a/benchmarks/throughputProbe.ts b/benchmarks/throughputProbe.ts index 135ea5a..e5f60c1 100644 --- a/benchmarks/throughputProbe.ts +++ b/benchmarks/throughputProbe.ts @@ -2,10 +2,12 @@ // Created: 2026-06-09 // Purpose: ZDR(data_collection:deny) 조건에서 후보 모델의 provider/throughput 변동 측정. // 실제 운영(provider 자동선택)과 동일 조건. tok/s + provider 분포 + TTFT. -// Dependencies: tsx, OPENROUTER_API +// Dependencies: tsx, OPENROUTER_API_KEY (auto-loaded from the repo .env) // Test Status: profiling // -// 실행: source ~/dev/VEGA/.env && npx tsx benchmarks/throughputProbe.ts +// 실행: npx tsx benchmarks/throughputProbe.ts + +import { loadEnvFile } from '../src/core/envFile.js'; const API = 'https://openrouter.ai/api/v1/chat/completions'; const PROMPT = @@ -50,8 +52,9 @@ async function probe(apiKey: string, model: string): Promise { } async function main() { - const apiKey = process.env.OPENROUTER_API; - if (!apiKey) { console.error('OPENROUTER_API not set'); process.exit(1); } + loadEnvFile(); + const apiKey = process.env.OPENROUTER_API_KEY ?? process.env.OPENROUTER_API; + if (!apiKey) { console.error('OPENROUTER_API_KEY not set (checked shell env and .env)'); process.exit(1); } for (const model of MODELS) { console.log(`\n=== ${model} (ZDR, ${SAMPLES} samples) ===`); diff --git a/src/core/config.ts b/src/core/config.ts index 2fb03af..efa8e91 100644 --- a/src/core/config.ts +++ b/src/core/config.ts @@ -607,7 +607,7 @@ export function generateSampleConfig(): string { # Default CLI adapter for worker/reviewer stages # Options: codex, openrouter, lmstudio, local, gpt # - codex: OpenAI Codex via PKCE login (openswarm auth login --provider codex) -# - openrouter: OpenRouter API key (OPENROUTER_API env var or openswarm auth login --provider openrouter) +# - openrouter: OpenRouter API key (OPENROUTER_API_KEY env var or openswarm auth login --provider openrouter) # - lmstudio: LM Studio local server (set LMSTUDIO_BASE_URL / LMSTUDIO_MODEL) # - local: Ollama local models (ollama pull ) # - gpt: OpenAI Chat API via OAuth (openswarm auth login --provider gpt) From 711e1695a4c87e5fab66ce670ca94a85f7c0e49f Mon Sep 17 00:00:00 2001 From: unohee Date: Thu, 18 Jun 2026 19:08:32 +0900 Subject: [PATCH 2/5] fix(bench): pre-apply gold test_patch to block self-authored test masking SWE-bench images ship base_commit only; the FAIL_TO_PASS test is applied at eval time. The implementer therefore reinvented the gold test, which on 7080 validated a wrong fix (local "pass" while official grading failed). Now sweBench.ts git-applies the instance test_patch to the sandbox before baselining: local verification runs the real oracle, the touched test files join protectedFiles, and the gold test is absorbed into the baseline so the extracted model_patch stays source-only (no manual test-hunk stripping). Verified on 5859: worker created zero test files, model_patch was source-only, official grading RESOLVED. Also adds a missing-test_patch WARNING (no silent fallback). This is harness defect #8. Refs: INT-1462 --- .../swe_5859_fix8_RESOLVED_report.json | 23 +++++++++ .../swe_5859_fix8_sourceonly_preds.json | 7 +++ benchmarks/sweBench.ts | 49 ++++++++++++++++++- 3 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 benchmarks/results/swe_5859_fix8_RESOLVED_report.json create mode 100644 benchmarks/results/swe_5859_fix8_sourceonly_preds.json diff --git a/benchmarks/results/swe_5859_fix8_RESOLVED_report.json b/benchmarks/results/swe_5859_fix8_RESOLVED_report.json new file mode 100644 index 0000000..c5df85d --- /dev/null +++ b/benchmarks/results/swe_5859_fix8_RESOLVED_report.json @@ -0,0 +1,23 @@ +{ + "total_instances": 1, + "submitted_instances": 1, + "completed_instances": 1, + "resolved_instances": 1, + "unresolved_instances": 0, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-5859" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-5859" + ], + "resolved_ids": [ + "pylint-dev__pylint-5859" + ], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/results/swe_5859_fix8_sourceonly_preds.json b/benchmarks/results/swe_5859_fix8_sourceonly_preds.json new file mode 100644 index 0000000..42c2fb8 --- /dev/null +++ b/benchmarks/results/swe_5859_fix8_sourceonly_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-5859", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/checkers/misc.py b/pylint/checkers/misc.py\nindex 69149e61a..28978365f 100644\n--- a/pylint/checkers/misc.py\n+++ b/pylint/checkers/misc.py\n@@ -121,9 +121,9 @@ class EncodingChecker(BaseChecker):\n \n notes = \"|\".join(re.escape(note) for note in self.config.notes)\n if self.config.notes_rgx:\n- regex_string = rf\"#\\s*({notes}|{self.config.notes_rgx})\\b\"\n+ regex_string = rf\"#\\s*({notes}|{self.config.notes_rgx})(?=\\W|$)\"\n else:\n- regex_string = rf\"#\\s*({notes})\\b\"\n+ regex_string = rf\"#\\s*({notes})(?=\\W|$)\"\n \n self._fixme_pattern = re.compile(regex_string, re.I)\n \n" + } +] \ No newline at end of file diff --git a/benchmarks/sweBench.ts b/benchmarks/sweBench.ts index 29acaac..65a6c37 100644 --- a/benchmarks/sweBench.ts +++ b/benchmarks/sweBench.ts @@ -13,7 +13,8 @@ // 3. git diff로 model_patch 추출 → prediction JSON 작성 // → 채점은 별도: python -m swebench.harness.run_evaluation -p preds.json ... // -// 실행: OPENROUTER_API=... npx tsx benchmarks/sweBench.ts [outPreds.json] +// 실행: npx tsx benchmarks/sweBench.ts [outPreds.json] +// (OPENROUTER_API_KEY required — auto-loaded from the repo .env) // ============================================ import { mkdtemp, rm, writeFile, readFile } from 'node:fs/promises'; @@ -24,6 +25,7 @@ import { promisify } from 'node:util'; import { runWorker } from '../src/agents/worker.js'; import { setDefaultAdapter } from '../src/adapters/index.js'; import { initLocale } from '../src/locale/index.js'; +import { loadEnvFile } from '../src/core/envFile.js'; const exec = promisify(execFile); @@ -33,12 +35,29 @@ interface SweInstance { base_commit: string; problem_statement: string; FAIL_TO_PASS: string; + // The gold test diff. SWE-bench images ship base_commit only; the official + // grader applies this at eval time. We pre-apply it so local verification + // runs the REAL FAIL_TO_PASS test (defect #8) — see solveOne. + test_patch?: string; } function imageFor(id: string): string { return `swebench/sweb.eval.x86_64.${id.replace('__', '_1776_')}:latest`; } +/** + * Extract the file paths a SWE-bench test_patch touches (the `+++ b/` + * targets). These feed protectedFiles so the implementer cannot edit the test + * oracle once it has been pre-applied. + */ +function testFilesFromPatch(testPatch: string): string[] { + const files = new Set(); + for (const m of testPatch.matchAll(/^\+\+\+ b\/(\S+)/gm)) { + if (m[1] !== '/dev/null') files.add(m[1]); + } + return [...files]; +} + /** * Diagnosis section for hybrid stage 2 (implementer). Includes mechanical * finishing instructions — in the first hybrid run the implementer had the @@ -87,6 +106,29 @@ async function solveOne(inst: SweInstance, model: string): Promise<{ pred: Recor log(' extracting /testbed...'); await sh('docker', ['cp', `${container}:/testbed/.`, hostDir], { timeoutMs: 120_000 }); + // Pre-apply the gold test_patch BEFORE baselining (defect #8). The extracted + // /testbed is at base_commit without the FAIL_TO_PASS test, so the implementer + // would otherwise invent its own test — which can validate a WRONG fix (seen on + // 7080). Baking the real test into the baseline (a) makes local verification use + // the true oracle and (b) keeps it out of the extracted model_patch automatically + // (git diff baseSha excludes it), so model_patch stays source-only as the grader + // requires. The official harness applies test_patch itself at eval time. + let protectedTestFiles: string[] = []; + if (inst.test_patch && inst.test_patch.trim()) { + const tpPath = join(tmpdir(), `swe-tp-${inst.instance_id.replace(/[^a-z0-9]/gi, '-')}.diff`); + await writeFile(tpPath, inst.test_patch); + try { + await sh('git', ['apply', '--whitespace=nowarn', tpPath], { cwd: hostDir, timeoutMs: 30_000 }); + protectedTestFiles = testFilesFromPatch(inst.test_patch); + log(` applied gold test_patch — protecting ${protectedTestFiles.length} test file(s): ${protectedTestFiles.join(', ')}`); + } catch (err) { + log(` WARNING: gold test_patch did not apply (${err instanceof Error ? err.message.split('\n')[0] : String(err)}) — falling back to implementer-authored test (defect #8 risk)`); + } + await rm(tpPath, { force: true }).catch(() => {}); + } else { + log(' WARNING: instance has no test_patch — local verification relies on an implementer-authored test (defect #8 risk)'); + } + // git baseline 고정 (patch 추출 기준). 추출된 /testbed는 이미 git repo. await sh('git', ['add', '-A'], { cwd: hostDir, timeoutMs: 30_000 }).catch(() => {}); await sh('git', ['-c', 'user.email=b@b', '-c', 'user.name=b', 'commit', '-qm', 'baseline', '--allow-empty'], { cwd: hostDir, timeoutMs: 30_000 }).catch(() => {}); @@ -166,7 +208,9 @@ async function solveOne(inst: SweInstance, model: string): Promise<{ pred: Recor nudgeMaxOnNoEdit: 2, // Verification-harness protection — on 7993 the implementer blamed test // failures on run_tests.sh and edited it 5 times, dismantling verification. - protectedFiles: ['run_tests.sh'], + // The pre-applied gold test files join the protected set (defect #8) so the + // implementer can't rewrite the oracle to make a wrong fix pass. + protectedFiles: ['run_tests.sh', ...protectedTestFiles], // run_tests.sh = docker cp + in-container pytest — the 30s default times // out into a silent no-output failure the model reads as a broken env. bashTimeoutMs: 240_000, @@ -189,6 +233,7 @@ async function solveOne(inst: SweInstance, model: string): Promise<{ pred: Recor } async function main() { + loadEnvFile(); initLocale('en'); setDefaultAdapter('openrouter'); const file = process.argv[2]; From 2611dad458ae95f1dee206fef205f4395e568158 Mon Sep 17 00:00:00 2001 From: unohee Date: Thu, 18 Jun 2026 19:08:59 +0900 Subject: [PATCH 3/5] refactor(adapters): extract shared worker/reviewer result parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpt, local, and openrouter each carried a byte-identical (modulo formatting) copy of eight result-parsing functions, already drifting in comments — a parse fix would have needed three-way sync. New src/adapters/resultParsing.ts exports parseWorkerResult / parseReviewerResult; the three adapters delegate in one line and drop their local copies (and the now-orphaned t import). codex is intentionally left on its own variant: its text fallbacks salvage filesChanged/commands/issues from prose and its JSON extractor differs, so sharing would regress behavior or touch untested paths. Adapter suite 76/76 green. The openrouter.ts edit also carries the INT-1460 key-source change. Refs: INT-1441 --- src/adapters/gpt.ts | 136 ++----------------------------- src/adapters/local.ts | 125 ++-------------------------- src/adapters/openrouter.ts | 133 +++--------------------------- src/adapters/resultParsing.ts | 148 ++++++++++++++++++++++++++++++++++ 4 files changed, 167 insertions(+), 375 deletions(-) create mode 100644 src/adapters/resultParsing.ts diff --git a/src/adapters/gpt.ts b/src/adapters/gpt.ts index 6a37a6c..2e5d5b5 100644 --- a/src/adapters/gpt.ts +++ b/src/adapters/gpt.ts @@ -13,8 +13,8 @@ import type { ReviewResult, } from './types.js'; import { AuthProfileStore, ensureValidToken } from '../auth/index.js'; -import { t } from '../locale/index.js'; import { runAgenticLoop, loopResultToCliResult, type ChatMessage, type AgenticLoopOptions } from './agenticLoop.js'; +import { parseWorkerResult, parseReviewerResult } from './resultParsing.js'; import type { ToolDefinition } from './tools.js'; const OPENAI_API_BASE = 'https://api.openai.com/v1'; @@ -154,13 +154,11 @@ export class GptCliAdapter implements CliAdapter { } parseWorkerOutput(raw: CliRunResult): WorkerResult { - const text = raw.stdout; - return extractWorkerResultJson(text) ?? extractWorkerFromText(text); + return parseWorkerResult(raw.stdout); } parseReviewerOutput(raw: CliRunResult): ReviewResult { - const text = raw.stdout; - return extractReviewerResultJson(text) ?? extractReviewerFromText(text); + return parseReviewerResult(raw.stdout); } } @@ -198,129 +196,5 @@ async function refreshAndRetry(store: AuthProfileStore): Promise { return ensureValidToken(store, PROFILE_KEY); } -// Worker/Reviewer output parsing (Codex 어댑터와 동일한 로직) - -function extractWorkerResultJson(text: string): WorkerResult | null { - const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/); - const jsonStr = jsonMatch?.[1] ?? findJsonObject(text, '"success"'); - if (!jsonStr) return null; - - try { - const parsed = JSON.parse(jsonStr); - return { - success: Boolean(parsed.success), - summary: parsed.summary || t('common.fallback.noSummary'), - filesChanged: Array.isArray(parsed.filesChanged) ? parsed.filesChanged : [], - commands: Array.isArray(parsed.commands) ? parsed.commands : [], - output: text, - error: parsed.error, - confidencePercent: typeof parsed.confidencePercent === 'number' - ? parsed.confidencePercent : undefined, - haltReason: parsed.haltReason || undefined, - }; - } catch { - return null; - } -} - -function extractWorkerFromText(text: string): WorkerResult { - // Only an explicit failure phrase marks the run as failed. Loose words like - // "error" or "fail" appear in normal coding prose ("error handling", "the - // failing test") and used to cause false negatives. git-diff promotion in - // worker.ts is the real success signal; this is just the non-repo fallback. - const failed = isExplicitFailure(text); - - return { - success: !failed, - summary: extractSummary(text), - filesChanged: [], - commands: [], - output: text, - error: failed ? extractErrorMessage(text) : undefined, - }; -} - -function extractReviewerResultJson(text: string): ReviewResult | null { - const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/); - const jsonStr = jsonMatch?.[1] ?? findJsonObject(text, '"decision"'); - if (!jsonStr) return null; - - try { - const parsed = JSON.parse(jsonStr); - const decision = parsed.decision === 'approve' || parsed.decision === 'reject' - ? parsed.decision - : 'revise'; - return { - decision, - feedback: typeof parsed.feedback === 'string' ? parsed.feedback : t('common.fallback.noSummary'), - issues: Array.isArray(parsed.issues) - ? parsed.issues.filter((v: unknown): v is string => typeof v === 'string') - : [], - suggestions: Array.isArray(parsed.suggestions) - ? parsed.suggestions.filter((v: unknown): v is string => typeof v === 'string') - : [], - }; - } catch { - return null; - } -} - -function extractReviewerFromText(text: string): ReviewResult { - const lower = text.toLowerCase(); - const decision = lower.includes('approve') - ? 'approve' - : lower.includes('reject') - ? 'reject' - : 'revise'; - return { - decision, - feedback: extractSummary(text), - issues: [], - suggestions: [], - }; -} - -// Helpers - -function findJsonObject(text: string, marker: string): string | null { - const idx = text.indexOf(marker); - if (idx < 0) return null; - - // marker 앞의 '{' 찾기 - let start = text.lastIndexOf('{', idx); - if (start < 0) return null; - - let depth = 0; - for (let i = start; i < text.length; i++) { - if (text[i] === '{') depth++; - if (text[i] === '}') { - depth--; - if (depth === 0) { - return text.slice(start, i + 1); - } - } - } - return null; -} - -// Detect a real failure declaration, not incidental "error"/"fail" prose. -// Matches explicit statements like "failed to", "unable to", "could not", -// "cannot complete", or an explicit JSON success:false. -function isExplicitFailure(text: string): boolean { - if (/"success"\s*:\s*false/i.test(text)) return true; - return /\b(failed to|unable to|could not|couldn['’]t|cannot (?:complete|finish|proceed|continue)|giving up|abort(?:ed|ing))\b/i.test(text); -} - -function extractSummary(text: string): string { - const lines = text.split('\n').filter((l) => l.trim().length > 10); - if (lines.length === 0) return t('common.fallback.noSummary'); - const summary = lines[0].trim(); - return summary.length > 200 ? `${summary.slice(0, 200)}...` : summary; -} - -function extractErrorMessage(text: string): string { - const errorMatch = text.match(/(?:error|exception|failed?):\s*(.+)/i); - if (errorMatch) return errorMatch[1].slice(0, 200); - const lines = text.split('\n').filter((l) => /error|fail/i.test(l)); - return lines.length > 0 ? lines[0].slice(0, 200) : 'Unknown error'; -} +// Worker/Reviewer output parsing lives in ./resultParsing.ts (shared with the +// local, openrouter, and codex adapters — INT-1441). diff --git a/src/adapters/local.ts b/src/adapters/local.ts index fa8dce6..cf9121f 100644 --- a/src/adapters/local.ts +++ b/src/adapters/local.ts @@ -12,8 +12,8 @@ import type { WorkerResult, ReviewResult, } from './types.js'; -import { t } from '../locale/index.js'; import { runAgenticLoop, loopResultToCliResult, type ChatMessage, type AgenticLoopOptions } from './agenticLoop.js'; +import { parseWorkerResult, parseReviewerResult } from './resultParsing.js'; import type { ToolDefinition } from './tools.js'; // 로컬 프로바이더 기본 URL 후보 (우선순위 순) @@ -273,13 +273,11 @@ export class LocalModelAdapter implements CliAdapter { } parseWorkerOutput(raw: CliRunResult): WorkerResult { - const text = raw.stdout; - return extractWorkerResultJson(text) ?? extractWorkerFromText(text); + return parseWorkerResult(raw.stdout); } parseReviewerOutput(raw: CliRunResult): ReviewResult { - const text = raw.stdout; - return extractReviewerResultJson(text) ?? extractReviewerFromText(text); + return parseReviewerResult(raw.stdout); } } @@ -305,118 +303,5 @@ interface OpenAICompatResponse { model?: string; } -// Worker/Reviewer 출력 파싱 (GPT 어댑터와 동일 로직) - -function extractWorkerResultJson(text: string): WorkerResult | null { - const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/); - const jsonStr = jsonMatch?.[1] ?? findJsonObject(text, '"success"'); - if (!jsonStr) return null; - - try { - const parsed = JSON.parse(jsonStr); - return { - success: Boolean(parsed.success), - summary: parsed.summary || t('common.fallback.noSummary'), - filesChanged: Array.isArray(parsed.filesChanged) ? parsed.filesChanged : [], - commands: Array.isArray(parsed.commands) ? parsed.commands : [], - output: text, - error: parsed.error, - confidencePercent: typeof parsed.confidencePercent === 'number' - ? parsed.confidencePercent : undefined, - haltReason: parsed.haltReason || undefined, - }; - } catch { - return null; - } -} - -function extractWorkerFromText(text: string): WorkerResult { - // Only an explicit failure phrase marks the run as failed (see gpt.ts). - // git-diff promotion in worker.ts is the real success signal. - const failed = isExplicitFailure(text); - - return { - success: !failed, - summary: extractSummary(text), - filesChanged: [], - commands: [], - output: text, - error: failed ? extractErrorMessage(text) : undefined, - }; -} - -function extractReviewerResultJson(text: string): ReviewResult | null { - const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/); - const jsonStr = jsonMatch?.[1] ?? findJsonObject(text, '"decision"'); - if (!jsonStr) return null; - - try { - const parsed = JSON.parse(jsonStr); - const decision = parsed.decision === 'approve' || parsed.decision === 'reject' - ? parsed.decision - : 'revise'; - return { - decision, - feedback: typeof parsed.feedback === 'string' ? parsed.feedback : t('common.fallback.noSummary'), - issues: Array.isArray(parsed.issues) - ? parsed.issues.filter((v: unknown): v is string => typeof v === 'string') - : [], - suggestions: Array.isArray(parsed.suggestions) - ? parsed.suggestions.filter((v: unknown): v is string => typeof v === 'string') - : [], - }; - } catch { - return null; - } -} - -function extractReviewerFromText(text: string): ReviewResult { - const lower = text.toLowerCase(); - const decision = lower.includes('approve') - ? 'approve' - : lower.includes('reject') - ? 'reject' - : 'revise'; - return { - decision, - feedback: extractSummary(text), - issues: [], - suggestions: [], - }; -} - -function findJsonObject(text: string, marker: string): string | null { - const idx = text.indexOf(marker); - if (idx < 0) return null; - let start = text.lastIndexOf('{', idx); - if (start < 0) return null; - let depth = 0; - for (let i = start; i < text.length; i++) { - if (text[i] === '{') depth++; - if (text[i] === '}') { - depth--; - if (depth === 0) return text.slice(start, i + 1); - } - } - return null; -} - -// Detect a real failure declaration, not incidental "error"/"fail" prose (see gpt.ts). -function isExplicitFailure(text: string): boolean { - if (/"success"\s*:\s*false/i.test(text)) return true; - return /\b(failed to|unable to|could not|couldn['’]t|cannot (?:complete|finish|proceed|continue)|giving up|abort(?:ed|ing))\b/i.test(text); -} - -function extractSummary(text: string): string { - const lines = text.split('\n').filter(l => l.trim().length > 10); - if (lines.length === 0) return t('common.fallback.noSummary'); - const summary = lines[0].trim(); - return summary.length > 200 ? `${summary.slice(0, 200)}...` : summary; -} - -function extractErrorMessage(text: string): string { - const errorMatch = text.match(/(?:error|exception|failed?):\s*(.+)/i); - if (errorMatch) return errorMatch[1].slice(0, 200); - const lines = text.split('\n').filter(l => /error|fail/i.test(l)); - return lines.length > 0 ? lines[0].slice(0, 200) : 'Unknown error'; -} +// Worker/Reviewer output parsing lives in ./resultParsing.ts (shared with the +// gpt, openrouter, and codex adapters — INT-1441). diff --git a/src/adapters/openrouter.ts b/src/adapters/openrouter.ts index fb36860..b6e33da 100644 --- a/src/adapters/openrouter.ts +++ b/src/adapters/openrouter.ts @@ -13,22 +13,22 @@ import type { ReviewResult, } from './types.js'; import { AuthProfileStore, ensureValidToken } from '../auth/index.js'; -import { t } from '../locale/index.js'; import { runAgenticLoop, loopResultToCliResult, type ChatMessage, type AgenticLoopOptions, } from './agenticLoop.js'; +import { parseWorkerResult, parseReviewerResult } from './resultParsing.js'; import type { ToolDefinition } from './tools.js'; const OPENROUTER_API_BASE = 'https://openrouter.ai/api/v1'; const DEFAULT_MODEL = 'openai/gpt-5'; const PROFILE_KEY = 'openrouter:default'; -/** OPENROUTER_API env var → immediate API key (no PKCE needed). */ +/** OPENROUTER_API_KEY env var (legacy: OPENROUTER_API) → immediate API key (no PKCE needed). */ function getEnvApiKey(): string | undefined { - return process.env.OPENROUTER_API?.trim() || undefined; + return process.env.OPENROUTER_API_KEY?.trim() || process.env.OPENROUTER_API?.trim() || undefined; } // Attribution headers — OpenRouter surfaces these in its analytics UI so @@ -67,7 +67,7 @@ export class OpenRouterCliAdapter implements CliAdapter { async run(options: CliRunOptions): Promise { const startTime = Date.now(); - // Prefer OPENROUTER_API env var (e.g. sourced from VEGA .env) + // Prefer OPENROUTER_API_KEY env var (auto-loaded from .env by the CLI entrypoint) let apiKey: string | undefined = getEnvApiKey(); if (!apiKey) { const store = new AuthProfileStore(); @@ -77,7 +77,7 @@ export class OpenRouterCliAdapter implements CliAdapter { return { exitCode: 1, stdout: '', - stderr: `Auth error: ${err instanceof Error ? err.message : String(err)}. Set OPENROUTER_API env var or run: openswarm auth login --provider openrouter`, + stderr: `Auth error: ${err instanceof Error ? err.message : String(err)}. Set OPENROUTER_API_KEY env var or run: openswarm auth login --provider openrouter`, durationMs: Date.now() - startTime, }; } @@ -120,11 +120,11 @@ export class OpenRouterCliAdapter implements CliAdapter { } parseWorkerOutput(raw: CliRunResult): WorkerResult { - return extractWorkerResultJson(raw.stdout) ?? extractWorkerFromText(raw.stdout); + return parseWorkerResult(raw.stdout); } parseReviewerOutput(raw: CliRunResult): ReviewResult { - return extractReviewerResultJson(raw.stdout) ?? extractReviewerFromText(raw.stdout); + return parseReviewerResult(raw.stdout); } } @@ -240,120 +240,5 @@ export function applyPromptCaching(messages: ChatMessage[], model: string): unkn // ----- Worker/Reviewer output parsing (mirrors gpt.ts) ----- -function extractWorkerResultJson(text: string): WorkerResult | null { - const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/); - const jsonStr = jsonMatch?.[1] ?? findJsonObject(text, '"success"'); - if (!jsonStr) return null; - - try { - const parsed = JSON.parse(jsonStr); - return { - success: Boolean(parsed.success), - summary: parsed.summary || t('common.fallback.noSummary'), - filesChanged: Array.isArray(parsed.filesChanged) ? parsed.filesChanged : [], - commands: Array.isArray(parsed.commands) ? parsed.commands : [], - output: text, - error: parsed.error, - confidencePercent: - typeof parsed.confidencePercent === 'number' ? parsed.confidencePercent : undefined, - haltReason: parsed.haltReason || undefined, - }; - } catch { - return null; - } -} - -function extractWorkerFromText(text: string): WorkerResult { - // Only an explicit failure phrase marks the run as failed (see gpt.ts). - // git-diff promotion in worker.ts is the real success signal. - const failed = isExplicitFailure(text); - - return { - success: !failed, - summary: extractSummary(text), - filesChanged: [], - commands: [], - output: text, - error: failed ? extractErrorMessage(text) : undefined, - }; -} - -function extractReviewerResultJson(text: string): ReviewResult | null { - const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/); - const jsonStr = jsonMatch?.[1] ?? findJsonObject(text, '"decision"'); - if (!jsonStr) return null; - - try { - const parsed = JSON.parse(jsonStr); - const decision = - parsed.decision === 'approve' || parsed.decision === 'reject' ? parsed.decision : 'revise'; - return { - decision, - feedback: - typeof parsed.feedback === 'string' ? parsed.feedback : t('common.fallback.noSummary'), - issues: Array.isArray(parsed.issues) - ? parsed.issues.filter((v: unknown): v is string => typeof v === 'string') - : [], - suggestions: Array.isArray(parsed.suggestions) - ? parsed.suggestions.filter((v: unknown): v is string => typeof v === 'string') - : [], - }; - } catch { - return null; - } -} - -function extractReviewerFromText(text: string): ReviewResult { - const lower = text.toLowerCase(); - const decision = lower.includes('approve') - ? 'approve' - : lower.includes('reject') - ? 'reject' - : 'revise'; - return { - decision, - feedback: extractSummary(text), - issues: [], - suggestions: [], - }; -} - -function findJsonObject(text: string, marker: string): string | null { - const idx = text.indexOf(marker); - if (idx < 0) return null; - - const start = text.lastIndexOf('{', idx); - if (start < 0) return null; - - let depth = 0; - for (let i = start; i < text.length; i++) { - if (text[i] === '{') depth++; - if (text[i] === '}') { - depth--; - if (depth === 0) { - return text.slice(start, i + 1); - } - } - } - return null; -} - -// Detect a real failure declaration, not incidental "error"/"fail" prose (see gpt.ts). -function isExplicitFailure(text: string): boolean { - if (/"success"\s*:\s*false/i.test(text)) return true; - return /\b(failed to|unable to|could not|couldn['’]t|cannot (?:complete|finish|proceed|continue)|giving up|abort(?:ed|ing))\b/i.test(text); -} - -function extractSummary(text: string): string { - const lines = text.split('\n').filter((l) => l.trim().length > 10); - if (lines.length === 0) return t('common.fallback.noSummary'); - const summary = lines[0].trim(); - return summary.length > 200 ? `${summary.slice(0, 200)}...` : summary; -} - -function extractErrorMessage(text: string): string { - const errorMatch = text.match(/(?:error|exception|failed?):\s*(.+)/i); - if (errorMatch) return errorMatch[1].slice(0, 200); - const lines = text.split('\n').filter((l) => /error|fail/i.test(l)); - return lines.length > 0 ? lines[0].slice(0, 200) : 'Unknown error'; -} +// Worker/Reviewer output parsing lives in ./resultParsing.ts (shared with the +// gpt, local, and codex adapters — INT-1441). diff --git a/src/adapters/resultParsing.ts b/src/adapters/resultParsing.ts new file mode 100644 index 0000000..cd16f3a --- /dev/null +++ b/src/adapters/resultParsing.ts @@ -0,0 +1,148 @@ +// ============================================ +// OpenSwarm - Shared adapter result parsing +// ============================================ +// +// Worker/Reviewer result extraction shared by the gpt, local, and openrouter +// adapters. These three adapters each ran a byte-for-byte copy of the same +// eight functions; the copies had already drifted in formatting and comment +// wording (a latent correctness risk if one copy were fixed and the others +// not). This module is the single source of truth — each adapter delegates to +// `parseWorkerResult` / `parseReviewerResult`. + +import type { WorkerResult, ReviewResult } from './types.js'; +import { t } from '../locale/index.js'; + +/** JSON-first worker parse: fenced ```json block, else a `"success"`-anchored object. */ +function extractWorkerResultJson(text: string): WorkerResult | null { + const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/); + const jsonStr = jsonMatch?.[1] ?? findJsonObject(text, '"success"'); + if (!jsonStr) return null; + + try { + const parsed = JSON.parse(jsonStr); + return { + success: Boolean(parsed.success), + summary: parsed.summary || t('common.fallback.noSummary'), + filesChanged: Array.isArray(parsed.filesChanged) ? parsed.filesChanged : [], + commands: Array.isArray(parsed.commands) ? parsed.commands : [], + output: text, + error: parsed.error, + confidencePercent: + typeof parsed.confidencePercent === 'number' ? parsed.confidencePercent : undefined, + haltReason: parsed.haltReason || undefined, + }; + } catch { + return null; + } +} + +/** Text fallback when no JSON result is present. */ +function extractWorkerFromText(text: string): WorkerResult { + // Only an explicit failure phrase marks the run as failed. Loose words like + // "error" or "fail" appear in normal coding prose ("error handling", "the + // failing test") and used to cause false negatives. git-diff promotion in + // worker.ts is the real success signal; this is just the non-repo fallback. + const failed = isExplicitFailure(text); + + return { + success: !failed, + summary: extractSummary(text), + filesChanged: [], + commands: [], + output: text, + error: failed ? extractErrorMessage(text) : undefined, + }; +} + +/** JSON-first reviewer parse: fenced ```json block, else a `"decision"`-anchored object. */ +function extractReviewerResultJson(text: string): ReviewResult | null { + const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/); + const jsonStr = jsonMatch?.[1] ?? findJsonObject(text, '"decision"'); + if (!jsonStr) return null; + + try { + const parsed = JSON.parse(jsonStr); + const decision = + parsed.decision === 'approve' || parsed.decision === 'reject' ? parsed.decision : 'revise'; + return { + decision, + feedback: + typeof parsed.feedback === 'string' ? parsed.feedback : t('common.fallback.noSummary'), + issues: Array.isArray(parsed.issues) + ? parsed.issues.filter((v: unknown): v is string => typeof v === 'string') + : [], + suggestions: Array.isArray(parsed.suggestions) + ? parsed.suggestions.filter((v: unknown): v is string => typeof v === 'string') + : [], + }; + } catch { + return null; + } +} + +/** Text fallback when no JSON reviewer result is present. */ +function extractReviewerFromText(text: string): ReviewResult { + const lower = text.toLowerCase(); + const decision = lower.includes('approve') + ? 'approve' + : lower.includes('reject') + ? 'reject' + : 'revise'; + return { + decision, + feedback: extractSummary(text), + issues: [], + suggestions: [], + }; +} + +/** Brace-balanced scan for the JSON object containing `marker`. */ +function findJsonObject(text: string, marker: string): string | null { + const idx = text.indexOf(marker); + if (idx < 0) return null; + + const start = text.lastIndexOf('{', idx); + if (start < 0) return null; + + let depth = 0; + for (let i = start; i < text.length; i++) { + if (text[i] === '{') depth++; + if (text[i] === '}') { + depth--; + if (depth === 0) { + return text.slice(start, i + 1); + } + } + } + return null; +} + +/** Detect a real failure declaration, not incidental "error"/"fail" prose. */ +function isExplicitFailure(text: string): boolean { + if (/"success"\s*:\s*false/i.test(text)) return true; + return /\b(failed to|unable to|could not|couldn['’]t|cannot (?:complete|finish|proceed|continue)|giving up|abort(?:ed|ing))\b/i.test(text); +} + +function extractSummary(text: string): string { + const lines = text.split('\n').filter((l) => l.trim().length > 10); + if (lines.length === 0) return t('common.fallback.noSummary'); + const summary = lines[0].trim(); + return summary.length > 200 ? `${summary.slice(0, 200)}...` : summary; +} + +function extractErrorMessage(text: string): string { + const errorMatch = text.match(/(?:error|exception|failed?):\s*(.+)/i); + if (errorMatch) return errorMatch[1].slice(0, 200); + const lines = text.split('\n').filter((l) => /error|fail/i.test(l)); + return lines.length > 0 ? lines[0].slice(0, 200) : 'Unknown error'; +} + +/** JSON-first with text fallback — the canonical worker-output parse. */ +export function parseWorkerResult(text: string): WorkerResult { + return extractWorkerResultJson(text) ?? extractWorkerFromText(text); +} + +/** JSON-first with text fallback — the canonical reviewer-output parse. */ +export function parseReviewerResult(text: string): ReviewResult { + return extractReviewerResultJson(text) ?? extractReviewerFromText(text); +} From d6a5e359bb3302a78473e6be51e079dea234ff90 Mon Sep 17 00:00:00 2001 From: unohee Date: Thu, 18 Jun 2026 19:09:11 +0900 Subject: [PATCH 4/5] fix(adapters,memory): bound read cache + fix distillation type contract - tools.ts: the loop-scoped read cache was an unbounded Map (megabytes over an 80-turn run). Add a 64-entry LRU (cacheGet bumps to MRU, cacheSet evicts the oldest). New test covers eviction. - memoryCore.ts: saveMemory let distillation silently override the caller's explicit type (system_pattern/constraint -> belief), escapable only via skipDistillation. New contract: distillation refines the type only when !isVerified; a verified explicit type is authoritative. - agenticLoop.ts: document that an empty final answer intentionally triggers the salvage turn (not an accidental extra call). Full suite 692/692. Refs: INT-1442 --- src/adapters/agenticLoop.ts | 3 +++ src/adapters/tools.test.ts | 19 +++++++++++++++++++ src/adapters/tools.ts | 35 ++++++++++++++++++++++++++++++++--- src/memory/memoryCore.ts | 10 ++++++++-- 4 files changed, 62 insertions(+), 5 deletions(-) diff --git a/src/adapters/agenticLoop.ts b/src/adapters/agenticLoop.ts index ae2eb76..a2349b9 100644 --- a/src/adapters/agenticLoop.ts +++ b/src/adapters/agenticLoop.ts @@ -310,6 +310,9 @@ export async function runAgenticLoop(options: AgenticLoopOptions): Promise 0) { onLog?.('▸ Final answer turn (no tools) — loop ended without a final message'); messages.push({ diff --git a/src/adapters/tools.test.ts b/src/adapters/tools.test.ts index 1ae7daf..0fe00d5 100644 --- a/src/adapters/tools.test.ts +++ b/src/adapters/tools.test.ts @@ -345,6 +345,25 @@ describe('ReadCache', () => { expect(head.content).toContain('line1'); expect(tail.content).toContain('line11'); }); + + it('evicts the least-recently-used entry once the bound (64) is exceeded', async () => { + const filePath = path.join(TMP_DIR, 'cache-lru.txt'); + await fs.writeFile(filePath, Array.from({ length: 80 }, (_, i) => `row${i + 1}`).join('\n') + '\n'); + const cache = createReadCache(); + + // 65 distinct ranges (offset 0..64, limit 1) — one past the 64-entry cap, + // so the first read (offset 0, the LRU) must be evicted. + for (let off = 0; off <= 64; off++) { + await executeTool(makeCall('read_file', { path: filePath, offset: off, limit: 1 }), TMP_DIR, cache); + } + + // The evicted entry re-reads from disk (no "unchanged" marker)... + const evicted = await executeTool(makeCall('read_file', { path: filePath, offset: 0, limit: 1 }), TMP_DIR, cache); + expect(evicted.content).not.toContain('unchanged'); + // ...while a recently-read entry is still cached. + const recent = await executeTool(makeCall('read_file', { path: filePath, offset: 64, limit: 1 }), TMP_DIR, cache); + expect(recent.content).toContain('unchanged since last read'); + }); }); // ────────────────────────────────────────────── diff --git a/src/adapters/tools.ts b/src/adapters/tools.ts index d4b59cd..1e39dd9 100644 --- a/src/adapters/tools.ts +++ b/src/adapters/tools.ts @@ -144,7 +144,14 @@ export interface ToolResult { * (모델이 edit 후 "고쳐졌나?" 확인하려 재read하는 패턴) 디스크를 다시 읽지 않고 * 캐시된 내용 + "변경 없음" 힌트를 반환해 토큰·턴 낭비를 줄인다. * edit_file/write_file 성공 시 해당 경로를 무효화해 stale read를 막는다. + * + * LRU-bounded: a single 80-turn SWE run reading many offsets of large files + * could otherwise retain megabytes of numbered content for the whole loop. + * The Map preserves insertion order, so eviction drops the least-recently-used + * key once MAX_READ_CACHE_ENTRIES is exceeded. */ +const MAX_READ_CACHE_ENTRIES = 64; + export interface ReadCache { store: Map; } @@ -153,6 +160,27 @@ export function createReadCache(): ReadCache { return { store: new Map() }; } +/** Cache read that bumps the key to most-recently-used. */ +function cacheGet(cache: ReadCache, key: string): string | undefined { + const value = cache.store.get(key); + if (value === undefined) return undefined; + // Re-insert to move to the end (MRU) so eviction targets truly-old entries. + cache.store.delete(key); + cache.store.set(key, value); + return value; +} + +/** Cache write with LRU eviction once the entry cap is exceeded. */ +function cacheSet(cache: ReadCache, key: string, value: string): void { + cache.store.delete(key); + cache.store.set(key, value); + while (cache.store.size > MAX_READ_CACHE_ENTRIES) { + const oldest = cache.store.keys().next().value; + if (oldest === undefined) break; + cache.store.delete(oldest); + } +} + /** 캐시에서 한 파일의 모든 범위 엔트리를 제거 (edit/write 후 stale 방지) */ function invalidateCache(cache: ReadCache | undefined, filePath: string): void { if (!cache) return; @@ -223,10 +251,11 @@ export async function executeTool( // 같은 루프에서 이미 같은 범위를 읽었으면 디스크 재접근 없이 캐시 반환. // 모델에게 "변경 없음"을 알려 추가 확인 read를 유도하지 않는다. - if (cache?.store.has(cacheKey)) { + const cached = cache ? cacheGet(cache, cacheKey) : undefined; + if (cached !== undefined) { return { tool_call_id: callId, - content: `(unchanged since last read — cached)\n${cache.store.get(cacheKey)!}`, + content: `(unchanged since last read — cached)\n${cached}`, is_error: false, }; } @@ -239,7 +268,7 @@ export async function executeTool( ? `\n... (${lines.length - offset - limit} more lines)` : ''; const result = numbered + truncated; - cache?.store.set(cacheKey, result); + if (cache) cacheSet(cache, cacheKey, result); return { tool_call_id: callId, content: result, is_error: false }; } diff --git a/src/memory/memoryCore.ts b/src/memory/memoryCore.ts index 424b8ca..ab79d9b 100644 --- a/src/memory/memoryCore.ts +++ b/src/memory/memoryCore.ts @@ -532,8 +532,14 @@ export async function saveMemory( return null; } - // Use distillation-suggested type if more appropriate - if (distillation.type !== type && isCognitiveType(distillation.type)) { + // Distillation may refine the type for UNVERIFIED content — a best-effort + // classification the caller didn't firmly assert. But when the caller marks + // the memory isVerified, its explicit type is authoritative and must not be + // overridden (e.g. system_pattern / constraint silently downgraded to + // belief). Previously the only escape was skipDistillation, a trap for any + // caller that didn't know the flag (see repoKnowledge.ts). Contract: explicit + // type + isVerified wins; unverified content is still auto-refined. + if (!options?.isVerified && distillation.type !== type && isCognitiveType(distillation.type)) { console.log(`[Memory] Type adjusted by distillation: ${type} → ${distillation.type}`); type = distillation.type; } From 5d61641f0c85557daac910eee1a8a10d0b3c730f Mon Sep 17 00:00:00 2001 From: unohee Date: Thu, 18 Jun 2026 19:09:20 +0900 Subject: [PATCH 5/5] fix(issues): deterministic event order via rowid tiebreaker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit getEvents/getRecentEvents ordered by created_at DESC alone, but created_at is ms-precision TEXT — a created event and an immediate addEvent landing in the same millisecond ordered non-deterministically, intermittently failing the issueStore event test. Add rowid DESC as a tiebreaker (monotonic with insertion order), making the order total. No schema change. Isolated 25/25 and full suite 692/692 across repeated runs. Refs: INT-1437 --- src/issues/sqliteStore.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/issues/sqliteStore.ts b/src/issues/sqliteStore.ts index dad8e84..937a221 100644 --- a/src/issues/sqliteStore.ts +++ b/src/issues/sqliteStore.ts @@ -497,14 +497,19 @@ export class SqliteIssueStore implements IIssueStore { } getEvents(issueId: string, limit = 50): IssueEvent[] { + // rowid DESC is the tiebreaker: created_at is ms-precision TEXT, so events + // written in the same millisecond (e.g. createIssue's 'created' + an + // immediate addEvent) would otherwise order non-deterministically. rowid is + // monotonic with insertion order, so the newest event always sorts first. return (this.db.prepare( - 'SELECT * FROM issue_events WHERE issue_id = ? ORDER BY created_at DESC LIMIT ?' + 'SELECT * FROM issue_events WHERE issue_id = ? ORDER BY created_at DESC, rowid DESC LIMIT ?' ).all(issueId, limit) as any[]).map(this.rowToEvent); } getRecentEvents(limit = 20): IssueEvent[] { + // rowid DESC tiebreaker for same-millisecond created_at — see getEvents. return (this.db.prepare( - 'SELECT * FROM issue_events ORDER BY created_at DESC LIMIT ?' + 'SELECT * FROM issue_events ORDER BY created_at DESC, rowid DESC LIMIT ?' ).all(limit) as any[]).map(this.rowToEvent); }