diff --git a/internal/daemon/hooks_test.go b/internal/daemon/hooks_test.go index 74ef52d8..189d85fd 100644 --- a/internal/daemon/hooks_test.go +++ b/internal/daemon/hooks_test.go @@ -1039,9 +1039,20 @@ func TestHandleEventNoLogWhenNoHooksMatch(t *testing.T) { } func TestWaitUntilIdle_ConcurrentEvents(t *testing.T) { - t.Parallel() // A dedicated stress test proving WaitUntilIdle waits past the event-processing boundary // under timing races and concurrent broadcasts. + if runtime.GOOS != "windows" { + t.Parallel() + } + + iterations := 50 + numEvents := 10 + if runtime.GOOS == "windows" { + // Each hook starts a PowerShell process on Windows. Keep the test broad + // enough to exercise concurrent broadcasts without exhausting CI time. + iterations = 5 + numEvents = 4 + } tmpDir := t.TempDir() @@ -1053,11 +1064,10 @@ func TestWaitUntilIdle_ConcurrentEvents(t *testing.T) { }, } - for i := range 50 { + for i := range iterations { hr, broadcaster := setupRunner(t, cfg) var wg sync.WaitGroup - numEvents := 10 for j := range numEvents { wg.Add(1) diff --git a/internal/prompt/templates/claude-code_review.md.gotmpl b/internal/prompt/templates/claude-code_review.md.gotmpl index f7984dc0..6ce4dfa5 100644 --- a/internal/prompt/templates/claude-code_review.md.gotmpl +++ b/internal/prompt/templates/claude-code_review.md.gotmpl @@ -11,6 +11,7 @@ Do NOT build the project, run the test suite, or execute the code while reviewin - **Location**: File and line number where possible - **Problem**: Concise description of the issue - **Fix**: Brief suggested fix +- Separate multiple findings with `---` on its own line. - After the findings, add `## Summary` with a single sentence summarizing the change. - If you find no issues, start immediately with `No issues found.` and then add `Summary: `. diff --git a/internal/prompt/templates/codex_review.md.gotmpl b/internal/prompt/templates/codex_review.md.gotmpl index ba96ab6a..957ae239 100644 --- a/internal/prompt/templates/codex_review.md.gotmpl +++ b/internal/prompt/templates/codex_review.md.gotmpl @@ -12,6 +12,7 @@ Do NOT search or read files outside the repository checkout, except for an expli - **Location**: File and line number where possible - **Problem**: Concise description of the issue - **Fix**: Brief suggested fix +- Separate multiple findings with `---` on its own line. - After the findings, add `## Summary` with a single sentence summarizing the change. - If you find no issues, start immediately with `No issues found.` and then add `Summary: `. diff --git a/internal/prompt/templates/default_dirty.md.gotmpl b/internal/prompt/templates/default_dirty.md.gotmpl index 8433e492..9e3a6d86 100644 --- a/internal/prompt/templates/default_dirty.md.gotmpl +++ b/internal/prompt/templates/default_dirty.md.gotmpl @@ -23,6 +23,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/templates/default_range.md.gotmpl b/internal/prompt/templates/default_range.md.gotmpl index c378ec37..f0291fb4 100644 --- a/internal/prompt/templates/default_range.md.gotmpl +++ b/internal/prompt/templates/default_range.md.gotmpl @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/templates/default_review.md.gotmpl b/internal/prompt/templates/default_review.md.gotmpl index 436b7faf..71684684 100644 --- a/internal/prompt/templates/default_review.md.gotmpl +++ b/internal/prompt/templates/default_review.md.gotmpl @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/templates/default_security.md.gotmpl b/internal/prompt/templates/default_security.md.gotmpl index 8e2d8dbd..4643eca6 100644 --- a/internal/prompt/templates/default_security.md.gotmpl +++ b/internal/prompt/templates/default_security.md.gotmpl @@ -24,6 +24,7 @@ For each finding, provide: - File and line reference - The specific code path an attacker would exploit and what they gain - Suggested remediation +- Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file-level when the issue spans a range) and describe a plausible exploit path. The severity must match the exploitability you described. Drop any finding that fails these checks. diff --git a/internal/prompt/templates/gemini_review.md.gotmpl b/internal/prompt/templates/gemini_review.md.gotmpl index 47816269..77343aac 100644 --- a/internal/prompt/templates/gemini_review.md.gotmpl +++ b/internal/prompt/templates/gemini_review.md.gotmpl @@ -13,6 +13,7 @@ Do NOT build the project, run the test suite, or execute the code while reviewin - **Location**: File and line number - **Problem**: Concise description - **Fix**: Brief suggested fix + - Separate multiple findings with `---` on its own line. - If no issues are found, state "No issues found." ## Review Criteria diff --git a/internal/prompt/templates_test.go b/internal/prompt/templates_test.go index c6813433..c59d3837 100644 --- a/internal/prompt/templates_test.go +++ b/internal/prompt/templates_test.go @@ -109,6 +109,33 @@ func TestGetSystemPrompt_Fallbacks(t *testing.T) { } } +func TestGetSystemPrompt_ReviewPromptsRequireFindingSeparators(t *testing.T) { + fixedTime := time.Date(2030, 6, 15, 0, 0, 0, 0, time.UTC) + mockNow := func() time.Time { return fixedTime } + + tests := []struct { + name string + agent string + command string + }{ + {name: "default review", agent: "test", command: "review"}, + {name: "default dirty", agent: "test", command: "dirty"}, + {name: "default range", agent: "test", command: "range"}, + {name: "default security", agent: "test", command: "security"}, + {name: "claude review", agent: "claude-code", command: "review"}, + {name: "codex review", agent: "codex", command: "review"}, + {name: "gemini review", agent: "gemini", command: "review"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := getSystemPrompt(tt.agent, tt.command, mockNow) + + assert.Contains(t, got, "Separate multiple findings with `---` on its own line.") + }) + } +} + func TestGetSystemPrompt_DateInjection(t *testing.T) { fixedTime := time.Date(2030, 6, 15, 0, 0, 0, 0, time.UTC) fixedDateStr := "Current date: 2030-06-15 (UTC)" diff --git a/internal/prompt/testdata/golden/dirty_review.golden b/internal/prompt/testdata/golden/dirty_review.golden index 40b84451..f5063a7d 100644 --- a/internal/prompt/testdata/golden/dirty_review.golden +++ b/internal/prompt/testdata/golden/dirty_review.golden @@ -23,6 +23,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/testdata/golden/dirty_truncated.golden b/internal/prompt/testdata/golden/dirty_truncated.golden index bd53021d..f73289ab 100644 --- a/internal/prompt/testdata/golden/dirty_truncated.golden +++ b/internal/prompt/testdata/golden/dirty_truncated.golden @@ -23,6 +23,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. @@ -112,9 +113,6 @@ index 0000000..1111111 +a line of content +a line of content +a line of content -+a line of content -+a line of content -+a line of content -+a line of conte ++a line of co ... (truncated) ``` diff --git a/internal/prompt/testdata/golden/previous_reviews_with_comments.golden b/internal/prompt/testdata/golden/previous_reviews_with_comments.golden index 245a4910..e832ed5d 100644 --- a/internal/prompt/testdata/golden/previous_reviews_with_comments.golden +++ b/internal/prompt/testdata/golden/previous_reviews_with_comments.golden @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/testdata/golden/range_truncated.golden b/internal/prompt/testdata/golden/range_truncated.golden index 37f9c708..b5ec0203 100644 --- a/internal/prompt/testdata/golden/range_truncated.golden +++ b/internal/prompt/testdata/golden/range_truncated.golden @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/testdata/golden/range_truncated_codex_in_range.golden b/internal/prompt/testdata/golden/range_truncated_codex_in_range.golden index 9910db58..e55fc6f4 100644 --- a/internal/prompt/testdata/golden/range_truncated_codex_in_range.golden +++ b/internal/prompt/testdata/golden/range_truncated_codex_in_range.golden @@ -12,6 +12,7 @@ Do NOT search or read files outside the repository checkout, except for an expli - **Location**: File and line number where possible - **Problem**: Concise description of the issue - **Fix**: Brief suggested fix +- Separate multiple findings with `---` on its own line. - After the findings, add `## Summary` with a single sentence summarizing the change. - If you find no issues, start immediately with `No issues found.` and then add `Summary: `. diff --git a/internal/prompt/testdata/golden/range_with_in_range_reviews.golden b/internal/prompt/testdata/golden/range_with_in_range_reviews.golden index 8fcb3916..cd59035f 100644 --- a/internal/prompt/testdata/golden/range_with_in_range_reviews.golden +++ b/internal/prompt/testdata/golden/range_with_in_range_reviews.golden @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/testdata/golden/security_review.golden b/internal/prompt/testdata/golden/security_review.golden index 0718d11b..8e12e32c 100644 --- a/internal/prompt/testdata/golden/security_review.golden +++ b/internal/prompt/testdata/golden/security_review.golden @@ -24,6 +24,7 @@ For each finding, provide: - File and line reference - The specific code path an attacker would exploit and what they gain - Suggested remediation +- Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file-level when the issue spans a range) and describe a plausible exploit path. The severity must match the exploitability you described. Drop any finding that fails these checks. diff --git a/internal/prompt/testdata/golden/single_review_claude_code.golden b/internal/prompt/testdata/golden/single_review_claude_code.golden index 916ba48c..de4d06fb 100644 --- a/internal/prompt/testdata/golden/single_review_claude_code.golden +++ b/internal/prompt/testdata/golden/single_review_claude_code.golden @@ -11,6 +11,7 @@ Do NOT build the project, run the test suite, or execute the code while reviewin - **Location**: File and line number where possible - **Problem**: Concise description of the issue - **Fix**: Brief suggested fix +- Separate multiple findings with `---` on its own line. - After the findings, add `## Summary` with a single sentence summarizing the change. - If you find no issues, start immediately with `No issues found.` and then add `Summary: `. diff --git a/internal/prompt/testdata/golden/single_review_codex.golden b/internal/prompt/testdata/golden/single_review_codex.golden index a01036c6..e6ecdf22 100644 --- a/internal/prompt/testdata/golden/single_review_codex.golden +++ b/internal/prompt/testdata/golden/single_review_codex.golden @@ -12,6 +12,7 @@ Do NOT search or read files outside the repository checkout, except for an expli - **Location**: File and line number where possible - **Problem**: Concise description of the issue - **Fix**: Brief suggested fix +- Separate multiple findings with `---` on its own line. - After the findings, add `## Summary` with a single sentence summarizing the change. - If you find no issues, start immediately with `No issues found.` and then add `Summary: `. diff --git a/internal/prompt/testdata/golden/single_review_default.golden b/internal/prompt/testdata/golden/single_review_default.golden index ffda1445..0aaee175 100644 --- a/internal/prompt/testdata/golden/single_review_default.golden +++ b/internal/prompt/testdata/golden/single_review_default.golden @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/testdata/golden/single_review_gemini.golden b/internal/prompt/testdata/golden/single_review_gemini.golden index 248d5e77..5830a6bd 100644 --- a/internal/prompt/testdata/golden/single_review_gemini.golden +++ b/internal/prompt/testdata/golden/single_review_gemini.golden @@ -13,6 +13,7 @@ Do NOT build the project, run the test suite, or execute the code while reviewin - **Location**: File and line number - **Problem**: Concise description - **Fix**: Brief suggested fix + - Separate multiple findings with `---` on its own line. - If no issues are found, state "No issues found." ## Review Criteria diff --git a/internal/prompt/testdata/golden/single_truncated_diff.golden b/internal/prompt/testdata/golden/single_truncated_diff.golden index 6472ce6f..262b831b 100644 --- a/internal/prompt/testdata/golden/single_truncated_diff.golden +++ b/internal/prompt/testdata/golden/single_truncated_diff.golden @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/testdata/golden/single_truncated_diff_codex.golden b/internal/prompt/testdata/golden/single_truncated_diff_codex.golden index 7f184a47..f9949d86 100644 --- a/internal/prompt/testdata/golden/single_truncated_diff_codex.golden +++ b/internal/prompt/testdata/golden/single_truncated_diff_codex.golden @@ -12,6 +12,7 @@ Do NOT search or read files outside the repository checkout, except for an expli - **Location**: File and line number where possible - **Problem**: Concise description of the issue - **Fix**: Brief suggested fix +- Separate multiple findings with `---` on its own line. - After the findings, add `## Summary` with a single sentence summarizing the change. - If you find no issues, start immediately with `No issues found.` and then add `Summary: `. diff --git a/internal/prompt/testdata/golden/single_with_additional_context.golden b/internal/prompt/testdata/golden/single_with_additional_context.golden index 23c18cea..735ab246 100644 --- a/internal/prompt/testdata/golden/single_with_additional_context.golden +++ b/internal/prompt/testdata/golden/single_with_additional_context.golden @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/testdata/golden/single_with_guidelines.golden b/internal/prompt/testdata/golden/single_with_guidelines.golden index 8980aea4..a84bd532 100644 --- a/internal/prompt/testdata/golden/single_with_guidelines.golden +++ b/internal/prompt/testdata/golden/single_with_guidelines.golden @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/testdata/golden/single_with_previous_reviews.golden b/internal/prompt/testdata/golden/single_with_previous_reviews.golden index 7db51f9f..8c6f067f 100644 --- a/internal/prompt/testdata/golden/single_with_previous_reviews.golden +++ b/internal/prompt/testdata/golden/single_with_previous_reviews.golden @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks. diff --git a/internal/prompt/testdata/golden/single_with_severity_filter.golden b/internal/prompt/testdata/golden/single_with_severity_filter.golden index 26af6854..d5200555 100644 --- a/internal/prompt/testdata/golden/single_with_severity_filter.golden +++ b/internal/prompt/testdata/golden/single_with_severity_filter.golden @@ -28,6 +28,7 @@ After reviewing, provide: - File and line reference where possible - What specifically goes wrong if this is not fixed (concrete harm, not "violates best practices") - Suggested fix + - Separate multiple findings with `---` on its own line. Before finalizing, verify your review: every finding must reference the narrowest applicable location (line number when possible, file or diff-level when the issue is an omission or spans a range), the severity must match the impact you described, and no two findings should contradict each other. Drop any finding that fails these checks.