From 4823415234d27d9bd76f796e600bba3131c70114 Mon Sep 17 00:00:00 2001 From: Ian Ker-Seymer Date: Tue, 7 Apr 2026 14:05:45 -0400 Subject: [PATCH 1/2] Ensure summary step will retry failed tests --- ruby/lib/minitest/queue/runner.rb | 10 ++ ruby/test/integration/minitest_redis_test.rb | 120 +++++++++++++++++++ 2 files changed, 130 insertions(+) diff --git a/ruby/lib/minitest/queue/runner.rb b/ruby/lib/minitest/queue/runner.rb index 5f924ecd..9fa58611 100644 --- a/ruby/lib/minitest/queue/runner.rb +++ b/ruby/lib/minitest/queue/runner.rb @@ -71,6 +71,16 @@ def run_command puts "Retrying failed tests." self.queue = retry_queue end + elsif retry? && !queue.rescue_connection_errors { queue.build.failed_tests }.empty? + # Automatic Buildkite retry (BUILDKITE_RETRY_TYPE=automatic): the main queue is + # exhausted but error_reports from the previous run still exist. Re-run failed + # tests so the report step can succeed if they pass this time. + reset_counters + retry_queue = queue.retry_queue + unless retry_queue.exhausted? + puts "Retrying failed tests." + self.queue = retry_queue + end end queue.rescue_connection_errors { queue.created_at = CI::Queue.time_now.to_f } diff --git a/ruby/test/integration/minitest_redis_test.rb b/ruby/test/integration/minitest_redis_test.rb index cf78a7a2..c0b691ac 100644 --- a/ruby/test/integration/minitest_redis_test.rb +++ b/ruby/test/integration/minitest_redis_test.rb @@ -650,6 +650,126 @@ def test_automatic_retry assert_equal 'All tests were ran already', output end + def test_automatic_retry_reruns_failed_tests_so_report_can_succeed + # Simulate the scenario from the Sam O'Brien / Steph Sachrajda incident: + # A test fails on the first run (flaky infra timeout). Buildkite auto-retries + # the step (LSO). The retried worker finds the main queue exhausted and exits 0, + # making the step green. But error_reports persist in Redis, so the separate + # "Post Merge Testing Summary" (report command) still fails. + # + # Expected fix: automatic retry should re-run failed tests (retry_queue), + # not silently exit 0 when there are unresolved failures. + + # First run: the flaky test fails (FLAKY_TEST_PASS not set → assert_equal '1', nil fails) + out, err = capture_subprocess_io do + system( + @exe, 'run', + '--queue', @redis_url, + '--seed', 'foobar', + '--build', '1', + '--worker', '1', + '--timeout', '1', + '-Itest', + 'test/flaky_test.rb', + chdir: 'test/fixtures/', + ) + end + assert_empty filter_deprecation_warnings(err) + output = normalize(out.lines.last.strip) + assert_equal 'Ran 2 tests, 2 assertions, 1 failures, 0 errors, 0 skips, 0 requeues in X.XXs', output + + # Buildkite auto-retries the step. The flaky condition is resolved (FLAKY_TEST_PASS=1). + # BUG: with BUILDKITE_RETRY_TYPE=automatic, manual_retry? returns false, so the runner + # skips retry_queue and finds the main queue exhausted → exits 0 without re-running + # the failed test. Error report stays in Redis. + out, err = capture_subprocess_io do + system( + { 'BUILDKITE_RETRY_COUNT' => '1', 'BUILDKITE_RETRY_TYPE' => 'automatic', 'FLAKY_TEST_PASS' => '1' }, + @exe, 'run', + '--queue', @redis_url, + '--seed', 'foobar', + '--build', '1', + '--worker', '1', + '--timeout', '1', + '-Itest', + 'test/flaky_test.rb', + chdir: 'test/fixtures/', + ) + end + assert_empty filter_deprecation_warnings(err) + # After the fix, the automatic retry should re-run the failed test and report success. + # Currently this is "All tests were ran already" (the bug: test was NOT re-run). + assert_match(/Retrying failed tests/, out) + + # The report step runs after all workers complete. After the fix, the failed test + # passed on automatic retry so error_reports should be empty → report succeeds. + out, err = capture_subprocess_io do + system( + @exe, 'report', + '--queue', @redis_url, + '--build', '1', + '--timeout', '1', + chdir: 'test/fixtures/', + ) + end + assert_empty filter_deprecation_warnings(err) + assert_match(/0 failures/, normalize(out)) + end + + def test_automatic_retry_report_still_fails_when_test_keeps_failing + # Inverse of test_automatic_retry_reruns_failed_tests_so_report_can_succeed. + # When the failing test also fails on automatic retry, the report must still + # report the failure — we must not incorrectly suppress errors. + + # First run: flaky test fails + out, err = capture_subprocess_io do + system( + @exe, 'run', + '--queue', @redis_url, + '--seed', 'foobar', + '--build', '1', + '--worker', '1', + '--timeout', '1', + '-Itest', + 'test/flaky_test.rb', + chdir: 'test/fixtures/', + ) + end + assert_empty filter_deprecation_warnings(err) + output = normalize(out.lines.last.strip) + assert_equal 'Ran 2 tests, 2 assertions, 1 failures, 0 errors, 0 skips, 0 requeues in X.XXs', output + + # Automatic retry: FLAKY_TEST_PASS still not set → test fails again on retry + out, err = capture_subprocess_io do + system( + { 'BUILDKITE_RETRY_COUNT' => '1', 'BUILDKITE_RETRY_TYPE' => 'automatic' }, + @exe, 'run', + '--queue', @redis_url, + '--seed', 'foobar', + '--build', '1', + '--worker', '1', + '--timeout', '1', + '-Itest', + 'test/flaky_test.rb', + chdir: 'test/fixtures/', + ) + end + assert_empty filter_deprecation_warnings(err) + + # Report must still fail — the test failed on retry too + out, err = capture_subprocess_io do + system( + @exe, 'report', + '--queue', @redis_url, + '--build', '1', + '--timeout', '1', + chdir: 'test/fixtures/', + ) + end + assert_empty filter_deprecation_warnings(err) + assert_match(/1 failures/, normalize(out)) + end + def test_retry_fails_when_test_run_is_expired out, err = capture_subprocess_io do system( From 631e4414de60943019e853a9df49aeac4ff69be5 Mon Sep 17 00:00:00 2001 From: Ian Ker-Seymer Date: Tue, 7 Apr 2026 14:09:14 -0400 Subject: [PATCH 2/2] Bump to v0.87.0 --- ruby/Gemfile.lock | 2 +- ruby/lib/ci/queue/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ruby/Gemfile.lock b/ruby/Gemfile.lock index b7883774..6f0f5cc8 100644 --- a/ruby/Gemfile.lock +++ b/ruby/Gemfile.lock @@ -1,7 +1,7 @@ PATH remote: . specs: - ci-queue (0.86.0) + ci-queue (0.87.0) logger GEM diff --git a/ruby/lib/ci/queue/version.rb b/ruby/lib/ci/queue/version.rb index 90519e6b..b142f74c 100644 --- a/ruby/lib/ci/queue/version.rb +++ b/ruby/lib/ci/queue/version.rb @@ -2,7 +2,7 @@ module CI module Queue - VERSION = '0.86.0' + VERSION = '0.87.0' DEV_SCRIPTS_ROOT = ::File.expand_path('../../../../../redis', __FILE__) RELEASE_SCRIPTS_ROOT = ::File.expand_path('../redis', __FILE__) end