From e0e493bbf33f0627ed22f7afe6eb03f61cab2f0a Mon Sep 17 00:00:00 2001 From: Sam-Si <13261099+Sam-Si@users.noreply.github.com> Date: Sun, 19 Apr 2026 19:52:03 +0530 Subject: [PATCH 01/10] refactor: replace deprecated local_resources flag with local_cpu_resources and local_ram_resources in CI workflows --- .github/workflows/linux_ci.yml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index c8d8722..72c2040 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -44,7 +44,8 @@ jobs: run: | bazel build //... \ --jobs=4 \ - --local_resources=cpu=2,memory=4096 \ + --local_cpu_resources=2 \ + --local_ram_resources=4096 \ --host_jvm_args=-Xmx2g \ --verbose_failures @@ -52,7 +53,8 @@ jobs: run: | bazel test //... \ --jobs=4 \ - --local_resources=cpu=2,memory=4096 \ + --local_cpu_resources=2 \ + --local_ram_resources=4096 \ --host_jvm_args=-Xmx2g \ --test_output=all \ --verbose_failures @@ -61,7 +63,8 @@ jobs: run: | bazel test --config=tsan //... \ --jobs=2 \ - --local_resources=cpu=2,memory=4096 \ + --local_cpu_resources=2 \ + --local_ram_resources=4096 \ --host_jvm_args=-Xmx2g \ --test_output=all \ --verbose_failures @@ -70,7 +73,8 @@ jobs: run: | bazel test --config=asan //... \ --jobs=2 \ - --local_resources=cpu=2,memory=4096 \ + --local_cpu_resources=2 \ + --local_ram_resources=4096 \ --host_jvm_args=-Xmx2g \ --test_output=all \ --verbose_failures @@ -79,7 +83,8 @@ jobs: run: | bazel test --config=msan //... 
\ --jobs=2 \ - --local_resources=cpu=2,memory=4096 \ + --local_cpu_resources=2 \ + --local_ram_resources=4096 \ --host_jvm_args=-Xmx2g \ --test_output=all \ --verbose_failures From a141f2c998a1b4a30a6c0ef12e5a2d2f779d2168 Mon Sep 17 00:00:00 2001 From: Sam-Si <13261099+Sam-Si@users.noreply.github.com> Date: Sun, 19 Apr 2026 20:14:04 +0530 Subject: [PATCH 02/10] refactor: centralize Bazel CI configurations into a dedicated bazelrc file --- .github/workflows/ci.bazelrc | 19 +++++++++++++++ .github/workflows/linux_ci.yml | 42 +++++++--------------------------- 2 files changed, 27 insertions(+), 34 deletions(-) create mode 100644 .github/workflows/ci.bazelrc diff --git a/.github/workflows/ci.bazelrc b/.github/workflows/ci.bazelrc new file mode 100644 index 0000000..f464f37 --- /dev/null +++ b/.github/workflows/ci.bazelrc @@ -0,0 +1,19 @@ +# ============================================================ +# DCodeX: GitHub Actions CI Specialized Config +# Optimized for free-tier runners (2 CPUs, 7GB RAM) +# ============================================================ + +# 1. MEMORY LIMITS +# Limit Bazel server heap to 2GB to leave room for Clang/LLD +startup --host_jvm_args=-Xmx2g + +# 2. RESOURCE THROTTLING +# Limit parallel execution to avoid OOM and CPU contention +build --local_cpu_resources=2 +build --local_ram_resources=4096 +build --jobs=4 + +# 3. OBSERVABILITY +# Ensure full output in CI logs +test --test_output=all +build --verbose_failures diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 72c2040..84bdfeb 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -42,52 +42,26 @@ jobs: - name: Bazel Build run: | - bazel build //... \ - --jobs=4 \ - --local_cpu_resources=2 \ - --local_ram_resources=4096 \ - --host_jvm_args=-Xmx2g \ - --verbose_failures + bazel --bazelrc=.github/workflows/ci.bazelrc build //... - name: Bazel Test (Standard) run: | - bazel test //... 
\ - --jobs=4 \ - --local_cpu_resources=2 \ - --local_ram_resources=4096 \ - --host_jvm_args=-Xmx2g \ - --test_output=all \ - --verbose_failures + bazel --bazelrc=.github/workflows/ci.bazelrc test //... - name: Bazel Test (TSan) run: | - bazel test --config=tsan //... \ - --jobs=2 \ - --local_cpu_resources=2 \ - --local_ram_resources=4096 \ - --host_jvm_args=-Xmx2g \ - --test_output=all \ - --verbose_failures + bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan //... \ + --jobs=2 - name: Bazel Test (ASan) run: | - bazel test --config=asan //... \ - --jobs=2 \ - --local_cpu_resources=2 \ - --local_ram_resources=4096 \ - --host_jvm_args=-Xmx2g \ - --test_output=all \ - --verbose_failures + bazel --bazelrc=.github/workflows/ci.bazelrc test --config=asan //... \ + --jobs=2 - name: Bazel Test (MSan) run: | - bazel test --config=msan //... \ - --jobs=2 \ - --local_cpu_resources=2 \ - --local_ram_resources=4096 \ - --host_jvm_args=-Xmx2g \ - --test_output=all \ - --verbose_failures + bazel --bazelrc=.github/workflows/ci.bazelrc test --config=msan //... \ + --jobs=2 - name: Upload Test Logs on Failure if: failure() From c772ee4a0e67269987b9d0edd52471477b06b437 Mon Sep 17 00:00:00 2001 From: Sam-Si <13261099+Sam-Si@users.noreply.github.com> Date: Sun, 19 Apr 2026 22:26:11 +0530 Subject: [PATCH 03/10] feat: configure MSan build settings and suppression rules for CI --- .github/workflows/ci.bazelrc | 6 ++++++ .github/workflows/msan_suppressions.txt | 11 +++++++++++ 2 files changed, 17 insertions(+) create mode 100644 .github/workflows/msan_suppressions.txt diff --git a/.github/workflows/ci.bazelrc b/.github/workflows/ci.bazelrc index f464f37..09157f1 100644 --- a/.github/workflows/ci.bazelrc +++ b/.github/workflows/ci.bazelrc @@ -17,3 +17,9 @@ build --jobs=4 # Ensure full output in CI logs test --test_output=all build --verbose_failures + +# 4. 
SANITIZER HARDENING +# Use suppressions to ignore system library false positives +test:msan --action_env=MSAN_OPTIONS="halt_on_error=1:exitcode=77:suppressions=.github/workflows/msan_suppressions.txt" +# Recommended for LLVM 19+ +build:msan --copt=-fsanitize-memory-param-retval diff --git a/.github/workflows/msan_suppressions.txt b/.github/workflows/msan_suppressions.txt new file mode 100644 index 0000000..238243a --- /dev/null +++ b/.github/workflows/msan_suppressions.txt @@ -0,0 +1,11 @@ +# MSan suppressions for DCodeX +# These ignore false positives from the non-instrumented system libstdc++ +# and googletest internals. + +# Ignore googletest internal message formatting +interceptor_via_lib:libstdc++.so.6 +interceptor_via_fun:testing::Message::Message +interceptor_via_fun:testing::internal::* + +# Ignore specific system library calls that touch uninstrumented memory +interceptor_via_lib:libc.so.6 From 77bc7d7ca20099fe11e19f6ebe0f4a33a4c5af07 Mon Sep 17 00:00:00 2001 From: Sam-Si <13261099+Sam-Si@users.noreply.github.com> Date: Sun, 19 Apr 2026 22:32:22 +0530 Subject: [PATCH 04/10] feat: add TSan proof-of-detection test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a deliberately buggy BuggyMetricsCollector that mirrors DCodeX's real DynamicWorkerCoordinator::Metrics pattern. It contains intentional data races (unprotected read-modify-write on shared counters and a TOCTOU race in GetCacheHitRate) that are invisible to normal testing but caught deterministically by ThreadSanitizer. The test is tagged 'manual' so it never runs in //..., and a new CI step ('TSan Proof-of-Detection') runs it under --config=tsan and EXPECTS failure. If TSan fails to detect the race, the CI itself fails — proving the sanitizer pipeline is broken. 
Verified locally: - Without TSan: PASSED (race is invisible) - With TSan: FAILED with 'ThreadSanitizer: data race' (race caught) --- .github/workflows/linux_ci.yml | 33 ++++++ src/engine/BUILD | 19 +++ src/engine/tsan_proof_test.cc | 207 +++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 src/engine/tsan_proof_test.cc diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 84bdfeb..308beff 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -53,6 +53,39 @@ jobs: bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan //... \ --jobs=2 + - name: "🔬 TSan Proof-of-Detection" + run: | + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo " PROOF: Running deliberately buggy code under TSan." + echo " TSan MUST detect the data race for this step to pass." + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + set +e + bazel --bazelrc=.github/workflows/ci.bazelrc test \ + --config=tsan \ + --runs_per_test=1 \ + --test_output=all \ + //src/engine:tsan_proof_test 2>&1 | tee /tmp/tsan_proof.log + EXIT_CODE=$? + set -e + + if [ "$EXIT_CODE" -eq 0 ]; then + echo "" + echo "::error::❌ CRITICAL: TSan did NOT detect the deliberate data race!" + echo "::error::The sanitizer pipeline is BROKEN. Investigate immediately." + exit 1 + fi + + if grep -q "ThreadSanitizer" /tmp/tsan_proof.log; then + echo "" + echo "✅ ThreadSanitizer correctly detected the data race." + echo " The sanitizer pipeline is verified and operational." + echo " Your codebase is protected against concurrency bugs." + else + echo "" + echo "::error::Test failed but NOT due to TSan detection. Check logs." + exit 1 + fi + - name: Bazel Test (ASan) run: | bazel --bazelrc=.github/workflows/ci.bazelrc test --config=asan //... 
\ diff --git a/src/engine/BUILD b/src/engine/BUILD index 33c044b..9035596 100755 --- a/src/engine/BUILD +++ b/src/engine/BUILD @@ -232,6 +232,25 @@ cc_test( ], ) +# ── TSan Proof-of-Detection ────────────────────────────────────────── +# This test contains DELIBERATE data races. It is tagged "manual" so it +# is excluded from `bazel test //...`. The CI runs it in a special step +# that EXPECTS TSan to detect the race and exit with code 66. If TSan +# does NOT detect the race, the CI step fails — proving the sanitizer +# pipeline is broken. +cc_test( + name = "tsan_proof_test", + srcs = ["tsan_proof_test.cc"], + copts = ["-std=c++23"], + linkstatic = True, + linkopts = ["-pthread"], + tags = ["manual", "no-sandbox", "exclusive"], + deps = [ + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + test_suite( name = "concurrency_tests", tests = [ diff --git a/src/engine/tsan_proof_test.cc b/src/engine/tsan_proof_test.cc new file mode 100644 index 0000000..8b95014 --- /dev/null +++ b/src/engine/tsan_proof_test.cc @@ -0,0 +1,207 @@ +// Copyright 2024 DCodeX Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+// =====================================================================
+// TSan PROOF-OF-DETECTION Test
+// =====================================================================
+//
+// PURPOSE:
+//   This file contains a DELIBERATELY BUGGY implementation of a pattern
+//   that is common in high-performance systems like DCodeX: a metrics
+//   collector where a developer removed mutex protection for "performance",
+//   introducing a subtle data race.
+//
+// WHY IT'S SUBTLE:
+//   On x86-64, aligned 64-bit reads and writes are naturally atomic at
+//   the hardware level. This means that in normal testing — even with
+//   hundreds of threads — the race almost never manifests as a wrong
+//   answer. The program "works fine" in dev, in staging, and even in
+//   production for weeks. Then, one day, the compiler reorders a store
+//   past a load, or the CPU's store buffer batches two writes, and you
+//   get a corrupted metrics snapshot served to a monitoring dashboard.
+//
+// WHAT THIS PROVES:
+//   ThreadSanitizer instruments every memory access at compile time and
+//   detects the happens-before violation DETERMINISTICALLY, regardless
+//   of whether the hardware would actually reorder. This test is run in
+//   CI with --config=tsan and is EXPECTED TO FAIL with exit code 66.
+//   If TSan does NOT detect the race, the CI step fails — proving the
+//   sanitizer pipeline itself is broken.
+//
+// DO NOT FIX THE BUGS IN THIS FILE. THEY ARE INTENTIONAL.
+// =====================================================================
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <thread>
+#include <vector>
+
+namespace {
+
+// =====================================================================
+// DELIBERATELY BUGGY: Unprotected Execution Metrics Collector
+//
+// This mirrors the real DCodeX pattern:
+//   DynamicWorkerCoordinator::Metrics → MetricsSnapshot
+//   GetMetrics()                      → GetSnapshot()
+//   completed_requests_               → total_executions_
+//
+// The real code uses absl::Mutex + ABSL_GUARDED_BY correctly.
+// This "optimized" version removes the lock. The race is on the
+// compound read-modify-write of the shared struct fields.
+// =====================================================================
+
+struct MetricsSnapshot {
+  int64_t total_executions;
+  int64_t cache_hits;
+  int64_t cache_misses;
+  double cumulative_latency_ms;
+  double avg_latency_ms;
+};
+
+class BuggyMetricsCollector {
+ public:
+  BuggyMetricsCollector()
+      : total_executions_(0),
+        cache_hits_(0),
+        cache_misses_(0),
+        cumulative_latency_ms_(0.0) {}
+
+  // BUG: Multiple threads call this concurrently without synchronization.
+  // The individual increments LOOK atomic on x86, but:
+  //   1) They are not atomic in the C++ memory model.
+  //   2) The compound operation (read + modify + write) on
+  //      cumulative_latency_ms_ is NEVER atomic, even on x86.
+  //   3) The cache_hit branch means different threads take different
+  //      code paths, increasing the window for interleaving.
+  void RecordExecution(double latency_ms, bool cache_hit) {
+    total_executions_++;                   // DATA RACE
+    cumulative_latency_ms_ += latency_ms;  // DATA RACE (compound RMW)
+
+    if (cache_hit) {
+      cache_hits_++;  // DATA RACE
+    } else {
+      cache_misses_++;  // DATA RACE
+    }
+  }
+
+  // BUG: TOCTOU race — reads of total_executions_ and cumulative_latency_ms_
+  //      are not atomic with respect to each other. Writers can run between
+  //      our read of total_executions_ (taken first) and our read of
+  //      cumulative_latency_ms_, so the snapshot may pair a count of N with
+  //      a latency sum that covers more (or fewer) than N executions.
+  MetricsSnapshot GetSnapshot() const {
+    MetricsSnapshot snap;
+    snap.total_executions = total_executions_;            // DATA RACE (read)
+    snap.cache_hits = cache_hits_;                        // DATA RACE (read)
+    snap.cache_misses = cache_misses_;                    // DATA RACE (read)
+    snap.cumulative_latency_ms = cumulative_latency_ms_;  // DATA RACE (read)
+
+    // TOCTOU: total_executions_ may have changed between the reads above.
+    if (snap.total_executions > 0) {
+      snap.avg_latency_ms =
+          snap.cumulative_latency_ms / static_cast<double>(snap.total_executions);
+    } else {
+      snap.avg_latency_ms = 0.0;
+    }
+    return snap;
+  }
+
+  double GetCacheHitRate() const {
+    int64_t total = total_executions_;  // DATA RACE (read)
+    if (total == 0) return 0.0;
+    int64_t hits = cache_hits_;  // DATA RACE (read, TOCTOU with above)
+    return static_cast<double>(hits) / static_cast<double>(total);
+  }
+
+ private:
+  // Shared mutable state with NO synchronization.
+  // In the real DCodeX code, these would be protected by absl::Mutex
+  // or std::atomic. Here, they are deliberately unprotected.
+  int64_t total_executions_;
+  int64_t cache_hits_;
+  int64_t cache_misses_;
+  double cumulative_latency_ms_;
+};
+
+}  // namespace
+
+// =====================================================================
+// TEST: Exercises the race with realistic DCodeX-like traffic patterns.
+//
+// Without TSan: Passes on x86 (hardware hides the race).
+// With TSan:    Reports the race; by default the binary exits with code 66.
+// =====================================================================
+TEST(TsanProof, CatchesMetricsCollectorRace) {
+  BuggyMetricsCollector collector;
+
+  constexpr int kWriterThreads = 6;
+  constexpr int kReaderThreads = 2;
+  constexpr int kOpsPerWriter = 5000;
+  constexpr int kReadsPerReader = 2000;
+
+  std::atomic<bool> start{false};
+  std::vector<std::thread> threads;
+  threads.reserve(kWriterThreads + kReaderThreads);
+
+  // --- Writer threads (simulate concurrent gRPC Execute requests) ---
+  for (int t = 0; t < kWriterThreads; ++t) {
+    threads.emplace_back([&collector, &start, t]() {
+      while (!start.load(std::memory_order_acquire)) {
+        // spin-wait for synchronized start
+      }
+      for (int i = 0; i < kOpsPerWriter; ++i) {
+        double latency = 1.0 + static_cast<double>(i % 50) * 0.1;
+        bool cache_hit = ((i + t) % 3) != 0;  // ~67% hit rate
+        collector.RecordExecution(latency, cache_hit);
+      }
+    });
+  }
+
+  // --- Reader threads (simulate monitoring/GetSystemMetrics RPCs) ---
+  for (int r = 0; r < kReaderThreads; ++r) {
+    threads.emplace_back([&collector, &start]() {
+      while (!start.load(std::memory_order_acquire)) {
+        // spin-wait for synchronized start
+      }
+      volatile double sink = 0.0;
+      for (int i = 0; i < kReadsPerReader; ++i) {
+        MetricsSnapshot snap = collector.GetSnapshot();
+        double hit_rate = collector.GetCacheHitRate();
+
+        // Consume values to prevent compiler optimization.
+        // We do NOT assert here — the values may be torn/inconsistent
+        // due to the race, and that's the whole point. Only TSan
+        // should report the error.
+        sink += snap.avg_latency_ms + hit_rate;
+      }
+      (void)sink;
+    });
+  }
+
+  // Release all threads simultaneously to maximize contention.
+  start.store(true, std::memory_order_release);
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  // Without TSan, this test passes — the values may be slightly off, but
+  // no assertion checks them during the race. With TSan's default
+  // halt_on_error=0 this line is still reached; the failure is the exit code.
+  MetricsSnapshot final_snap = collector.GetSnapshot();
+  EXPECT_GT(final_snap.total_executions, 0);
+}
From d30d25f578e57cbd746f87f039695399a8824a84 Mon Sep 17 00:00:00 2001 From: Sam-Si <13261099+Sam-Si@users.noreply.github.com> Date: Mon, 20 Apr 2026 07:50:50 +0530 Subject: [PATCH 05/10] fix: resolve CI bash pipefail and bazel flag deprecation warnings * Refactor the TSan proof step in CI to avoid bash pipefail conflicts when capturing Bazel exit codes. * Read the test log directly from bazel-testlogs to reliably detect the ThreadSanitizer data race footprint. * Update ci.bazelrc to use the non-deprecated local_resources=cpu=N syntax for Bazel 7.x. --- .github/workflows/ci.bazelrc | 4 ++-- .github/workflows/linux_ci.yml | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.bazelrc b/.github/workflows/ci.bazelrc index 09157f1..7ac6878 100644 --- a/.github/workflows/ci.bazelrc +++ b/.github/workflows/ci.bazelrc @@ -9,8 +9,8 @@ startup --host_jvm_args=-Xmx2g # 2. RESOURCE THROTTLING # Limit parallel execution to avoid OOM and CPU contention -build --local_cpu_resources=2 -build --local_ram_resources=4096 +build --local_resources=cpu=2 +build --local_resources=memory=4096 build --jobs=4 # 3. OBSERVABILITY diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 308beff..22f81a1 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -59,23 +59,22 @@ jobs: echo " PROOF: Running deliberately buggy code under TSan." echo " TSan MUST detect the data race for this step to pass." echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - set +e + + EXIT_CODE=0 bazel --bazelrc=.github/workflows/ci.bazelrc test \ --config=tsan \ --runs_per_test=1 \ - --test_output=all \ - //src/engine:tsan_proof_test 2>&1 | tee /tmp/tsan_proof.log - EXIT_CODE=$? - set -e + --test_output=errors \ + //src/engine:tsan_proof_test || EXIT_CODE=$?
if [ "$EXIT_CODE" -eq 0 ]; then - echo "" echo "::error::❌ CRITICAL: TSan did NOT detect the deliberate data race!" echo "::error::The sanitizer pipeline is BROKEN. Investigate immediately." exit 1 fi - if grep -q "ThreadSanitizer" /tmp/tsan_proof.log; then + TEST_LOG="bazel-testlogs/src/engine/tsan_proof_test/test.log" + if [ -f "$TEST_LOG" ] && grep -q "ThreadSanitizer" "$TEST_LOG"; then echo "" echo "✅ ThreadSanitizer correctly detected the data race." echo " The sanitizer pipeline is verified and operational." @@ -83,6 +82,7 @@ jobs: else echo "" echo "::error::Test failed but NOT due to TSan detection. Check logs." + cat "$TEST_LOG" || true exit 1 fi From 0bb332b24c2dd4b14260d4fb45e8ca466769321b Mon Sep 17 00:00:00 2001 From: Sam-Si <13261099+Sam-Si@users.noreply.github.com> Date: Tue, 28 Apr 2026 23:34:37 +0530 Subject: [PATCH 06/10] ci: retrigger pipeline From f48b6b48fae8dbb8d7ca21938e8a7c608046b67d Mon Sep 17 00:00:00 2001 From: Sam-Si <13261099+Sam-Si@users.noreply.github.com> Date: Wed, 29 Apr 2026 00:21:27 +0530 Subject: [PATCH 07/10] fix: exclude sandbox_test from TSan to prevent OOM on CI sandbox_test forks clang++ and compiled binaries as child processes. Under TSan (8x memory overhead), these forked processes exhaust the 7GB CI runner memory, causing 'out of memory: failed to allocate TracePart' errors and spurious exit code 66 failures. Tagged sandbox_test with 'no-tsan' and added --test_tag_filters=-no-tsan to the CI TSan step. sandbox_test still runs under Standard, ASan, and MSan configurations. --- .github/workflows/linux_ci.yml | 3 ++- src/engine/BUILD | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 22f81a1..904b12b 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -51,7 +51,8 @@ jobs: - name: Bazel Test (TSan) run: | bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan //... 
\ - --jobs=2 + --jobs=2 \ + --test_tag_filters=-no-tsan - name: "🔬 TSan Proof-of-Detection" run: | diff --git a/src/engine/BUILD b/src/engine/BUILD index 9035596..481663d 100755 --- a/src/engine/BUILD +++ b/src/engine/BUILD @@ -206,6 +206,10 @@ cc_test( timeout = "moderate", linkstatic = True, linkopts = ["-pthread"], + # Excluded from TSan runs: this test forks clang++ and compiled binaries. + # TSan-instrumenting those child processes causes OOM on CI runners + # and provides no concurrency testing value. + tags = ["no-tsan"], deps = [ ":sandbox", ":dynamic_worker_coordinator", From eccba992b0f6d96995dc3dc25d5193d4f048d24a Mon Sep 17 00:00:00 2001 From: Sam-Si <13261099+Sam-Si@users.noreply.github.com> Date: Wed, 29 Apr 2026 00:32:37 +0530 Subject: [PATCH 08/10] fix: run sandbox_test under TSan with constrained resources (Option D) Instead of excluding sandbox_test from TSan entirely, split the TSan step into two: 1. Main TSan step: runs all tests except sandbox_test (20 iterations) 2. Constrained TSan step: runs sandbox_test with --runs_per_test=1 and --local_test_jobs=1 to prevent OOM from forked clang++ Also reduce CI log noise: - test_output=errors (only print output for failing tests) - noannounce_rc (suppress the full bazelrc option dump) - show_progress_rate_limit=5 (throttle progress updates) - curses=no (disable terminal control sequences) --- .github/workflows/ci.bazelrc | 11 ++++++++--- .github/workflows/linux_ci.yml | 13 +++++++++++-- src/engine/BUILD | 7 +++---- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.bazelrc b/.github/workflows/ci.bazelrc index 7ac6878..fed2ae4 100644 --- a/.github/workflows/ci.bazelrc +++ b/.github/workflows/ci.bazelrc @@ -13,10 +13,15 @@ build --local_resources=cpu=2 build --local_resources=memory=4096 build --jobs=4 -# 3. OBSERVABILITY -# Ensure full output in CI logs -test --test_output=all +# 3. 
QUIET LOGGING +# Only print output for FAILED tests (not every passing test) +test --test_output=errors build --verbose_failures +# Suppress the full option dump on every command +common --noannounce_rc +# Collapse progress into summary lines +build --show_progress_rate_limit=5 +build --curses=no # 4. SANITIZER HARDENING # Use suppressions to ignore system library false positives diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 904b12b..aff2d9e 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -50,9 +50,18 @@ jobs: - name: Bazel Test (TSan) run: | - bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan //... \ + bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan \ + //... \ --jobs=2 \ - --test_tag_filters=-no-tsan + --test_tag_filters=-no-sandbox-tsan + + - name: Bazel Test (TSan - Sandbox, constrained) + run: | + bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan \ + //src/engine:sandbox_test \ + --jobs=1 \ + --runs_per_test=1 \ + --local_test_jobs=1 - name: "🔬 TSan Proof-of-Detection" run: | diff --git a/src/engine/BUILD b/src/engine/BUILD index 481663d..403523d 100755 --- a/src/engine/BUILD +++ b/src/engine/BUILD @@ -206,10 +206,9 @@ cc_test( timeout = "moderate", linkstatic = True, linkopts = ["-pthread"], - # Excluded from TSan runs: this test forks clang++ and compiled binaries. - # TSan-instrumenting those child processes causes OOM on CI runners - # and provides no concurrency testing value. - tags = ["no-tsan"], + # Run under TSan separately with constrained resources (see CI workflow). + # This test forks clang++ which has high memory overhead under TSan. 
+ tags = ["no-sandbox-tsan"], deps = [ ":sandbox", ":dynamic_worker_coordinator", From eac187a8e17cb2053aab455ff354df1d36db37a0 Mon Sep 17 00:00:00 2001 From: Sam-Si <13261099+Sam-Si@users.noreply.github.com> Date: Wed, 29 Apr 2026 23:23:32 +0530 Subject: [PATCH 09/10] fix: redirect bazel output to file for TSan proof grep The test log symlink lives at .bazel/testlogs/ (due to --symlink_prefix=.bazel/) not bazel-testlogs/. Instead of guessing the symlink path, capture bazel's stdout+stderr to /tmp/tsan_proof_output.log and grep that directly. --- .github/workflows/linux_ci.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index aff2d9e..a8598d7 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -74,8 +74,8 @@ jobs: bazel --bazelrc=.github/workflows/ci.bazelrc test \ --config=tsan \ --runs_per_test=1 \ - --test_output=errors \ - //src/engine:tsan_proof_test || EXIT_CODE=$? + --test_output=all \ + //src/engine:tsan_proof_test > /tmp/tsan_proof_output.log 2>&1 || EXIT_CODE=$? if [ "$EXIT_CODE" -eq 0 ]; then echo "::error::❌ CRITICAL: TSan did NOT detect the deliberate data race!" @@ -83,16 +83,15 @@ jobs: exit 1 fi - TEST_LOG="bazel-testlogs/src/engine/tsan_proof_test/test.log" - if [ -f "$TEST_LOG" ] && grep -q "ThreadSanitizer" "$TEST_LOG"; then + if grep -q "ThreadSanitizer: data race" /tmp/tsan_proof_output.log; then echo "" echo "✅ ThreadSanitizer correctly detected the data race." echo " The sanitizer pipeline is verified and operational." echo " Your codebase is protected against concurrency bugs." else echo "" - echo "::error::Test failed but NOT due to TSan detection. Check logs." - cat "$TEST_LOG" || true + echo "::error::Test failed but NOT due to TSan detection. 
Check logs:" + cat /tmp/tsan_proof_output.log exit 1 fi From c344ec844cd0f494f3d2dc5655b8be4932a017fc Mon Sep 17 00:00:00 2001 From: Sam-Si <13261099+Sam-Si@users.noreply.github.com> Date: Thu, 30 Apr 2026 10:16:34 +0530 Subject: [PATCH 10/10] =?UTF-8?q?fix:=20remove=20MSan=20from=20CI=20?= =?UTF-8?q?=E2=80=94=20requires=20instrumented=20libc++?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSan fundamentally requires ALL linked libraries to be compiled with MSan instrumentation. The system libstdc++ on GitHub Actions runners is not instrumented, causing every test to fail in googletest's static initialization (testing::Message::Message) before any DCodeX code even executes. TSan and ASan remain — they work correctly with non-instrumented system libraries. The MSan config is preserved in .bazelrc for local use with a custom toolchain. --- .github/workflows/linux_ci.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index a8598d7..ccf2f74 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -100,10 +100,11 @@ jobs: bazel --bazelrc=.github/workflows/ci.bazelrc test --config=asan //... \ --jobs=2 - - name: Bazel Test (MSan) - run: | - bazel --bazelrc=.github/workflows/ci.bazelrc test --config=msan //... \ - --jobs=2 + # MSan is intentionally excluded from CI. It requires ALL linked libraries + # (including libstdc++) to be compiled with MSan instrumentation. The system + # libstdc++ on GitHub runners is not instrumented, producing false positives + # in googletest before any DCodeX code even runs. To use MSan, build a + # custom toolchain with instrumented libc++ (see: https://clang.llvm.org/docs/MemorySanitizer.html#handling-external-code). - name: Upload Test Logs on Failure if: failure()