From e0e493bbf33f0627ed22f7afe6eb03f61cab2f0a Mon Sep 17 00:00:00 2001
From: Sam-Si <13261099+Sam-Si@users.noreply.github.com>
Date: Sun, 19 Apr 2026 19:52:03 +0530
Subject: [PATCH 01/10] refactor: replace deprecated local_resources flag with
 local_cpu_resources and local_ram_resources in CI workflows

---
 .github/workflows/linux_ci.yml | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index c8d8722..72c2040 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -44,7 +44,8 @@ jobs:
       run: |
         bazel build //... \
           --jobs=4 \
-          --local_resources=cpu=2,memory=4096 \
+          --local_cpu_resources=2 \
+          --local_ram_resources=4096 \
           --host_jvm_args=-Xmx2g \
           --verbose_failures
 
@@ -52,7 +53,8 @@ jobs:
       run: |
         bazel test //... \
           --jobs=4 \
-          --local_resources=cpu=2,memory=4096 \
+          --local_cpu_resources=2 \
+          --local_ram_resources=4096 \
           --host_jvm_args=-Xmx2g \
           --test_output=all \
           --verbose_failures
@@ -61,7 +63,8 @@ jobs:
       run: |
         bazel test --config=tsan //... \
           --jobs=2 \
-          --local_resources=cpu=2,memory=4096 \
+          --local_cpu_resources=2 \
+          --local_ram_resources=4096 \
           --host_jvm_args=-Xmx2g \
           --test_output=all \
           --verbose_failures
@@ -70,7 +73,8 @@ jobs:
       run: |
         bazel test --config=asan //... \
           --jobs=2 \
-          --local_resources=cpu=2,memory=4096 \
+          --local_cpu_resources=2 \
+          --local_ram_resources=4096 \
           --host_jvm_args=-Xmx2g \
           --test_output=all \
           --verbose_failures
@@ -79,7 +83,8 @@ jobs:
       run: |
         bazel test --config=msan //... \
           --jobs=2 \
-          --local_resources=cpu=2,memory=4096 \
+          --local_cpu_resources=2 \
+          --local_ram_resources=4096 \
           --host_jvm_args=-Xmx2g \
           --test_output=all \
           --verbose_failures

From a141f2c998a1b4a30a6c0ef12e5a2d2f779d2168 Mon Sep 17 00:00:00 2001
From: Sam-Si <13261099+Sam-Si@users.noreply.github.com>
Date: Sun, 19 Apr 2026 20:14:04 +0530
Subject: [PATCH 02/10] refactor: centralize Bazel CI configurations into a
 dedicated bazelrc file

---
 .github/workflows/ci.bazelrc   | 19 +++++++++++++++
 .github/workflows/linux_ci.yml | 42 +++++++---------------------------
 2 files changed, 27 insertions(+), 34 deletions(-)
 create mode 100644 .github/workflows/ci.bazelrc

diff --git a/.github/workflows/ci.bazelrc b/.github/workflows/ci.bazelrc
new file mode 100644
index 0000000..f464f37
--- /dev/null
+++ b/.github/workflows/ci.bazelrc
@@ -0,0 +1,19 @@
+# ============================================================
+# DCodeX: GitHub Actions CI Specialized Config
+# Optimized for free-tier runners (2 CPUs, 7GB RAM)
+# ============================================================
+
+# 1. MEMORY LIMITS
+# Limit Bazel server heap to 2GB to leave room for Clang/LLD
+startup --host_jvm_args=-Xmx2g
+
+# 2. RESOURCE THROTTLING
+# Limit parallel execution to avoid OOM and CPU contention
+build --local_cpu_resources=2
+build --local_ram_resources=4096
+build --jobs=4
+
+# 3. OBSERVABILITY
+# Ensure full output in CI logs
+test --test_output=all
+build --verbose_failures
diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index 72c2040..84bdfeb 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -42,52 +42,26 @@ jobs:
 
     - name: Bazel Build
       run: |
-        bazel build //... \
-          --jobs=4 \
-          --local_cpu_resources=2 \
-          --local_ram_resources=4096 \
-          --host_jvm_args=-Xmx2g \
-          --verbose_failures
+        bazel --bazelrc=.github/workflows/ci.bazelrc build //...
 
     - name: Bazel Test (Standard)
       run: |
-        bazel test //... \
-          --jobs=4 \
-          --local_cpu_resources=2 \
-          --local_ram_resources=4096 \
-          --host_jvm_args=-Xmx2g \
-          --test_output=all \
-          --verbose_failures
+        bazel --bazelrc=.github/workflows/ci.bazelrc test //...
 
     - name: Bazel Test (TSan)
       run: |
-        bazel test --config=tsan //... \
-          --jobs=2 \
-          --local_cpu_resources=2 \
-          --local_ram_resources=4096 \
-          --host_jvm_args=-Xmx2g \
-          --test_output=all \
-          --verbose_failures
+        bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan //... \
+          --jobs=2
 
     - name: Bazel Test (ASan)
       run: |
-        bazel test --config=asan //... \
-          --jobs=2 \
-          --local_cpu_resources=2 \
-          --local_ram_resources=4096 \
-          --host_jvm_args=-Xmx2g \
-          --test_output=all \
-          --verbose_failures
+        bazel --bazelrc=.github/workflows/ci.bazelrc test --config=asan //... \
+          --jobs=2
 
     - name: Bazel Test (MSan)
       run: |
-        bazel test --config=msan //... \
-          --jobs=2 \
-          --local_cpu_resources=2 \
-          --local_ram_resources=4096 \
-          --host_jvm_args=-Xmx2g \
-          --test_output=all \
-          --verbose_failures
+        bazel --bazelrc=.github/workflows/ci.bazelrc test --config=msan //... \
+          --jobs=2
 
     - name: Upload Test Logs on Failure
       if: failure()

From c772ee4a0e67269987b9d0edd52471477b06b437 Mon Sep 17 00:00:00 2001
From: Sam-Si <13261099+Sam-Si@users.noreply.github.com>
Date: Sun, 19 Apr 2026 22:26:11 +0530
Subject: [PATCH 03/10] feat: configure MSan build settings and suppression
 rules for CI

---
 .github/workflows/ci.bazelrc            |  6 ++++++
 .github/workflows/msan_suppressions.txt | 11 +++++++++++
 2 files changed, 17 insertions(+)
 create mode 100644 .github/workflows/msan_suppressions.txt

diff --git a/.github/workflows/ci.bazelrc b/.github/workflows/ci.bazelrc
index f464f37..09157f1 100644
--- a/.github/workflows/ci.bazelrc
+++ b/.github/workflows/ci.bazelrc
@@ -17,3 +17,9 @@ build --jobs=4
 # Ensure full output in CI logs
 test --test_output=all
 build --verbose_failures
+
+# 4. SANITIZER HARDENING
+# Use suppressions to ignore system library false positives
+test:msan --action_env=MSAN_OPTIONS="halt_on_error=1:exitcode=77:suppressions=.github/workflows/msan_suppressions.txt"
+# Recommended for LLVM 19+
+build:msan --copt=-fsanitize-memory-param-retval
diff --git a/.github/workflows/msan_suppressions.txt b/.github/workflows/msan_suppressions.txt
new file mode 100644
index 0000000..238243a
--- /dev/null
+++ b/.github/workflows/msan_suppressions.txt
@@ -0,0 +1,11 @@
+# MSan suppressions for DCodeX
+# Intended to ignore false positives from the non-instrumented system libstdc++ and
+# googletest internals. NOTE(review): interceptor_via_* look like ASan suppression types — confirm MSan honors them.
+
+# Ignore googletest internal message formatting
+interceptor_via_lib:libstdc++.so.6
+interceptor_via_fun:testing::Message::Message
+interceptor_via_fun:testing::internal::*
+
+# Ignore specific system library calls that touch uninstrumented memory
+interceptor_via_lib:libc.so.6

From 77bc7d7ca20099fe11e19f6ebe0f4a33a4c5af07 Mon Sep 17 00:00:00 2001
From: Sam-Si <13261099+Sam-Si@users.noreply.github.com>
Date: Sun, 19 Apr 2026 22:32:22 +0530
Subject: [PATCH 04/10] feat: add TSan proof-of-detection test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a deliberately buggy BuggyMetricsCollector that mirrors DCodeX's
real DynamicWorkerCoordinator::Metrics pattern. It contains intentional
data races (unprotected read-modify-write on shared counters and a
TOCTOU race in GetCacheHitRate) that are invisible to normal testing
but caught deterministically by ThreadSanitizer.

The test is tagged 'manual' so it never runs in //..., and a new CI
step ('TSan Proof-of-Detection') runs it under --config=tsan and
EXPECTS failure. If TSan fails to detect the race, the CI itself
fails — proving the sanitizer pipeline is broken.

Verified locally:
  - Without TSan: PASSED (race is invisible)
  - With TSan: FAILED with 'ThreadSanitizer: data race' (race caught)
---
 .github/workflows/linux_ci.yml |  33 ++++++
 src/engine/BUILD               |  19 +++
 src/engine/tsan_proof_test.cc  | 207 +++++++++++++++++++++++++++++++++
 3 files changed, 259 insertions(+)
 create mode 100644 src/engine/tsan_proof_test.cc

diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index 84bdfeb..308beff 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -53,6 +53,39 @@ jobs:
         bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan //... \
           --jobs=2
 
+    - name: "🔬 TSan Proof-of-Detection"
+      run: |
+        echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+        echo "  PROOF: Running deliberately buggy code under TSan."
+        echo "  TSan MUST detect the data race for this step to pass."
+        echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+        set +e
+        bazel --bazelrc=.github/workflows/ci.bazelrc test \
+          --config=tsan \
+          --runs_per_test=1 \
+          --test_output=all \
+          //src/engine:tsan_proof_test 2>&1 | tee /tmp/tsan_proof.log
+        EXIT_CODE=$?
+        set -e
+
+        if [ "$EXIT_CODE" -eq 0 ]; then
+          echo ""
+          echo "::error::❌ CRITICAL: TSan did NOT detect the deliberate data race!"
+          echo "::error::The sanitizer pipeline is BROKEN. Investigate immediately."
+          exit 1
+        fi
+
+        if grep -q "ThreadSanitizer" /tmp/tsan_proof.log; then
+          echo ""
+          echo "✅ ThreadSanitizer correctly detected the data race."
+          echo "   The sanitizer pipeline is verified and operational."
+          echo "   Your codebase is protected against concurrency bugs."
+        else
+          echo ""
+          echo "::error::Test failed but NOT due to TSan detection. Check logs."
+          exit 1
+        fi
+
     - name: Bazel Test (ASan)
       run: |
         bazel --bazelrc=.github/workflows/ci.bazelrc test --config=asan //... \
diff --git a/src/engine/BUILD b/src/engine/BUILD
index 33c044b..9035596 100755
--- a/src/engine/BUILD
+++ b/src/engine/BUILD
@@ -232,6 +232,25 @@ cc_test(
     ],
 )
 
+# ── TSan Proof-of-Detection ──────────────────────────────────────────
+# This test contains DELIBERATE data races. It is tagged "manual" so it
+# is excluded from `bazel test //...`. The CI runs it in a special step
+# that EXPECTS the test to fail with a "ThreadSanitizer" report in the
+# output. If the test passes, or fails without such a report, the CI
+# step fails, signalling that the sanitizer pipeline is broken.
+cc_test(
+    name = "tsan_proof_test",
+    srcs = ["tsan_proof_test.cc"],
+    copts = ["-std=c++23"],
+    linkstatic = True,
+    linkopts = ["-pthread"],
+    tags = ["manual", "no-sandbox", "exclusive"],
+    deps = [
+        "@googletest//:gtest",
+        "@googletest//:gtest_main",
+    ],
+)
+
 test_suite(
     name = "concurrency_tests",
     tests = [
diff --git a/src/engine/tsan_proof_test.cc b/src/engine/tsan_proof_test.cc
new file mode 100644
index 0000000..8b95014
--- /dev/null
+++ b/src/engine/tsan_proof_test.cc
@@ -0,0 +1,207 @@
+// Copyright 2024 DCodeX Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// =====================================================================
+// TSan PROOF-OF-DETECTION Test
+// =====================================================================
+//
+// PURPOSE:
+//   This file contains a DELIBERATELY BUGGY implementation of a pattern
+//   that is common in high-performance systems like DCodeX: a metrics
+//   collector where a developer removed mutex protection for "performance",
+//   introducing a subtle data race.
+//
+// WHY IT'S SUBTLE:
+//   On x86-64, aligned 64-bit reads and writes are naturally atomic at
+//   the hardware level. This means that in normal testing — even with
+//   hundreds of threads — the race almost never manifests as a wrong
+//   answer. The program "works fine" in dev, in staging, and even in
+//   production for weeks. Then, one day, the compiler reorders a store
+//   past a load, or the CPU's store buffer batches two writes, and you
+//   get a corrupted metrics snapshot served to a monitoring dashboard.
+//
+// WHAT THIS PROVES:
+//   ThreadSanitizer instruments every memory access at compile time and
+//   detects the happens-before violation DETERMINISTICALLY, regardless
+//   of whether the hardware would actually reorder. This test is run in
+//   CI with --config=tsan and is EXPECTED TO FAIL with exit code 66.
+//   If TSan does NOT detect the race, the CI step fails — proving the
+//   sanitizer pipeline itself is broken.
+//
+// DO NOT FIX THE BUGS IN THIS FILE. THEY ARE INTENTIONAL.
+// =====================================================================
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cmath>
+#include <thread>
+#include <vector>
+
+namespace {
+
+// =====================================================================
+// DELIBERATELY BUGGY: Unprotected Execution Metrics Collector
+//
+// This mirrors the real DCodeX pattern:
+//   DynamicWorkerCoordinator::Metrics  → MetricsSnapshot
+//   GetMetrics()                       → GetSnapshot()
+//   completed_requests_               → total_executions_
+//
+// The real code uses absl::Mutex + ABSL_GUARDED_BY correctly.
+// This "optimized" version removes the lock. The race is on the
+// compound read-modify-write of the shared struct fields.
+// =====================================================================
+
+struct MetricsSnapshot {
+  int64_t total_executions;      // copy of total_executions_
+  int64_t cache_hits;            // copy of cache_hits_
+  int64_t cache_misses;          // copy of cache_misses_
+  double cumulative_latency_ms;  // copy of cumulative_latency_ms_
+  double avg_latency_ms;  // cumulative / total, or 0.0 when total is 0
+};
+
+class BuggyMetricsCollector {
+ public:
+  BuggyMetricsCollector()
+      : total_executions_(0),
+        cache_hits_(0),
+        cache_misses_(0),
+        cumulative_latency_ms_(0.0) {}
+
+  // BUG: Multiple threads call this concurrently without synchronization.
+  // Each update below may LOOK like a single step on x86, but:
+  //   1) None of them is atomic in the C++ memory model (a data race is UB).
+  //   2) Every ++ / += is a compound read + modify + write, so concurrent
+  //      updates can be lost, even on x86.
+  //   3) The cache_hit branch means different threads take different
+  //      code paths, increasing the window for interleaving.
+  void RecordExecution(double latency_ms, bool cache_hit) {
+    total_executions_++;                       // DATA RACE
+    cumulative_latency_ms_ += latency_ms;      // DATA RACE (compound RMW)
+
+    if (cache_hit) {
+      cache_hits_++;                           // DATA RACE
+    } else {
+      cache_misses_++;                         // DATA RACE
+    }
+  }
+
+  // BUG: TOCTOU race — reads of total_executions_ and cumulative_latency_ms_
+  // are not atomic with respect to each other. total_executions_ is read
+  // first and cumulative_latency_ms_ last, so writers can update either
+  // field in between, producing a snapshot whose latency sum does not
+  // match its count (e.g. the sum of more than N latencies divided by N).
+  MetricsSnapshot GetSnapshot() const {
+    MetricsSnapshot snap;
+    snap.total_executions = total_executions_;          // DATA RACE (read)
+    snap.cache_hits = cache_hits_;                      // DATA RACE (read)
+    snap.cache_misses = cache_misses_;                  // DATA RACE (read)
+    snap.cumulative_latency_ms = cumulative_latency_ms_;// DATA RACE (read)
+
+    // TOCTOU: writers may have updated the counters between the reads above.
+    if (snap.total_executions > 0) {
+      snap.avg_latency_ms =
+          snap.cumulative_latency_ms / static_cast<double>(snap.total_executions);
+    } else {
+      snap.avg_latency_ms = 0.0;
+    }
+    return snap;
+  }
+
+  double GetCacheHitRate() const {  // hits / total, or 0.0 when total is 0
+    int64_t total = total_executions_;   // DATA RACE (read)
+    if (total == 0) return 0.0;
+    int64_t hits = cache_hits_;          // DATA RACE (read, TOCTOU with above)
+    return static_cast<double>(hits) / static_cast<double>(total);
+  }
+
+ private:
+  // Shared mutable state with NO synchronization.
+  // In the real DCodeX code, these would be protected by absl::Mutex
+  // or std::atomic. Here, they are deliberately unprotected.
+  int64_t total_executions_;
+  int64_t cache_hits_;
+  int64_t cache_misses_;
+  double cumulative_latency_ms_;
+};
+
+}  // namespace
+
+// =====================================================================
+// TEST: Exercises the race with realistic DCodeX-like traffic patterns.
+//
+// Without TSan: Passes (nothing asserts on the racy values).
+// With TSan:    Fails once a race is reported (TSan's default exit code is 66).
+// =====================================================================
+TEST(TsanProof, CatchesMetricsCollectorRace) {
+  BuggyMetricsCollector collector;
+
+  constexpr int kWriterThreads = 6;
+  constexpr int kReaderThreads = 2;
+  constexpr int kOpsPerWriter = 5000;
+  constexpr int kReadsPerReader = 2000;
+
+  std::atomic<bool> start{false};
+  std::vector<std::thread> threads;
+  threads.reserve(kWriterThreads + kReaderThreads);
+
+  // --- Writer threads (simulate concurrent gRPC Execute requests) ---
+  for (int t = 0; t < kWriterThreads; ++t) {
+    threads.emplace_back([&collector, &start, t]() {
+      while (!start.load(std::memory_order_acquire)) {
+        // spin-wait for synchronized start
+      }
+      for (int i = 0; i < kOpsPerWriter; ++i) {
+        double latency = 1.0 + static_cast<double>(i % 50) * 0.1;
+        bool cache_hit = ((i + t) % 3) != 0;  // ~67% hit rate
+        collector.RecordExecution(latency, cache_hit);
+      }
+    });
+  }
+
+  // --- Reader threads (simulate monitoring/GetSystemMetrics RPCs) ---
+  for (int r = 0; r < kReaderThreads; ++r) {
+    threads.emplace_back([&collector, &start]() {
+      while (!start.load(std::memory_order_acquire)) {
+        // spin-wait for synchronized start
+      }
+      volatile double sink = 0.0;
+      for (int i = 0; i < kReadsPerReader; ++i) {
+        MetricsSnapshot snap = collector.GetSnapshot();
+        double hit_rate = collector.GetCacheHitRate();
+
+        // Consume values to prevent compiler optimization.
+        // We do NOT assert here — the values may be torn/inconsistent
+        // due to the race, and that's the whole point. Only TSan
+        // should report the error.
+        sink += snap.avg_latency_ms + hit_rate;
+      }
+      (void)sink;
+    });
+  }
+
+  // Release all threads simultaneously to maximize contention.
+  start.store(true, std::memory_order_release);
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  // Without TSan, this test passes — the counters may be off because of
+  // lost updates, but no assertion checks them during the race. With TSan,
+  // the race is reported; this line is reached unless halt_on_error=1 is set.
+  MetricsSnapshot final_snap = collector.GetSnapshot();
+  EXPECT_GT(final_snap.total_executions, 0);
+}

From d30d25f578e57cbd746f87f039695399a8824a84 Mon Sep 17 00:00:00 2001
From: Sam-Si <13261099+Sam-Si@users.noreply.github.com>
Date: Mon, 20 Apr 2026 07:50:50 +0530
Subject: [PATCH 05/10] fix: resolve CI bash pipefail and bazel flag
 deprecation warnings

* Refactor the TSan proof step in CI to avoid bash pipefail conflicts
  when capturing Bazel exit codes.
* Read the test log directly from bazel-testlogs to reliably detect
  the ThreadSanitizer data race footprint.
* Update ci.bazelrc to use the non-deprecated local_resources=cpu=N
  syntax for Bazel 7.x.
---
 .github/workflows/ci.bazelrc   |  4 ++--
 .github/workflows/linux_ci.yml | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci.bazelrc b/.github/workflows/ci.bazelrc
index 09157f1..7ac6878 100644
--- a/.github/workflows/ci.bazelrc
+++ b/.github/workflows/ci.bazelrc
@@ -9,8 +9,8 @@ startup --host_jvm_args=-Xmx2g
 
 # 2. RESOURCE THROTTLING
 # Limit parallel execution to avoid OOM and CPU contention
-build --local_cpu_resources=2
-build --local_ram_resources=4096
+build --local_resources=cpu=2
+build --local_resources=memory=4096
 build --jobs=4
 
 # 3. OBSERVABILITY
diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index 308beff..22f81a1 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -59,23 +59,22 @@ jobs:
         echo "  PROOF: Running deliberately buggy code under TSan."
         echo "  TSan MUST detect the data race for this step to pass."
         echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
-        set +e
+        
+        EXIT_CODE=0
         bazel --bazelrc=.github/workflows/ci.bazelrc test \
           --config=tsan \
           --runs_per_test=1 \
-          --test_output=all \
-          //src/engine:tsan_proof_test 2>&1 | tee /tmp/tsan_proof.log
-        EXIT_CODE=$?
-        set -e
+          --test_output=errors \
+          //src/engine:tsan_proof_test || EXIT_CODE=$?
 
         if [ "$EXIT_CODE" -eq 0 ]; then
-          echo ""
           echo "::error::❌ CRITICAL: TSan did NOT detect the deliberate data race!"
           echo "::error::The sanitizer pipeline is BROKEN. Investigate immediately."
           exit 1
         fi
 
-        if grep -q "ThreadSanitizer" /tmp/tsan_proof.log; then
+        TEST_LOG="bazel-testlogs/src/engine/tsan_proof_test/test.log"
+        if [ -f "$TEST_LOG" ] && grep -q "ThreadSanitizer" "$TEST_LOG"; then
           echo ""
           echo "✅ ThreadSanitizer correctly detected the data race."
           echo "   The sanitizer pipeline is verified and operational."
@@ -83,6 +82,7 @@ jobs:
         else
           echo ""
           echo "::error::Test failed but NOT due to TSan detection. Check logs."
+          cat "$TEST_LOG" || true
           exit 1
         fi
 

From 0bb332b24c2dd4b14260d4fb45e8ca466769321b Mon Sep 17 00:00:00 2001
From: Sam-Si <13261099+Sam-Si@users.noreply.github.com>
Date: Tue, 28 Apr 2026 23:34:37 +0530
Subject: [PATCH 06/10] ci: retrigger pipeline


From f48b6b48fae8dbb8d7ca21938e8a7c608046b67d Mon Sep 17 00:00:00 2001
From: Sam-Si <13261099+Sam-Si@users.noreply.github.com>
Date: Wed, 29 Apr 2026 00:21:27 +0530
Subject: [PATCH 07/10] fix: exclude sandbox_test from TSan to prevent OOM on
 CI

sandbox_test forks clang++ and compiled binaries as child processes.
Under TSan (8x memory overhead), these forked processes exhaust the
7GB CI runner memory, causing 'out of memory: failed to allocate
TracePart' errors and spurious exit code 66 failures.

Tagged sandbox_test with 'no-tsan' and added --test_tag_filters=-no-tsan
to the CI TSan step. sandbox_test still runs under Standard, ASan,
and MSan configurations.
---
 .github/workflows/linux_ci.yml | 3 ++-
 src/engine/BUILD               | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index 22f81a1..904b12b 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -51,7 +51,8 @@ jobs:
     - name: Bazel Test (TSan)
       run: |
         bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan //... \
-          --jobs=2
+          --jobs=2 \
+          --test_tag_filters=-no-tsan
 
     - name: "🔬 TSan Proof-of-Detection"
       run: |
diff --git a/src/engine/BUILD b/src/engine/BUILD
index 9035596..481663d 100755
--- a/src/engine/BUILD
+++ b/src/engine/BUILD
@@ -206,6 +206,10 @@ cc_test(
     timeout = "moderate",
     linkstatic = True,
     linkopts = ["-pthread"],
+    # Excluded from TSan runs: this test forks clang++ and compiled binaries.
+    # TSan-instrumenting those child processes causes OOM on CI runners
+    # and provides no concurrency testing value.
+    tags = ["no-tsan"],
     deps = [
         ":sandbox",
         ":dynamic_worker_coordinator",

From eccba992b0f6d96995dc3dc25d5193d4f048d24a Mon Sep 17 00:00:00 2001
From: Sam-Si <13261099+Sam-Si@users.noreply.github.com>
Date: Wed, 29 Apr 2026 00:32:37 +0530
Subject: [PATCH 08/10] fix: run sandbox_test under TSan with constrained
 resources (Option D)

Instead of excluding sandbox_test from TSan entirely, split the TSan
step into two:
  1. Main TSan step: runs all tests except sandbox_test (20 iterations)
  2. Constrained TSan step: runs sandbox_test with --runs_per_test=1
     and --local_test_jobs=1 to prevent OOM from forked clang++

Also reduce CI log noise:
  - test_output=errors (only print output for failing tests)
  - noannounce_rc (suppress the full bazelrc option dump)
  - show_progress_rate_limit=5 (throttle progress updates)
  - curses=no (disable terminal control sequences)
---
 .github/workflows/ci.bazelrc   | 11 ++++++++---
 .github/workflows/linux_ci.yml | 13 +++++++++++--
 src/engine/BUILD               |  7 +++----
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci.bazelrc b/.github/workflows/ci.bazelrc
index 7ac6878..fed2ae4 100644
--- a/.github/workflows/ci.bazelrc
+++ b/.github/workflows/ci.bazelrc
@@ -13,10 +13,15 @@ build --local_resources=cpu=2
 build --local_resources=memory=4096
 build --jobs=4
 
-# 3. OBSERVABILITY
-# Ensure full output in CI logs
-test --test_output=all
+# 3. QUIET LOGGING
+# Only print output for FAILED tests (not every passing test)
+test --test_output=errors
 build --verbose_failures
+# Suppress the full option dump on every command
+common --noannounce_rc
+# Collapse progress into summary lines
+build --show_progress_rate_limit=5
+build --curses=no
 
 # 4. SANITIZER HARDENING
 # Use suppressions to ignore system library false positives
diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index 904b12b..aff2d9e 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -50,9 +50,18 @@ jobs:
 
     - name: Bazel Test (TSan)
       run: |
-        bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan //... \
+        bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan \
+          //... \
           --jobs=2 \
-          --test_tag_filters=-no-tsan
+          --test_tag_filters=-no-sandbox-tsan
+
+    - name: Bazel Test (TSan - Sandbox, constrained)
+      run: |
+        bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan \
+          //src/engine:sandbox_test \
+          --jobs=1 \
+          --runs_per_test=1 \
+          --local_test_jobs=1
 
     - name: "🔬 TSan Proof-of-Detection"
       run: |
diff --git a/src/engine/BUILD b/src/engine/BUILD
index 481663d..403523d 100755
--- a/src/engine/BUILD
+++ b/src/engine/BUILD
@@ -206,10 +206,9 @@ cc_test(
     timeout = "moderate",
     linkstatic = True,
     linkopts = ["-pthread"],
-    # Excluded from TSan runs: this test forks clang++ and compiled binaries.
-    # TSan-instrumenting those child processes causes OOM on CI runners
-    # and provides no concurrency testing value.
-    tags = ["no-tsan"],
+    # Run under TSan separately with constrained resources (see CI workflow).
+    # This test forks clang++ which has high memory overhead under TSan.
+    tags = ["no-sandbox-tsan"],
     deps = [
         ":sandbox",
         ":dynamic_worker_coordinator",

From eac187a8e17cb2053aab455ff354df1d36db37a0 Mon Sep 17 00:00:00 2001
From: Sam-Si <13261099+Sam-Si@users.noreply.github.com>
Date: Wed, 29 Apr 2026 23:23:32 +0530
Subject: [PATCH 09/10] fix: redirect bazel output to file for TSan proof grep

The test log symlink lives at .bazel/testlogs/ (due to
--symlink_prefix=.bazel/) not bazel-testlogs/. Instead of
guessing the symlink path, capture bazel's stdout+stderr
to /tmp/tsan_proof_output.log and grep that directly.
---
 .github/workflows/linux_ci.yml | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index aff2d9e..a8598d7 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -74,8 +74,8 @@ jobs:
         bazel --bazelrc=.github/workflows/ci.bazelrc test \
           --config=tsan \
           --runs_per_test=1 \
-          --test_output=errors \
-          //src/engine:tsan_proof_test || EXIT_CODE=$?
+          --test_output=all \
+          //src/engine:tsan_proof_test > /tmp/tsan_proof_output.log 2>&1 || EXIT_CODE=$?
 
         if [ "$EXIT_CODE" -eq 0 ]; then
           echo "::error::❌ CRITICAL: TSan did NOT detect the deliberate data race!"
@@ -83,16 +83,15 @@ jobs:
           exit 1
         fi
 
-        TEST_LOG="bazel-testlogs/src/engine/tsan_proof_test/test.log"
-        if [ -f "$TEST_LOG" ] && grep -q "ThreadSanitizer" "$TEST_LOG"; then
+        if grep -q "ThreadSanitizer" /tmp/tsan_proof_output.log; then
           echo ""
           echo "✅ ThreadSanitizer correctly detected the data race."
           echo "   The sanitizer pipeline is verified and operational."
           echo "   Your codebase is protected against concurrency bugs."
         else
           echo ""
-          echo "::error::Test failed but NOT due to TSan detection. Check logs."
-          cat "$TEST_LOG" || true
+          echo "::error::Test failed but NOT due to TSan detection. Check logs:"
+          cat /tmp/tsan_proof_output.log
           exit 1
         fi
 

From c344ec844cd0f494f3d2dc5655b8be4932a017fc Mon Sep 17 00:00:00 2001
From: Sam-Si <13261099+Sam-Si@users.noreply.github.com>
Date: Thu, 30 Apr 2026 10:16:34 +0530
Subject: [PATCH 10/10] =?UTF-8?q?fix:=20remove=20MSan=20from=20CI=20?=
 =?UTF-8?q?=E2=80=94=20requires=20instrumented=20libc++?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MSan fundamentally requires ALL linked libraries to be compiled with
MSan instrumentation. The system libstdc++ on GitHub Actions runners
is not instrumented, causing every test to fail in googletest's
static initialization (testing::Message::Message) before any DCodeX
code even executes.

TSan and ASan remain — they work correctly with non-instrumented
system libraries. The MSan config is preserved in .bazelrc for local
use with a custom toolchain.
---
 .github/workflows/linux_ci.yml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index a8598d7..ccf2f74 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -100,10 +100,11 @@ jobs:
         bazel --bazelrc=.github/workflows/ci.bazelrc test --config=asan //... \
           --jobs=2
 
-    - name: Bazel Test (MSan)
-      run: |
-        bazel --bazelrc=.github/workflows/ci.bazelrc test --config=msan //... \
-          --jobs=2
+    # MSan is intentionally excluded from CI. It requires ALL linked libraries
+    # (including libstdc++) to be compiled with MSan instrumentation. The system
+    # libstdc++ on GitHub runners is not instrumented, producing false positives
+    # in googletest before any DCodeX code even runs. To use MSan, build a
+    # custom toolchain with instrumented libc++ (see: https://clang.llvm.org/docs/MemorySanitizer.html#handling-external-code).
 
     - name: Upload Test Logs on Failure
       if: failure()