Sam-Si · Sam-Si · Apr 30, 2026 · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026
diff --git a/.github/workflows/ci.bazelrc b/.github/workflows/ci.bazelrc
@@ -0,0 +1,30 @@
+# ============================================================
+# DCodeX: GitHub Actions CI Specialized Config
+# Optimized for free-tier runners (2 CPUs, 7GB RAM)
+# ============================================================
+
+# 1. MEMORY LIMITS
+# Limit Bazel server heap to 2GB to leave room for Clang/LLD
+startup --host_jvm_args=-Xmx2g
+
+# 2. RESOURCE THROTTLING
+# Limit parallel execution to avoid OOM and CPU contention
+build --local_resources=cpu=2
+build --local_resources=memory=4096
+build --jobs=4
+
+# 3. QUIET LOGGING
+# Only print output for FAILED tests (not every passing test)
+test --test_output=errors
+build --verbose_failures
+# Suppress the full option dump on every command
+common --noannounce_rc
+# Collapse progress into summary lines
+build --show_progress_rate_limit=5
+build --curses=no
+
+# 4. SANITIZER HARDENING
+# Use suppressions to ignore system library false positives
+test:msan --action_env=MSAN_OPTIONS="halt_on_error=1:exitcode=77:suppressions=.github/workflows/msan_suppressions.txt"
+# Recommended for LLVM 19+
+build:msan --copt=-fsanitize-memory-param-retval
diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
@@ -42,47 +42,69 @@ jobs:
 
     - name: Bazel Build
       run: |
-        bazel build //... \
-          --jobs=4 \
-          --local_resources=cpu=2,memory=4096 \
-          --host_jvm_args=-Xmx2g \
-          --verbose_failures
+        bazel --bazelrc=.github/workflows/ci.bazelrc build //...
 
     - name: Bazel Test (Standard)
       run: |
-        bazel test //... \
-          --jobs=4 \
-          --local_resources=cpu=2,memory=4096 \
-          --host_jvm_args=-Xmx2g \
-          --test_output=all \
-          --verbose_failures
+        bazel --bazelrc=.github/workflows/ci.bazelrc test //...
 
     - name: Bazel Test (TSan)
       run: |
-        bazel test --config=tsan //... \
+        bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan \
+          //... \
           --jobs=2 \
-          --local_resources=cpu=2,memory=4096 \
-          --host_jvm_args=-Xmx2g \
-          --test_output=all \
-          --verbose_failures
+          --test_tag_filters=-no-sandbox-tsan
 
-    - name: Bazel Test (ASan)
+    - name: Bazel Test (TSan - Sandbox, constrained)
       run: |
-        bazel test --config=asan //... \
-          --jobs=2 \
-          --local_resources=cpu=2,memory=4096 \
-          --host_jvm_args=-Xmx2g \
-          --test_output=all \
-          --verbose_failures
+        bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan \
+          //src/engine:sandbox_test \
+          --jobs=1 \
+          --runs_per_test=1 \
+          --local_test_jobs=1
 
-    - name: Bazel Test (MSan)
+    - name: "🔬 TSan Proof-of-Detection"
       run: |
-        bazel test --config=msan //... \
-          --jobs=2 \
-          --local_resources=cpu=2,memory=4096 \
-          --host_jvm_args=-Xmx2g \
+        echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+        echo "  PROOF: Running deliberately buggy code under TSan."
+        echo "  TSan MUST detect the data race for this step to pass."
+        echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+
+        EXIT_CODE=0
+        bazel --bazelrc=.github/workflows/ci.bazelrc test \
+          --config=tsan \
+          --runs_per_test=1 \
           --test_output=all \
-          --verbose_failures
+          //src/engine:tsan_proof_test > /tmp/tsan_proof_output.log 2>&1 || EXIT_CODE=$?
+
+        if [ "$EXIT_CODE" -eq 0 ]; then
+          echo "::error::❌ CRITICAL: TSan did NOT detect the deliberate data race!"
+          echo "::error::The sanitizer pipeline is BROKEN. Investigate immediately."
+          exit 1
+        fi
+
+        if grep -q "ThreadSanitizer" /tmp/tsan_proof_output.log; then
+          echo ""
+          echo "✅ ThreadSanitizer correctly detected the data race."
+          echo "   The sanitizer pipeline is verified and operational."
+          echo "   Your codebase is protected against concurrency bugs."
+        else
+          echo ""
+          echo "::error::Test failed but NOT due to TSan detection. Check logs:"
+          cat /tmp/tsan_proof_output.log
+          exit 1
+        fi
+
+    - name: Bazel Test (ASan)
+      run: |
+        bazel --bazelrc=.github/workflows/ci.bazelrc test --config=asan //... \
+          --jobs=2
+
+    # MSan is intentionally excluded from CI. It requires ALL linked libraries
+    # (including libstdc++) to be compiled with MSan instrumentation. The system
+    # libstdc++ on GitHub runners is not instrumented, producing false positives
+    # in googletest before any DCodeX code even runs. To use MSan, build a
+    # custom toolchain with instrumented libc++ (see: https://clang.llvm.org/docs/MemorySanitizer.html#handling-external-code).
 
     - name: Upload Test Logs on Failure
       if: failure()

diff --git a/.github/workflows/msan_suppressions.txt b/.github/workflows/msan_suppressions.txt
@@ -0,0 +1,11 @@
+# MSan suppressions for DCodeX
+# These ignore false positives from the non-instrumented system libstdc++
+# and googletest internals.
+
+# Ignore googletest internal message formatting
+interceptor_via_lib:libstdc++.so.6
+interceptor_via_fun:testing::Message::Message
+interceptor_via_fun:testing::internal::*
+
+# Ignore specific system library calls that touch uninstrumented memory
+interceptor_via_lib:libc.so.6
diff --git a/src/engine/BUILD b/src/engine/BUILD
@@ -206,6 +206,9 @@ cc_test(
     timeout = "moderate",
     linkstatic = True,
     linkopts = ["-pthread"],
+    # Run under TSan separately with constrained resources (see CI workflow).
+    # This test forks clang++ which has high memory overhead under TSan.
+    tags = ["no-sandbox-tsan"],
     deps = [
         ":sandbox",
         ":dynamic_worker_coordinator",
@@ -232,6 +235,25 @@ cc_test(
     ],
 )
 
+# ── TSan Proof-of-Detection ──────────────────────────────────────────
+# This test contains DELIBERATE data races. It is tagged "manual" so it
+# is excluded from `bazel test //...`. The CI runs it in a special step
+# that EXPECTS TSan to detect the race and exit with code 66. If TSan
+# does NOT detect the race, the CI step fails — proving the sanitizer
+# pipeline is broken.
+cc_test(
+    name = "tsan_proof_test",
+    srcs = ["tsan_proof_test.cc"],
+    copts = ["-std=c++23"],
+    linkstatic = True,
+    linkopts = ["-pthread"],
+    tags = ["manual", "no-sandbox", "exclusive"],
+    deps = [
+        "@googletest//:gtest",
+        "@googletest//:gtest_main",
+    ],
+)
+
 test_suite(
     name = "concurrency_tests",
     tests = [

diff --git a/src/engine/tsan_proof_test.cc b/src/engine/tsan_proof_test.cc
@@ -0,0 +1,207 @@
+// Copyright 2024 DCodeX Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// =====================================================================
+// TSan PROOF-OF-DETECTION Test
+// =====================================================================
+//
+// PURPOSE:
+//   This file contains a DELIBERATELY BUGGY implementation of a pattern
+//   that is common in high-performance systems like DCodeX: a metrics
+//   collector where a developer removed mutex protection for "performance",
+//   introducing a subtle data race.
+//
+// WHY IT'S SUBTLE:
+//   On x86-64, aligned 64-bit reads and writes are naturally atomic at
+//   the hardware level. This means that in normal testing — even with
+//   hundreds of threads — the race almost never manifests as a wrong
+//   answer. The program "works fine" in dev, in staging, and even in
+//   production for weeks. Then, one day, the compiler reorders a store
+//   past a load, or the CPU's store buffer batches two writes, and you
+//   get a corrupted metrics snapshot served to a monitoring dashboard.
+//
+// WHAT THIS PROVES:
+//   ThreadSanitizer instruments every memory access at compile time and
+//   detects the happens-before violation DETERMINISTICALLY, regardless
+//   of whether the hardware would actually reorder. This test is run in
+//   CI with --config=tsan and is EXPECTED TO FAIL with exit code 66.
+//   If TSan does NOT detect the race, the CI step fails — proving the
+//   sanitizer pipeline itself is broken.
+//
+// DO NOT FIX THE BUGS IN THIS FILE. THEY ARE INTENTIONAL.
+// =====================================================================
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cmath>
+#include <thread>
+#include <vector>
+
+namespace {
+
+// =====================================================================
+// DELIBERATELY BUGGY: Unprotected Execution Metrics Collector
+//
+// This mirrors the real DCodeX pattern:
+//   DynamicWorkerCoordinator::Metrics  → MetricsSnapshot
+//   GetMetrics()                       → GetSnapshot()
+//   completed_requests_               → total_executions_
+//
+// The real code uses absl::Mutex + ABSL_GUARDED_BY correctly.
+// This "optimized" version removes the lock. The race is on the
+// compound read-modify-write of the shared struct fields.
+// =====================================================================
+
+struct MetricsSnapshot {
+  int64_t total_executions;
+  int64_t cache_hits;
+  int64_t cache_misses;
+  double cumulative_latency_ms;
+  double avg_latency_ms;
+};
+
+class BuggyMetricsCollector {
+ public:
+  BuggyMetricsCollector()
+      : total_executions_(0),
+        cache_hits_(0),
+        cache_misses_(0),
+        cumulative_latency_ms_(0.0) {}
+
+  // BUG: Multiple threads call this concurrently without synchronization.
+  // The individual increments LOOK atomic on x86, but:
+  //   1) They are not atomic in the C++ memory model.
+  //   2) The compound operation (read + modify + write) on
+  //      cumulative_latency_ms_ is NEVER atomic, even on x86.
+  //   3) The cache_hit branch means different threads take different
+  //      code paths, increasing the window for interleaving.
+  void RecordExecution(double latency_ms, bool cache_hit) {
+    total_executions_++;                       // DATA RACE
+    cumulative_latency_ms_ += latency_ms;      // DATA RACE (compound RMW)
+
+    if (cache_hit) {
+      cache_hits_++;                           // DATA RACE
+    } else {
+      cache_misses_++;                         // DATA RACE
+    }
+  }
+
+  // BUG: TOCTOU race — reads of total_executions_ and cumulative_latency_ms_
+  // are not atomic with respect to each other. A writer can increment
+  // total_executions_ between our read of cumulative_latency_ms_ and
+  // our read of total_executions_, producing a snapshot where
+  // avg_latency = cumulative / (N+1) instead of cumulative / N.
+  MetricsSnapshot GetSnapshot() const {
+    MetricsSnapshot snap;
+    snap.total_executions = total_executions_;          // DATA RACE (read)
+    snap.cache_hits = cache_hits_;                      // DATA RACE (read)
+    snap.cache_misses = cache_misses_;                  // DATA RACE (read)
+    snap.cumulative_latency_ms = cumulative_latency_ms_;// DATA RACE (read)
+
+    // TOCTOU: total_executions_ may have changed between the reads above.
+    if (snap.total_executions > 0) {
+      snap.avg_latency_ms =
+          snap.cumulative_latency_ms / static_cast<double>(snap.total_executions);
+    } else {
+      snap.avg_latency_ms = 0.0;
+    }
+    return snap;
+  }
+
+  double GetCacheHitRate() const {
+    int64_t total = total_executions_;   // DATA RACE (read)
+    if (total == 0) return 0.0;
+    int64_t hits = cache_hits_;          // DATA RACE (read, TOCTOU with above)
+    return static_cast<double>(hits) / static_cast<double>(total);
+  }
+
+ private:
+  // Shared mutable state with NO synchronization.
+  // In the real DCodeX code, these would be protected by absl::Mutex
+  // or std::atomic. Here, they are deliberately unprotected.
+  int64_t total_executions_;
+  int64_t cache_hits_;
+  int64_t cache_misses_;
+  double cumulative_latency_ms_;
+};
+
+}  // namespace
+
+// =====================================================================
+// TEST: Exercises the race with realistic DCodeX-like traffic patterns.
+//
+// Without TSan: Passes on x86 (hardware hides the race).
+// With TSan:    Fails with exit code 66 on the FIRST racy access.
+// =====================================================================
+TEST(TsanProof, CatchesMetricsCollectorRace) {
+  BuggyMetricsCollector collector;
+
+  constexpr int kWriterThreads = 6;
+  constexpr int kReaderThreads = 2;
+  constexpr int kOpsPerWriter = 5000;
+  constexpr int kReadsPerReader = 2000;
+
+  std::atomic<bool> start{false};
+  std::vector<std::thread> threads;
+  threads.reserve(kWriterThreads + kReaderThreads);
+
+  // --- Writer threads (simulate concurrent gRPC Execute requests) ---
+  for (int t = 0; t < kWriterThreads; ++t) {
+    threads.emplace_back([&collector, &start, t]() {
+      while (!start.load(std::memory_order_acquire)) {
+        // spin-wait for synchronized start
+      }
+      for (int i = 0; i < kOpsPerWriter; ++i) {
+        double latency = 1.0 + static_cast<double>(i % 50) * 0.1;
+        bool cache_hit = ((i + t) % 3) != 0;  // ~67% hit rate
+        collector.RecordExecution(latency, cache_hit);
+      }
+    });
+  }
+
+  // --- Reader threads (simulate monitoring/GetSystemMetrics RPCs) ---
+  for (int r = 0; r < kReaderThreads; ++r) {
+    threads.emplace_back([&collector, &start]() {
+      while (!start.load(std::memory_order_acquire)) {
+        // spin-wait for synchronized start
+      }
+      volatile double sink = 0.0;
+      for (int i = 0; i < kReadsPerReader; ++i) {
+        MetricsSnapshot snap = collector.GetSnapshot();
+        double hit_rate = collector.GetCacheHitRate();
+
+        // Consume values to prevent compiler optimization.
+        // We do NOT assert here — the values may be torn/inconsistent
+        // due to the race, and that's the whole point. Only TSan
+        // should report the error.
+        sink += snap.avg_latency_ms + hit_rate;
+      }
+      (void)sink;
+    });
+  }
+
+  // Release all threads simultaneously to maximize contention.
+  start.store(true, std::memory_order_release);
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  // Without TSan, this test passes — the values may be slightly off on
+  // some architectures, but no assertion checks them during the race.
+  // With TSan, the process was killed long before reaching this line.
+  MetricsSnapshot final_snap = collector.GetSnapshot();
+  EXPECT_GT(final_snap.total_executions, 0);
+}