diff --git a/.github/workflows/ci.bazelrc b/.github/workflows/ci.bazelrc new file mode 100644 index 0000000..fed2ae4 --- /dev/null +++ b/.github/workflows/ci.bazelrc @@ -0,0 +1,30 @@ +# ============================================================ +# DCodeX: GitHub Actions CI Specialized Config +# Optimized for free-tier runners (2 CPUs, 7GB RAM) +# ============================================================ + +# 1. MEMORY LIMITS +# Limit Bazel server heap to 2GB to leave room for Clang/LLD +startup --host_jvm_args=-Xmx2g + +# 2. RESOURCE THROTTLING +# Limit parallel execution to avoid OOM and CPU contention +build --local_resources=cpu=2 +build --local_resources=memory=4096 +build --jobs=4 + +# 3. QUIET LOGGING +# Only print output for FAILED tests (not every passing test) +test --test_output=errors +build --verbose_failures +# Suppress the full option dump on every command +common --noannounce_rc +# Collapse progress into summary lines +build --show_progress_rate_limit=5 +build --curses=no + +# 4. SANITIZER HARDENING +# Use suppressions to ignore system library false positives +test:msan --action_env=MSAN_OPTIONS="halt_on_error=1:exitcode=77:suppressions=.github/workflows/msan_suppressions.txt" +# Recommended for LLVM 19+ +build:msan --copt=-fsanitize-memory-param-retval diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index c8d8722..ccf2f74 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -42,47 +42,69 @@ jobs: - name: Bazel Build run: | - bazel build //... \ - --jobs=4 \ - --local_resources=cpu=2,memory=4096 \ - --host_jvm_args=-Xmx2g \ - --verbose_failures + bazel --bazelrc=.github/workflows/ci.bazelrc build //... - name: Bazel Test (Standard) run: | - bazel test //... \ - --jobs=4 \ - --local_resources=cpu=2,memory=4096 \ - --host_jvm_args=-Xmx2g \ - --test_output=all \ - --verbose_failures + bazel --bazelrc=.github/workflows/ci.bazelrc test //... - name: Bazel Test (TSan) run: | - bazel test --config=tsan //... \ + bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan \ + //... \ --jobs=2 \ - --local_resources=cpu=2,memory=4096 \ - --host_jvm_args=-Xmx2g \ - --test_output=all \ - --verbose_failures + --test_tag_filters=-no-sandbox-tsan - - name: Bazel Test (ASan) + - name: Bazel Test (TSan - Sandbox, constrained) run: | - bazel test --config=asan //... \ - --jobs=2 \ - --local_resources=cpu=2,memory=4096 \ - --host_jvm_args=-Xmx2g \ - --test_output=all \ - --verbose_failures + bazel --bazelrc=.github/workflows/ci.bazelrc test --config=tsan \ + //src/engine:sandbox_test \ + --jobs=1 \ + --runs_per_test=1 \ + --local_test_jobs=1 - - name: Bazel Test (MSan) + - name: "🔬 TSan Proof-of-Detection" run: | - bazel test --config=msan //... \ - --jobs=2 \ - --local_resources=cpu=2,memory=4096 \ - --host_jvm_args=-Xmx2g \ + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo " PROOF: Running deliberately buggy code under TSan." + echo " TSan MUST detect the data race for this step to pass." + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + + EXIT_CODE=0 + bazel --bazelrc=.github/workflows/ci.bazelrc test \ + --config=tsan \ + --runs_per_test=1 \ --test_output=all \ - --verbose_failures + //src/engine:tsan_proof_test > /tmp/tsan_proof_output.log 2>&1 || EXIT_CODE=$? + + if [ "$EXIT_CODE" -eq 0 ]; then + echo "::error::❌ CRITICAL: TSan did NOT detect the deliberate data race!" + echo "::error::The sanitizer pipeline is BROKEN. Investigate immediately." + exit 1 + fi + + if grep -q "ThreadSanitizer" /tmp/tsan_proof_output.log; then + echo "" + echo "✅ ThreadSanitizer correctly detected the data race." + echo " The sanitizer pipeline is verified and operational." + echo " Your codebase is protected against concurrency bugs." + else + echo "" + echo "::error::Test failed but NOT due to TSan detection. Check logs:" + cat /tmp/tsan_proof_output.log + exit 1 + fi + + - name: Bazel Test (ASan) + run: | + bazel --bazelrc=.github/workflows/ci.bazelrc test --config=asan //... \ + --jobs=2 + + # MSan is intentionally excluded from CI. It requires ALL linked libraries + # (including libstdc++) to be compiled with MSan instrumentation. The system + # libstdc++ on GitHub runners is not instrumented, producing false positives + # in googletest before any DCodeX code even runs. To use MSan, build a + # custom toolchain with instrumented libc++ (see: https://clang.llvm.org/docs/MemorySanitizer.html#handling-external-code). - name: Upload Test Logs on Failure if: failure() diff --git a/.github/workflows/msan_suppressions.txt b/.github/workflows/msan_suppressions.txt new file mode 100644 index 0000000..238243a --- /dev/null +++ b/.github/workflows/msan_suppressions.txt @@ -0,0 +1,11 @@ +# MSan suppressions for DCodeX +# These ignore false positives from the non-instrumented system libstdc++ +# and googletest internals. + +# Ignore googletest internal message formatting +interceptor_via_lib:libstdc++.so.6 +interceptor_via_fun:testing::Message::Message +interceptor_via_fun:testing::internal::* + +# Ignore specific system library calls that touch uninstrumented memory +interceptor_via_lib:libc.so.6 diff --git a/src/engine/BUILD b/src/engine/BUILD index 33c044b..403523d 100755 --- a/src/engine/BUILD +++ b/src/engine/BUILD @@ -206,6 +206,9 @@ cc_test( timeout = "moderate", linkstatic = True, linkopts = ["-pthread"], + # Run under TSan separately with constrained resources (see CI workflow). + # This test forks clang++ which has high memory overhead under TSan. + tags = ["no-sandbox-tsan"], deps = [ ":sandbox", ":dynamic_worker_coordinator", @@ -232,6 +235,25 @@ cc_test( ], ) +# ── TSan Proof-of-Detection ────────────────────────────────────────── +# This test contains DELIBERATE data races. It is tagged "manual" so it +# is excluded from `bazel test //...`. The CI runs it in a special step +# that EXPECTS TSan to detect the race and exit with code 66. If TSan +# does NOT detect the race, the CI step fails — proving the sanitizer +# pipeline is broken. +cc_test( + name = "tsan_proof_test", + srcs = ["tsan_proof_test.cc"], + copts = ["-std=c++23"], + linkstatic = True, + linkopts = ["-pthread"], + tags = ["manual", "no-sandbox", "exclusive"], + deps = [ + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + test_suite( name = "concurrency_tests", tests = [ diff --git a/src/engine/tsan_proof_test.cc b/src/engine/tsan_proof_test.cc new file mode 100644 index 0000000..8b95014 --- /dev/null +++ b/src/engine/tsan_proof_test.cc @@ -0,0 +1,207 @@ +// Copyright 2024 DCodeX Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ===================================================================== +// TSan PROOF-OF-DETECTION Test +// ===================================================================== +// +// PURPOSE: +// This file contains a DELIBERATELY BUGGY implementation of a pattern +// that is common in high-performance systems like DCodeX: a metrics +// collector where a developer removed mutex protection for "performance", +// introducing a subtle data race. +// +// WHY IT'S SUBTLE: +// On x86-64, aligned 64-bit reads and writes are naturally atomic at +// the hardware level. This means that in normal testing — even with +// hundreds of threads — the race almost never manifests as a wrong +// answer. The program "works fine" in dev, in staging, and even in +// production for weeks. Then, one day, the compiler reorders a store +// past a load, or the CPU's store buffer batches two writes, and you +// get a corrupted metrics snapshot served to a monitoring dashboard. +// +// WHAT THIS PROVES: +// ThreadSanitizer instruments every memory access at compile time and +// detects the happens-before violation DETERMINISTICALLY, regardless +// of whether the hardware would actually reorder. This test is run in +// CI with --config=tsan and is EXPECTED TO FAIL with exit code 66. +// If TSan does NOT detect the race, the CI step fails — proving the +// sanitizer pipeline itself is broken. +// +// DO NOT FIX THE BUGS IN THIS FILE. THEY ARE INTENTIONAL. +// ===================================================================== + +#include + +#include +#include +#include +#include + +namespace { + +// ===================================================================== +// DELIBERATELY BUGGY: Unprotected Execution Metrics Collector +// +// This mirrors the real DCodeX pattern: +// DynamicWorkerCoordinator::Metrics → MetricsSnapshot +// GetMetrics() → GetSnapshot() +// completed_requests_ → total_executions_ +// +// The real code uses absl::Mutex + ABSL_GUARDED_BY correctly. +// This "optimized" version removes the lock. The race is on the +// compound read-modify-write of the shared struct fields. +// ===================================================================== + +struct MetricsSnapshot { + int64_t total_executions; + int64_t cache_hits; + int64_t cache_misses; + double cumulative_latency_ms; + double avg_latency_ms; +}; + +class BuggyMetricsCollector { + public: + BuggyMetricsCollector() + : total_executions_(0), + cache_hits_(0), + cache_misses_(0), + cumulative_latency_ms_(0.0) {} + + // BUG: Multiple threads call this concurrently without synchronization. + // The individual increments LOOK atomic on x86, but: + // 1) They are not atomic in the C++ memory model. + // 2) The compound operation (read + modify + write) on + // cumulative_latency_ms_ is NEVER atomic, even on x86. + // 3) The cache_hit branch means different threads take different + // code paths, increasing the window for interleaving. + void RecordExecution(double latency_ms, bool cache_hit) { + total_executions_++; // DATA RACE + cumulative_latency_ms_ += latency_ms; // DATA RACE (compound RMW) + + if (cache_hit) { + cache_hits_++; // DATA RACE + } else { + cache_misses_++; // DATA RACE + } + } + + // BUG: TOCTOU race — reads of total_executions_ and cumulative_latency_ms_ + // are not atomic with respect to each other. A writer can increment + // total_executions_ between our read of cumulative_latency_ms_ and + // our read of total_executions_, producing a snapshot where + // avg_latency = cumulative / (N+1) instead of cumulative / N. + MetricsSnapshot GetSnapshot() const { + MetricsSnapshot snap; + snap.total_executions = total_executions_; // DATA RACE (read) + snap.cache_hits = cache_hits_; // DATA RACE (read) + snap.cache_misses = cache_misses_; // DATA RACE (read) + snap.cumulative_latency_ms = cumulative_latency_ms_;// DATA RACE (read) + + // TOCTOU: total_executions_ may have changed between the reads above. + if (snap.total_executions > 0) { + snap.avg_latency_ms = + snap.cumulative_latency_ms / static_cast(snap.total_executions); + } else { + snap.avg_latency_ms = 0.0; + } + return snap; + } + + double GetCacheHitRate() const { + int64_t total = total_executions_; // DATA RACE (read) + if (total == 0) return 0.0; + int64_t hits = cache_hits_; // DATA RACE (read, TOCTOU with above) + return static_cast(hits) / static_cast(total); + } + + private: + // Shared mutable state with NO synchronization. + // In the real DCodeX code, these would be protected by absl::Mutex + // or std::atomic. Here, they are deliberately unprotected. + int64_t total_executions_; + int64_t cache_hits_; + int64_t cache_misses_; + double cumulative_latency_ms_; +}; + +} // namespace + +// ===================================================================== +// TEST: Exercises the race with realistic DCodeX-like traffic patterns. +// +// Without TSan: Passes on x86 (hardware hides the race). +// With TSan: Fails with exit code 66 on the FIRST racy access. +// ===================================================================== +TEST(TsanProof, CatchesMetricsCollectorRace) { + BuggyMetricsCollector collector; + + constexpr int kWriterThreads = 6; + constexpr int kReaderThreads = 2; + constexpr int kOpsPerWriter = 5000; + constexpr int kReadsPerReader = 2000; + + std::atomic start{false}; + std::vector threads; + threads.reserve(kWriterThreads + kReaderThreads); + + // --- Writer threads (simulate concurrent gRPC Execute requests) --- + for (int t = 0; t < kWriterThreads; ++t) { + threads.emplace_back([&collector, &start, t]() { + while (!start.load(std::memory_order_acquire)) { + // spin-wait for synchronized start + } + for (int i = 0; i < kOpsPerWriter; ++i) { + double latency = 1.0 + static_cast(i % 50) * 0.1; + bool cache_hit = ((i + t) % 3) != 0; // ~67% hit rate + collector.RecordExecution(latency, cache_hit); + } + }); + } + + // --- Reader threads (simulate monitoring/GetSystemMetrics RPCs) --- + for (int r = 0; r < kReaderThreads; ++r) { + threads.emplace_back([&collector, &start]() { + while (!start.load(std::memory_order_acquire)) { + // spin-wait for synchronized start + } + volatile double sink = 0.0; + for (int i = 0; i < kReadsPerReader; ++i) { + MetricsSnapshot snap = collector.GetSnapshot(); + double hit_rate = collector.GetCacheHitRate(); + + // Consume values to prevent compiler optimization. + // We do NOT assert here — the values may be torn/inconsistent + // due to the race, and that's the whole point. Only TSan + // should report the error. + sink += snap.avg_latency_ms + hit_rate; + } + (void)sink; + }); + } + + // Release all threads simultaneously to maximize contention. + start.store(true, std::memory_order_release); + + for (auto& t : threads) { + t.join(); + } + + // Without TSan, this test passes — the values may be slightly off on + // some architectures, but no assertion checks them during the race. + // With TSan, the process was killed long before reaching this line. + MetricsSnapshot final_snap = collector.GetSnapshot(); + EXPECT_GT(final_snap.total_executions, 0); +}