diff --git a/.bazelrc b/.bazelrc index b776f81..fcea679 100755 --- a/.bazelrc +++ b/.bazelrc @@ -46,13 +46,16 @@ build --local_resources=memory=57344 # Linux Sandbox Hardening # --sandbox_tmpfs_path=/tmp : clean /tmp per action (isolation) -# --sandbox_add_mount_pair : ensures /dev/null is available -# NOTE: Do NOT use --sandbox_base=/dev/shm — Docker containers default -# to 64MB /dev/shm which is too small for linking large binaries. -# Stale sandbox state ("File exists" errors) is handled by cleaning -# .bazel/output_base/sandbox before builds (see dcodex-setup.sh). +# --sandbox_writable_path=/dev: makes /dev (including /dev/null) available +# as a read-write bind mount inside the sandbox. Bazel's test-setup.sh +# redirects to /dev/null and fails if it's read-only. +# NOTE: Do NOT also add --sandbox_add_mount_pair=/dev/null — it creates +# a separate read-only bind mount that overrides the writable /dev mount. +# NOTE: --sandbox_base CANNOT be under /tmp when --sandbox_tmpfs_path=/tmp +# is set — the tmpfs mount would wipe out the sandbox working directory. +# Stale sandbox state is purged by dcodex-setup.sh between suites. build:linux --sandbox_tmpfs_path=/tmp -build:linux --sandbox_add_mount_pair=/dev/null +build:linux --sandbox_writable_path=/dev build:linux --dynamic_mode=off build:linux --linkopt=-Wl,--threads=16 @@ -68,6 +71,25 @@ build --strategy=CppCompile=sandboxed,standalone build --strategy=CppLink=sandboxed,standalone build --genrule_strategy=sandboxed,standalone +# Collision-free test sandboxing. Two independent safeguards: +# +# 1. No directory reuse — Bazel 7.x defaults --reuse_sandbox_directories=true +# which recycles sandbox dirs across actions. With high --runs_per_test +# (TSan uses 20) and concurrent --local_test_jobs, the async cleanup +# races against new sandbox setup → "Could not copy inputs (File exists)". +# Builds still benefit from reuse; only tests opt out. +# +# 2. 
Synchronous cleanup — the default async mode (4 threads) can fall behind +# during bursts of short-lived test actions, leaving stale inodes that trip +# up the next sandbox setup. Synchronous deletion guarantees a clean slate +# after every action. +# +# NOTE: Do NOT set --sandbox_base under /tmp here — it conflicts with +# --sandbox_tmpfs_path=/tmp (Bazel refuses to run when the sandbox working +# directory is below a tmpfs mount point). +test --noreuse_sandbox_directories +test --experimental_sandbox_async_tree_delete_idle_threads=0 + # ------------------------------------------------------------ # 6. CACHING # ------------------------------------------------------------ @@ -124,7 +146,11 @@ build:tsan --linkopt=-fsanitize=thread build:tsan --copt=-DTHREAD_SANITIZER test:tsan --action_env=TSAN_OPTIONS="history_size=7:halt_on_error=1:detect_deadlocks=1:second_deadlock_stack=1:strict_memcmp=1:report_atomic_races=1:force_seq_cst_atomics=1:exitcode=66:symbolize=1:print_suppressions=1" test:tsan --runs_per_test=20 -test:tsan --local_test_jobs=10 +# Keep local_test_jobs low to prevent sandbox directory collisions. +# With --runs_per_test=20, Bazel creates 20 sandbox instances per target. +# At concurrency >4, sandbox cleanup can't keep up, causing +# "Could not copy inputs into sandbox (File exists)" errors. +test:tsan --local_test_jobs=4 # -- Code Coverage -- build:coverage --collect_code_coverage @@ -134,16 +160,26 @@ build:coverage --combined_report=lcov # -- MSan: MemorySanitizer -- # Usage: bazel test --config=msan //... # Detects: reads of uninitialized memory. -# WARNING: MSan requires ALL linked libraries (including libc++) to be -# compiled with -fsanitize=memory. If you see false positives from -# std::string / std::vector, you must provide an MSan-instrumented -# libc++ via: build:msan --linkopt=-stdlib=libc++ (custom build). +# MSan requires -stdlib=libc++ because the MSan runtime has interceptors +# for libc++ but NOT for libstdc++. 
Without libc++, googletest itself +# triggers false positives before any DCodeX code runs. +# Requires: libc++-dev and libc++abi-dev (installed by dcodex-setup.sh). +# +# --spawn_strategy=standalone: libc++ headers (e.g. /usr/lib/llvm-19/include/c++/v1/) +# are not in Bazel's auto-detected cxx_builtin_include_directories (which are +# detected using libstdc++). The sandbox include validator rejects these as +# "absolute path inclusion" errors. Standalone bypasses the sandbox, which is +# appropriate since MSan inherently depends on system-installed libc++. build:msan --config=sanitizer_common +build:msan --spawn_strategy=standalone build:msan --copt=-fsanitize=memory build:msan --copt=-fsanitize-memory-track-origins=2 build:msan --linkopt=-fsanitize=memory build:msan --copt=-DMEMORY_SANITIZER -test:msan --action_env=MSAN_OPTIONS="halt_on_error=1:exitcode=77" +build:msan --copt=-stdlib=libc++ +build:msan --linkopt=-stdlib=libc++ +build:msan --linkopt=-lc++abi +test:msan --action_env=MSAN_OPTIONS="halt_on_error=1:exitcode=77:suppressions=.github/workflows/msan_suppressions.txt" # -- ASan + UBSan: AddressSanitizer + UndefinedBehaviorSanitizer -- # Usage: bazel test --config=asan //... diff --git a/dcodex-setup.sh b/dcodex-setup.sh index fc60b5a..d48ae63 100755 --- a/dcodex-setup.sh +++ b/dcodex-setup.sh @@ -113,6 +113,20 @@ version_gte() { [[ "$(printf '%s\n%s' "$2" "$1" | sort -V | head -n1)" == "$2" ]] } +# Remove stale sandbox working directories. Leftover sandbox state from +# a previous build/test (Ctrl+C, OOM kill, crash, or Bazel's own async +# cleanup not finishing in time) causes "Could not copy inputs into +# sandbox … (File exists)" on the next run. This is cheap (~instant) +# and only removes sandbox working dirs — disk cache & repo cache are +# untouched. Called once at startup AND before every `bazel test` +# invocation so no stale state ever leaks across sanitizer suites. 
+purge_sandbox_dirs() { + local sandbox_dir="${REPO_DIR}/.bazel/output_base/sandbox" + if [[ -d "$sandbox_dir" ]]; then + rm -rf "$sandbox_dir" + fi +} + # ───────────────────────────────────────────────────────────────────────────── # STEP 1 — Pre-flight checks # ───────────────────────────────────────────────────────────────────────────── @@ -191,8 +205,10 @@ http://apt.llvm.org/${UBUNTU_CODENAME}/ llvm-toolchain-${UBUNTU_CODENAME}-${LLVM DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ "clang-${LLVM_VERSION}" \ "lld-${LLVM_VERSION}" \ + "libc++-${LLVM_VERSION}-dev" \ + "libc++abi-${LLVM_VERSION}-dev" \ 2>/dev/null - ok "LLVM ${LLVM_VERSION} installed" + ok "LLVM ${LLVM_VERSION} installed (includes libc++ for MSan)" fi # ── Sanitizer runtime headers (needed for --config=asan/msan/tsan) ─────── @@ -206,6 +222,18 @@ http://apt.llvm.org/${UBUNTU_CODENAME}/ llvm-toolchain-${UBUNTU_CODENAME}-${LLVM || warn "libclang-rt-${LLVM_VERSION}-dev not available — sanitizer builds may fail" ok "Sanitizer runtime headers installed" + # ── libc++ (required for MSan — see .bazelrc msan config) ──────────── + # MSan needs -stdlib=libc++ because its runtime has interceptors for + # libc++ but not libstdc++. Without libc++ installed, the C++ + # standard headers are missing and compilation fails. + info "Ensuring libc++ is installed (required for MSan)..." + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + "libc++-${LLVM_VERSION}-dev" \ + "libc++abi-${LLVM_VERSION}-dev" \ + 2>/dev/null \ + || warn "libc++ packages not available — MSan builds will fail" + ok "libc++ installed" + # ── Symlinks ───────────────────────────────────────────────────────────── info "Creating LLVM symlinks..." ln -sf "/usr/bin/clang-${LLVM_VERSION}" /usr/bin/clang @@ -309,16 +337,8 @@ else ok "Skipping bazel clean (incremental build — disk cache preserved)" fi -# Always purge stale sandbox directories. 
If a previous build was interrupted -# (Ctrl+C, OOM kill, crash), leftover files cause "File exists" errors on the -# next run. This is cheap (~instant) and only removes sandbox working dirs — -# the disk cache and repo cache are untouched. -if [[ -d "${REPO_DIR}/.bazel/output_base/sandbox" ]]; then - rm -rf "${REPO_DIR}/.bazel/output_base/sandbox" - ok "Purged stale sandbox directories" -else - ok "No stale sandbox directories to clean" -fi +purge_sandbox_dirs +ok "Sandbox directories clean" timer @@ -439,6 +459,10 @@ run_sanitizer_suite() { info "Running ${config_name} tests: ${targets[*]}" + # Purge sandbox dirs from previous suite so no stale state leaks across + # sanitizer configurations (asan → tsan → msan). + purge_sandbox_dirs + # Timestamp file for dump_test_logs() to find only fresh logs. touch /tmp/dcodex-test-ts-"$config_name" @@ -474,23 +498,26 @@ if [[ "$MODE" == "test" ]]; then run_sanitizer_suite asan "${ENGINE_TESTS[@]}" || TEST_STATUS_ASAN=$? # ── MSan ────────────────────────────────────────────────────────────── - step "6b/7 MSan Tests (SKIPPED)" - warn "MSan requires ALL linked libraries (including system libstdc++) to be" - warn " compiled with -fsanitize=memory. The system libstdc++ is NOT instrumented," - warn " causing false positives in googletest before any DCodeX code runs." - warn " → Matching CI decision: MSan is excluded from automated testing." - warn " → To run MSan, build a custom toolchain with instrumented libc++." - warn " → See: https://clang.llvm.org/docs/MemorySanitizer.html#handling-external-code" - info "MSan skipped (set RUN_MSAN=1 to force-run with suppressions)" - - if [[ "${RUN_MSAN:-0}" == "1" ]]; then - warn "RUN_MSAN=1 — running MSan anyway (expect false positives)..." + # MSan is gated behind RUN_MSAN=1 because it requires ALL linked libraries + # (including libc++) to be compiled with -fsanitize=memory. 
The system + # libc++ installed via apt is NOT instrumented, producing false positives + # in googletest and abseil internals. To run MSan properly, build a custom + # LLVM toolchain with an MSan-instrumented libc++ (see: + # https://clang.llvm.org/docs/MemorySanitizer.html#handling-external-code). + RUN_MSAN="${RUN_MSAN:-0}" + MSAN_SKIPPED=0 + step "6b/7 MSan Tests" + if [[ "$RUN_MSAN" == "1" ]]; then + # MSan targets exclude sandbox_test (spawns uninstrumented clang++/python3 + # subprocesses which produce false positives) and tsan_checker (TSan-specific). MSAN_TARGETS=( "//src/engine:warm_worker_pool_test" "//src/engine:dynamic_worker_coordinator_test" - "//src/engine:tsan_checker" ) run_sanitizer_suite msan "${MSAN_TARGETS[@]}" || TEST_STATUS_MSAN=$? + else + MSAN_SKIPPED=1 + warn "MSan tests SKIPPED (set RUN_MSAN=1 to enable — requires instrumented libc++)" fi # ── TSan ────────────────────────────────────────────────────────────── @@ -508,6 +535,7 @@ if [[ "$MODE" == "test" ]]; then # The sandbox_test forks clang++ which has high memory overhead under TSan. if [[ $TEST_STATUS_TSAN -eq 0 ]]; then info "Running sandbox_test under TSan (constrained: 1 job, 1 run)..." 
+ purge_sandbox_dirs touch /tmp/dcodex-test-ts-tsan-sandbox set +e bazel "${BAZEL_JVM_FLAGS[@]}" test \ @@ -537,18 +565,18 @@ if [[ "$MODE" == "test" ]]; then echo "" echo -e "${BOLD}${CYAN}━━━ Test Summary ━━━${NC}" echo -e " ASan + UBSan: $(if [[ $TEST_STATUS_ASAN -eq 0 ]]; then echo -e "${GREEN}PASS${NC}"; else echo -e "${RED}FAIL (exit $TEST_STATUS_ASAN)${NC}"; fi)" - echo -e " MSan: $(if [[ "${RUN_MSAN:-0}" == "1" ]]; then if [[ $TEST_STATUS_MSAN -eq 0 ]]; then echo -e "${GREEN}PASS${NC}"; else echo -e "${RED}FAIL (exit $TEST_STATUS_MSAN)${NC}"; fi; else echo -e "${YELLOW}SKIPPED${NC}"; fi)" + echo -e " MSan: $(if [[ $MSAN_SKIPPED -eq 1 ]]; then echo -e "${YELLOW}SKIP${NC}"; elif [[ $TEST_STATUS_MSAN -eq 0 ]]; then echo -e "${GREEN}PASS${NC}"; else echo -e "${RED}FAIL (exit $TEST_STATUS_MSAN)${NC}"; fi)" echo -e " TSan: $(if [[ $TEST_STATUS_TSAN -eq 0 ]]; then echo -e "${GREEN}PASS${NC}"; else echo -e "${RED}FAIL (exit $TEST_STATUS_TSAN)${NC}"; fi)" echo -e " Duration: $(( TEST_END - TEST_START ))s" - echo -e " Logs: /tmp/dcodex-test-{asan,tsan}.log" + echo -e " Logs: /tmp/dcodex-test-{asan,msan,tsan}.log" echo "" - if [[ $TEST_STATUS_ASAN -eq 0 && $TEST_STATUS_TSAN -eq 0 ]]; then + if [[ $TEST_STATUS_ASAN -eq 0 && ($TEST_STATUS_MSAN -eq 0 || $MSAN_SKIPPED -eq 1) && $TEST_STATUS_TSAN -eq 0 ]]; then ok "All active test suites passed in $(( TEST_END - TEST_START ))s" else FAILED_SUITES="" [[ $TEST_STATUS_ASAN -ne 0 ]] && FAILED_SUITES+="asan " - [[ $TEST_STATUS_MSAN -ne 0 ]] && FAILED_SUITES+="msan " + [[ $TEST_STATUS_MSAN -ne 0 && $MSAN_SKIPPED -eq 0 ]] && FAILED_SUITES+="msan " [[ $TEST_STATUS_TSAN -ne 0 ]] && FAILED_SUITES+="tsan " die "Tests FAILED: ${FAILED_SUITES}— see diagnostic output above and /tmp/dcodex-test-*.log" fi