From 512600703557530897e10f27faad12fae28551a2 Mon Sep 17 00:00:00 2001 From: "randomizedcoder dave.seddon.ca@gmail.com" Date: Wed, 18 Mar 2026 19:27:16 -0700 Subject: [PATCH] nix: add static analysis infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port static analysis framework from the reference Nix implementation, adapted for XDP2's C codebase and Make-based build system. 8 analysis tools at 3 levels: - quick: clang-tidy + cppcheck - standard: + flawfinder, clang-analyzer, gcc-warnings - deep: + gcc-analyzer, semgrep, sanitizers Compilation database generated by parsing `make V=1 VERBOSE=1` output with a custom Python script, since bear's LD_PRELOAD fails in the Nix sandbox and compiledb doesn't recognize Nix wrapper compiler paths. Python triage system aggregates, deduplicates, and prioritizes findings across all tools. Exemptions documented in EXEMPTIONS.md cover cppcheck tool limitations (macro parsing, void pointer arithmetic, container_of patterns) and high-volume style checks (narrowing conversions, reserved identifiers, assignment-in-if) that are intentional in C networking code. 
Results (analysis-quick): clang-tidy: 18,108 raw → 3,653 after triage cppcheck: 202 raw triage: 14 high-confidence findings Usage: nix build .#analysis-quick nix build .#analysis-standard nix build .#analysis-deep make analysis Co-Authored-By: Claude Opus 4.6 --- Makefile | 38 ++++ flake.nix | 27 +++ nix/analysis/clang-analyzer.nix | 197 ++++++++++++++++ nix/analysis/clang-tidy.nix | 48 ++++ nix/analysis/compile-db.nix | 259 ++++++++++++++++++++++ nix/analysis/cppcheck.nix | 55 +++++ nix/analysis/default.nix | 182 +++++++++++++++ nix/analysis/flawfinder.nix | 40 ++++ nix/analysis/gcc.nix | 215 ++++++++++++++++++ nix/analysis/sanitizers.nix | 207 +++++++++++++++++ nix/analysis/semgrep-rules.yaml | 204 +++++++++++++++++ nix/analysis/semgrep.nix | 59 +++++ nix/analysis/triage/EXEMPTIONS.md | 151 +++++++++++++ nix/analysis/triage/__main__.py | 71 ++++++ nix/analysis/triage/filters.py | 107 +++++++++ nix/analysis/triage/finding.py | 44 ++++ nix/analysis/triage/parsers/__init__.py | 49 ++++ nix/analysis/triage/parsers/clang.py | 36 +++ nix/analysis/triage/parsers/cppcheck.py | 35 +++ nix/analysis/triage/parsers/flawfinder.py | 37 ++++ nix/analysis/triage/parsers/semgrep.py | 38 ++++ nix/analysis/triage/reports.py | 170 ++++++++++++++ nix/analysis/triage/scoring.py | 171 ++++++++++++++ nix/packages.nix | 6 + 24 files changed, 2446 insertions(+) create mode 100644 nix/analysis/clang-analyzer.nix create mode 100644 nix/analysis/clang-tidy.nix create mode 100644 nix/analysis/compile-db.nix create mode 100644 nix/analysis/cppcheck.nix create mode 100644 nix/analysis/default.nix create mode 100644 nix/analysis/flawfinder.nix create mode 100644 nix/analysis/gcc.nix create mode 100644 nix/analysis/sanitizers.nix create mode 100644 nix/analysis/semgrep-rules.yaml create mode 100644 nix/analysis/semgrep.nix create mode 100644 nix/analysis/triage/EXEMPTIONS.md create mode 100644 nix/analysis/triage/__main__.py create mode 100644 nix/analysis/triage/filters.py create mode 100644 
nix/analysis/triage/finding.py create mode 100644 nix/analysis/triage/parsers/__init__.py create mode 100644 nix/analysis/triage/parsers/clang.py create mode 100644 nix/analysis/triage/parsers/cppcheck.py create mode 100644 nix/analysis/triage/parsers/flawfinder.py create mode 100644 nix/analysis/triage/parsers/semgrep.py create mode 100644 nix/analysis/triage/reports.py create mode 100644 nix/analysis/triage/scoring.py diff --git a/Makefile b/Makefile index db3bcf9..6260419 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,7 @@ .PHONY: aarch64 aarch64-debug aarch64-samples aarch64-tests test-aarch64 test-aarch64-vm .PHONY: vm-x86 vm-aarch64 vm-riscv64 vm-test-all .PHONY: deb deb-x86 +.PHONY: analysis analysis-quick analysis-standard analysis-deep .PHONY: dev shell check eval # Default target @@ -75,6 +76,12 @@ help: @echo " make vm-riscv64 Build RISC-V MicroVM -> result-vm-riscv64/" @echo " make vm-test-all Run full VM lifecycle tests (all architectures)" @echo "" + @echo "=== Static Analysis ===" + @echo " make analysis Run quick static analysis (alias for analysis-quick)" + @echo " make analysis-quick Run clang-tidy + cppcheck" + @echo " make analysis-standard Run + flawfinder, clang-analyzer, gcc-warnings" + @echo " make analysis-deep Run all 8 tools including gcc-analyzer, semgrep, sanitizers" + @echo "" @echo "=== Packaging ===" @echo " make deb Build Debian package -> result-deb/" @echo "" @@ -280,6 +287,33 @@ deb: deb-x86: deb +# ============================================================================= +# Static Analysis +# ============================================================================= + +# Quick analysis: clang-tidy + cppcheck +analysis: analysis-quick + +analysis-quick: + @echo "Running quick static analysis (clang-tidy + cppcheck)..." 
+ nix build .#analysis-quick -o result-analysis-quick + @echo "" + @cat result-analysis-quick/summary.txt + +# Standard analysis: + flawfinder, clang-analyzer, gcc-warnings +analysis-standard: + @echo "Running standard static analysis..." + nix build .#analysis-standard -o result-analysis-standard + @echo "" + @cat result-analysis-standard/summary.txt + +# Deep analysis: all 8 tools +analysis-deep: + @echo "Running deep static analysis (all tools)..." + nix build .#analysis-deep -o result-analysis-deep + @echo "" + @cat result-analysis-deep/summary.txt + # ============================================================================= # Development # ============================================================================= @@ -313,6 +347,10 @@ eval: nix eval .#xdp2-debug-aarch64 --apply 'x: x.name' 2>/dev/null && echo " xdp2-debug-aarch64: OK" || echo " xdp2-debug-aarch64: FAIL" nix eval .#prebuilt-samples-aarch64 --apply 'x: x.name' 2>/dev/null && echo " prebuilt-samples-aarch64: OK" || echo " prebuilt-samples-aarch64: FAIL" nix eval .#aarch64-tests.all --apply 'x: x.name' 2>/dev/null && echo " aarch64-tests.all: OK" || echo " aarch64-tests.all: FAIL" + @echo "Analysis:" + nix eval .#analysis-quick --apply 'x: x.name' 2>/dev/null && echo " analysis-quick: OK" || echo " analysis-quick: FAIL" + nix eval .#analysis-standard --apply 'x: x.name' 2>/dev/null && echo " analysis-standard: OK" || echo " analysis-standard: FAIL" + nix eval .#analysis-deep --apply 'x: x.name' 2>/dev/null && echo " analysis-deep: OK" || echo " analysis-deep: FAIL" @echo "" @echo "All evaluations completed." 
diff --git a/flake.nix b/flake.nix index b459f85..9203270 100644 --- a/flake.nix +++ b/flake.nix @@ -111,6 +111,15 @@ xdp2 = xdp2-debug; # Tests use debug build with assertions }; + # ===================================================================== + # Static Analysis Infrastructure + # Ported from reference implementation, adapted for C/Make build system + # ===================================================================== + analysis = import ./nix/analysis { + inherit pkgs lib llvmConfig packagesModule; + src = ./.; + }; + # ===================================================================== # Phase 1: Packaging (x86_64 .deb only) # See: documentation/nix/microvm-implementation-phase1.md @@ -172,6 +181,24 @@ # Usage: nix run .#run-sample-tests inherit run-sample-tests; + # =================================================================== + # Static Analysis + # Usage: nix build .#analysis-quick + # nix build .#analysis-standard + # nix build .#analysis-deep + # =================================================================== + analysis-quick = analysis.quick; + analysis-standard = analysis.standard; + analysis-deep = analysis.deep; + analysis-clang-tidy = analysis.clang-tidy; + analysis-cppcheck = analysis.cppcheck; + analysis-flawfinder = analysis.flawfinder; + analysis-clang-analyzer = analysis.clang-analyzer; + analysis-gcc-warnings = analysis.gcc-warnings; + analysis-gcc-analyzer = analysis.gcc-analyzer; + analysis-semgrep = analysis.semgrep; + analysis-sanitizers = analysis.sanitizers; + # =================================================================== # Phase 1: Packaging outputs (x86_64 .deb only) # See: documentation/nix/microvm-implementation-phase1.md diff --git a/nix/analysis/clang-analyzer.nix b/nix/analysis/clang-analyzer.nix new file mode 100644 index 0000000..19b59f3 --- /dev/null +++ b/nix/analysis/clang-analyzer.nix @@ -0,0 +1,197 @@ +# nix/analysis/clang-analyzer.nix +# +# Clang Static Analyzer (scan-build) for XDP2's C 
codebase. +# +# Adapted from the reference C++ implementation: +# - Uses C-specific checkers (core.*, security.*, unix.*, alpha.security.*) +# - No C++ checkers (cplusplus.*, alpha.cplusplus.*) +# - Builds via Make instead of Meson+Ninja +# + +{ + lib, + pkgs, + src, + llvmConfig, + nativeBuildInputs, + buildInputs, +}: + +let + llvmPackages = llvmConfig.llvmPackages; + hostPkgs = pkgs.buildPackages; + hostCC = hostPkgs.stdenv.cc; + hostPython = hostPkgs.python3.withPackages (p: [ p.scapy ]); + + host-gcc = hostPkgs.writeShellScript "host-gcc" '' + exec ${hostCC}/bin/gcc \ + -I${hostPkgs.boost.dev}/include \ + -I${hostPkgs.libpcap}/include \ + -L${hostPkgs.boost}/lib \ + -L${hostPkgs.libpcap.lib}/lib \ + "$@" + ''; + + host-gxx = hostPkgs.writeShellScript "host-g++" '' + exec ${hostCC}/bin/g++ \ + -I${hostPkgs.boost.dev}/include \ + -I${hostPkgs.libpcap}/include \ + -I${hostPython}/include/python3.13 \ + -L${hostPkgs.boost}/lib \ + -L${hostPkgs.libpcap.lib}/lib \ + -L${hostPython}/lib \ + -Wl,-rpath,${hostPython}/lib \ + "$@" + ''; + + scanBuildCheckers = lib.concatStringsSep " " [ + "-enable-checker core.NullDereference" + "-enable-checker core.DivideZero" + "-enable-checker core.UndefinedBinaryOperatorResult" + "-enable-checker core.uninitialized.Assign" + "-enable-checker security.FloatLoopCounter" + "-enable-checker security.insecureAPI.getpw" + "-enable-checker security.insecureAPI.gets" + "-enable-checker security.insecureAPI.vfork" + "-enable-checker unix.Malloc" + "-enable-checker unix.MallocSizeof" + "-enable-checker unix.MismatchedDeallocator" + "-enable-checker alpha.security.ArrayBoundV2" + "-enable-checker alpha.unix.SimpleStream" + ]; + +in +pkgs.stdenv.mkDerivation { + pname = "xdp2-analysis-clang-analyzer"; + version = "0.1.0"; + inherit src; + + nativeBuildInputs = nativeBuildInputs ++ [ + pkgs.clang-analyzer + ]; + inherit buildInputs; + + hardeningDisable = [ "all" ]; + dontFixup = true; + doCheck = false; + + HOST_CC = "${hostCC}/bin/gcc"; + 
HOST_CXX = "${hostCC}/bin/g++"; + HOST_LLVM_CONFIG = "${llvmConfig.llvm-config-wrapped}/bin/llvm-config"; + XDP2_CLANG_VERSION = llvmConfig.version; + XDP2_CLANG_RESOURCE_PATH = llvmConfig.paths.clangResourceDir; + + LD_LIBRARY_PATH = lib.makeLibraryPath [ + llvmPackages.llvm + llvmPackages.libclang.lib + hostPkgs.boost + ]; + + postPatch = '' + substituteInPlace thirdparty/cppfront/Makefile \ + --replace-fail 'include ../../src/config.mk' '# config.mk not needed for standalone build' + + sed -i '1i#include \n#include \n' thirdparty/cppfront/include/cpp2util.h + + substituteInPlace src/configure.sh \ + --replace-fail 'CC_GCC="gcc"' 'CC_GCC="''${CC_GCC:-gcc}"' \ + --replace-fail 'CC_CXX="g++"' 'CC_CXX="''${CC_CXX:-g++}"' + ''; + + configurePhase = '' + runHook preConfigure + + cd src + + export PATH="${hostCC}/bin:${hostPython}/bin:$PATH" + export CC_GCC="${host-gcc}" + export CC_CXX="${host-gxx}" + export CC="${host-gcc}" + export CXX="${host-gxx}" + export PKG_CONFIG_PATH="${hostPython}/lib/pkgconfig:$PKG_CONFIG_PATH" + export HOST_CC="$CC" + export HOST_CXX="$CXX" + export HOST_LLVM_CONFIG="${llvmConfig.llvm-config-wrapped}/bin/llvm-config" + export XDP2_CLANG_VERSION="${llvmConfig.version}" + export XDP2_CLANG_RESOURCE_PATH="${llvmConfig.paths.clangResourceDir}" + export XDP2_C_INCLUDE_PATH="${llvmConfig.paths.clangResourceDir}/include" + export CONFIGURE_DEBUG_LEVEL=7 + + bash configure.sh --build-opt-parser + + if grep -q 'PATH_ARG="--with-path=' config.mk; then + sed -i 's|PATH_ARG="--with-path=.*"|PATH_ARG=""|' config.mk + fi + + sed -i 's|^HOST_CC := gcc$|HOST_CC := ${host-gcc}|' config.mk + sed -i 's|^HOST_CXX := g++$|HOST_CXX := ${host-gxx}|' config.mk + echo "HOST_LDFLAGS := -L${hostPkgs.boost}/lib -Wl,-rpath,${hostPkgs.boost}/lib" >> config.mk + + cd .. 
+ + runHook postConfigure + ''; + + buildPhase = '' + runHook preBuild + + export HOST_CC="${hostCC}/bin/gcc" + export HOST_CXX="${hostCC}/bin/g++" + export HOST_LLVM_CONFIG="${llvmConfig.llvm-config-wrapped}/bin/llvm-config" + export XDP2_CLANG_VERSION="${llvmConfig.version}" + export XDP2_CLANG_RESOURCE_PATH="${llvmConfig.paths.clangResourceDir}" + export XDP2_C_INCLUDE_PATH="${llvmConfig.paths.clangResourceDir}/include" + export XDP2_GLIBC_INCLUDE_PATH="${hostPkgs.stdenv.cc.libc.dev}/include" + export XDP2_LINUX_HEADERS_PATH="${hostPkgs.linuxHeaders}/include" + + # Build cppfront first + echo "Building cppfront..." + cd thirdparty/cppfront + $HOST_CXX -std=c++20 source/cppfront.cpp -o cppfront-compiler + cd ../.. + + # Build xdp2-compiler + echo "Building xdp2-compiler..." + cd src/tools/compiler + make -j''${NIX_BUILD_CORES:-1} + cd ../../.. + + # Build xdp2 libraries wrapped with scan-build. + # Use full path to clang-analyzer's scan-build (properly wrapped with Nix shebang). + # The one from llvmPackages.clang has a broken /usr/bin/env shebang. + echo "Running scan-build on xdp2..." + cd src + ${pkgs.clang-analyzer}/bin/scan-build \ + --use-analyzer=${llvmPackages.clang}/bin/clang \ + ${scanBuildCheckers} \ + -o "$NIX_BUILD_TOP/scan-results" \ + make -j''${NIX_BUILD_CORES:-1} \ + 2>&1 | tee "$NIX_BUILD_TOP/scan-build.log" || true + cd .. + + runHook postBuild + ''; + + installPhase = '' + mkdir -p $out + + # Copy HTML reports if produced + if [ -d "$NIX_BUILD_TOP/scan-results" ] && [ "$(ls -A "$NIX_BUILD_TOP/scan-results" 2>/dev/null)" ]; then + mkdir -p $out/html-report + cp -r "$NIX_BUILD_TOP/scan-results"/* $out/html-report/ 2>/dev/null || true + fi + + # Extract finding count from scan-build output + findings=$(grep -oP '\d+ bugs? 
found' "$NIX_BUILD_TOP/scan-build.log" | grep -oP '^\d+' || echo "0") + echo "$findings" > $out/count.txt + + cp "$NIX_BUILD_TOP/scan-build.log" $out/report.txt + + { + echo "=== Clang Static Analyzer (C) ===" + echo "" + echo "Path-sensitive analysis with C-specific checkers." + echo "Findings: $findings" + } > $out/summary.txt + ''; +} diff --git a/nix/analysis/clang-tidy.nix b/nix/analysis/clang-tidy.nix new file mode 100644 index 0000000..aa30b28 --- /dev/null +++ b/nix/analysis/clang-tidy.nix @@ -0,0 +1,48 @@ +# nix/analysis/clang-tidy.nix +# +# clang-tidy runner for XDP2's C codebase. +# +# Adapted from the reference C++ implementation: +# - Finds .c and .h files instead of .cc +# - Uses C-appropriate checks (no cppcoreguidelines, modernize) +# - No custom plugin (nixTidyChecks not applicable to XDP2) +# + +{ + pkgs, + mkCompileDbReport, +}: + +let + runner = pkgs.writeShellApplication { + name = "run-clang-tidy-analysis"; + runtimeInputs = with pkgs; [ + clang-tools + coreutils + findutils + gnugrep + ]; + text = '' + compile_db="$1" + source_dir="$2" + output_dir="$3" + + echo "=== clang-tidy Analysis (C) ===" + echo "Using compilation database: $compile_db" + + # Find all .c source files in library and tool directories + find "$source_dir/src" -name '*.c' -not -path '*/test*' -print0 | \ + xargs -0 -P "$(nproc)" -I{} \ + clang-tidy \ + -p "$compile_db" \ + --header-filter='src/.*' \ + --checks='-*,bugprone-*,cert-*,clang-analyzer-*,misc-*,readability-*' \ + {} \ + > "$output_dir/report.txt" 2>&1 || true + + # grep -c prints 0 itself on no match (but exits 1), so guard with true, not a second echo + findings=$(grep -c ': warning:\|: error:' "$output_dir/report.txt" || true) + echo "$findings" > "$output_dir/count.txt" + ''; + }; +in +mkCompileDbReport "clang-tidy" runner diff --git a/nix/analysis/compile-db.nix b/nix/analysis/compile-db.nix new file mode 100644 index 0000000..7454983 --- /dev/null +++ b/nix/analysis/compile-db.nix @@ -0,0 +1,259 @@ +# nix/analysis/compile-db.nix +# +# Generate compile_commands.json for XDP2. 
+# +# Unlike the reference Nix project (which uses Meson's built-in compile DB +# generation), XDP2 uses Make. We parse `make V=1 VERBOSE=1` output directly +# because bear's LD_PRELOAD fails in the Nix sandbox, and compiledb doesn't +# recognize Nix wrapper paths as compilers. +# + +{ + pkgs, + lib, + llvmConfig, + nativeBuildInputs, + buildInputs, +}: + +let + llvmPackages = llvmConfig.llvmPackages; + hostPkgs = pkgs.buildPackages; + hostCC = hostPkgs.stdenv.cc; + hostPython = hostPkgs.python3.withPackages (p: [ p.scapy ]); + + host-gcc = hostPkgs.writeShellScript "host-gcc" '' + exec ${hostCC}/bin/gcc \ + -I${hostPkgs.boost.dev}/include \ + -I${hostPkgs.libpcap}/include \ + -L${hostPkgs.boost}/lib \ + -L${hostPkgs.libpcap.lib}/lib \ + "$@" + ''; + + host-gxx = hostPkgs.writeShellScript "host-g++" '' + exec ${hostCC}/bin/g++ \ + -I${hostPkgs.boost.dev}/include \ + -I${hostPkgs.libpcap}/include \ + -I${hostPython}/include/python3.13 \ + -L${hostPkgs.boost}/lib \ + -L${hostPkgs.libpcap.lib}/lib \ + -L${hostPython}/lib \ + -Wl,-rpath,${hostPython}/lib \ + "$@" + ''; + + # Python script to generate compile_commands.json from make build output. 
+ genCompileDbScript = pkgs.writeText "gen-compile-db.py" '' + import json, os, re, sys + + make_output = sys.argv[1] + output_file = sys.argv[2] + store_src = sys.argv[3] + source_root = sys.argv[4] + + build_prefix = "/build/" + source_root + + entries = [] + current_dir = None + + with open(make_output) as f: + raw_lines = f.readlines() + + print(f"Raw lines read: {len(raw_lines)}", file=sys.stderr) + + # Join backslash-continued lines, stripping continuation indentation + lines = [] + buf = "" + for raw in raw_lines: + stripped = raw.rstrip('\n').rstrip('\r') + if stripped.rstrip().endswith('\\'): + s = stripped.rstrip() + buf += s[:-1].rstrip() + " " + else: + if buf: + # This is a continuation line - strip leading whitespace + buf += stripped.lstrip() + else: + buf = stripped + lines.append(buf) + buf = "" + if buf: + lines.append(buf) + + print(f"Joined lines: {len(lines)}", file=sys.stderr) + + c_lines = [l for l in lines if ' -c ' in l] + print(f"Compilation lines found: {len(c_lines)}", file=sys.stderr) + + for line in lines: + # Track directory changes from make -w + m = re.match(r"make\[\d+\]: Entering directory '(.+)'", line) + if m: + current_dir = m.group(1) + continue + + # Match C/C++ compilation commands: must contain -c flag + if ' -c ' not in line: + continue + + # Find source file: last token matching *.c, *.cc, *.cpp, *.cxx + tokens = line.split() + src_file = None + for token in reversed(tokens): + if re.match(r'.*\.(?:c|cc|cpp|cxx|C)$', token): + src_file = token + break + if not src_file: + continue + + directory = current_dir or os.getcwd() + + # Normalize paths + abs_file = src_file + if not os.path.isabs(src_file): + abs_file = os.path.normpath(os.path.join(directory, src_file)) + + # Fix sandbox paths to store paths + abs_file = abs_file.replace(build_prefix, store_src) + directory = directory.replace(build_prefix, store_src) + cmd = line.strip().replace(build_prefix, store_src) + + entries.append({ + "directory": directory, + 
"command": cmd, + "file": abs_file, + }) + + with open(output_file, "w") as f: + json.dump(entries, f, indent=2) + + print(f"Generated {len(entries)} compile commands", file=sys.stderr) + ''; + +in +pkgs.stdenv.mkDerivation { + pname = "xdp2-compilation-db"; + version = "0.1.0"; + + src = ../..; + + nativeBuildInputs = nativeBuildInputs ++ [ + pkgs.buildPackages.python3 + ]; + inherit buildInputs; + + hardeningDisable = [ "all" ]; + + HOST_CC = "${hostCC}/bin/gcc"; + HOST_CXX = "${hostCC}/bin/g++"; + HOST_LLVM_CONFIG = "${llvmConfig.llvm-config-wrapped}/bin/llvm-config"; + XDP2_CLANG_VERSION = llvmConfig.version; + XDP2_CLANG_RESOURCE_PATH = llvmConfig.paths.clangResourceDir; + + LD_LIBRARY_PATH = lib.makeLibraryPath [ + llvmPackages.llvm + llvmPackages.libclang.lib + hostPkgs.boost + ]; + + dontFixup = true; + doCheck = false; + + # Replicate derivation.nix's postPatch + postPatch = '' + substituteInPlace thirdparty/cppfront/Makefile \ + --replace-fail 'include ../../src/config.mk' '# config.mk not needed for standalone build' + + sed -i '1i#include \n#include \n' thirdparty/cppfront/include/cpp2util.h + + substituteInPlace src/configure.sh \ + --replace-fail 'CC_GCC="gcc"' 'CC_GCC="''${CC_GCC:-gcc}"' \ + --replace-fail 'CC_CXX="g++"' 'CC_CXX="''${CC_CXX:-g++}"' + ''; + + # Replicate derivation.nix's configurePhase + configurePhase = '' + runHook preConfigure + + cd src + + export PATH="${hostCC}/bin:${hostPython}/bin:$PATH" + export CC_GCC="${host-gcc}" + export CC_CXX="${host-gxx}" + export CC="${host-gcc}" + export CXX="${host-gxx}" + export PKG_CONFIG_PATH="${hostPython}/lib/pkgconfig:$PKG_CONFIG_PATH" + export HOST_CC="$CC" + export HOST_CXX="$CXX" + export HOST_LLVM_CONFIG="${llvmConfig.llvm-config-wrapped}/bin/llvm-config" + export XDP2_CLANG_VERSION="${llvmConfig.version}" + export XDP2_CLANG_RESOURCE_PATH="${llvmConfig.paths.clangResourceDir}" + export XDP2_C_INCLUDE_PATH="${llvmConfig.paths.clangResourceDir}/include" + export CONFIGURE_DEBUG_LEVEL=7 + + 
bash configure.sh --build-opt-parser + + if grep -q 'PATH_ARG="--with-path=' config.mk; then + sed -i 's|PATH_ARG="--with-path=.*"|PATH_ARG=""|' config.mk + fi + + sed -i 's|^HOST_CC := gcc$|HOST_CC := ${host-gcc}|' config.mk + sed -i 's|^HOST_CXX := g++$|HOST_CXX := ${host-gxx}|' config.mk + echo "HOST_LDFLAGS := -L${hostPkgs.boost}/lib -Wl,-rpath,${hostPkgs.boost}/lib" >> config.mk + + cd .. + + runHook postConfigure + ''; + + # Build prerequisites, then use compiledb to capture compile commands + buildPhase = '' + runHook preBuild + + export HOST_CC="${hostCC}/bin/gcc" + export HOST_CXX="${hostCC}/bin/g++" + export HOST_LLVM_CONFIG="${llvmConfig.llvm-config-wrapped}/bin/llvm-config" + export XDP2_CLANG_VERSION="${llvmConfig.version}" + export XDP2_CLANG_RESOURCE_PATH="${llvmConfig.paths.clangResourceDir}" + export XDP2_C_INCLUDE_PATH="${llvmConfig.paths.clangResourceDir}/include" + export XDP2_GLIBC_INCLUDE_PATH="${hostPkgs.stdenv.cc.libc.dev}/include" + export XDP2_LINUX_HEADERS_PATH="${hostPkgs.linuxHeaders}/include" + + # Build cppfront first (needed by xdp2-compiler) + echo "Building cppfront..." + cd thirdparty/cppfront + $HOST_CXX -std=c++20 source/cppfront.cpp -o cppfront-compiler + cd ../.. + + # Build xdp2-compiler (needed for source generation) + echo "Building xdp2-compiler..." + cd src/tools/compiler + make -j''${NIX_BUILD_CORES:-1} + cd ../../.. + + # Build xdp2 with verbose output and capture all compiler invocations. + # We parse the real build output because: + # - bear's LD_PRELOAD doesn't work in Nix sandbox + # - compiledb doesn't recognize Nix wrapper paths as compilers + # Use -j1 to prevent interleaved output that breaks line continuation parsing. + # Use both V=1 and VERBOSE=1 for full command echoing. + echo "Building xdp2 libraries (capturing compile commands)..." + cd src + make V=1 VERBOSE=1 -j1 -wk 2>&1 | tee "$NIX_BUILD_TOP/make-build.log" || true + cd .. 
+ + runHook postBuild + ''; + + installPhase = '' + mkdir -p $out + + ${pkgs.buildPackages.python3}/bin/python3 \ + ${genCompileDbScript} \ + "$NIX_BUILD_TOP/make-build.log" \ + "$out/compile_commands.json" \ + "${../..}" \ + "$sourceRoot" + ''; +} diff --git a/nix/analysis/cppcheck.nix b/nix/analysis/cppcheck.nix new file mode 100644 index 0000000..04e8d69 --- /dev/null +++ b/nix/analysis/cppcheck.nix @@ -0,0 +1,55 @@ +# nix/analysis/cppcheck.nix +# +# cppcheck runner for XDP2's C codebase. +# +# Adapted from reference: uses --std=c11 instead of --std=c++20. +# + +{ + pkgs, + mkCompileDbReport, +}: + +let + runner = pkgs.writeShellApplication { + name = "run-cppcheck-analysis"; + runtimeInputs = with pkgs; [ + cppcheck + coreutils + gnugrep + ]; + text = '' + compile_db="$1" + # shellcheck disable=SC2034 + source_dir="$2" + output_dir="$3" + + echo "=== cppcheck Analysis (C) ===" + + # Use --project for compilation database (cannot combine with source args) + cppcheck \ + --project="$compile_db/compile_commands.json" \ + --enable=all \ + --std=c11 \ + --suppress=missingInclude \ + --suppress=unusedFunction \ + --suppress=unmatchedSuppression \ + --xml \ + 2> "$output_dir/report.xml" || true + + # Also produce a human-readable text report + cppcheck \ + --project="$compile_db/compile_commands.json" \ + --enable=all \ + --std=c11 \ + --suppress=missingInclude \ + --suppress=unusedFunction \ + --suppress=unmatchedSuppression \ + 2> "$output_dir/report.txt" || true + + # grep -c prints 0 itself on no match (but exits 1), so guard with true, not a second echo + findings=$(grep -c '\(error\|warning\|style\|performance\|portability\)' "$output_dir/report.txt" || true) + echo "$findings" > "$output_dir/count.txt" + ''; + }; +in +mkCompileDbReport "cppcheck" runner diff --git a/nix/analysis/default.nix b/nix/analysis/default.nix new file mode 100644 index 0000000..23bd313 --- --- /dev/null +++ b/nix/analysis/default.nix @@ -0,0 +1,182 @@ +# nix/analysis/default.nix +# +# Static analysis infrastructure entry point for XDP2. 
+# +# Ported from the reference Nix project's analysis framework, +# adapted for XDP2's C codebase and Make-based build system. +# +# Provides 8 analysis tools at 3 levels: +# quick: clang-tidy + cppcheck +# standard: + flawfinder, clang-analyzer, gcc-warnings +# deep: + gcc-analyzer, semgrep, sanitizers +# +# Usage: +# nix build .#analysis-quick +# nix build .#analysis-standard +# nix build .#analysis-deep +# + +{ + pkgs, + lib, + llvmConfig, + packagesModule, + src, +}: + +let + # ── Compilation database ──────────────────────────────────────── + + compilationDb = import ./compile-db.nix { + inherit lib pkgs llvmConfig; + inherit (packagesModule) nativeBuildInputs buildInputs; + }; + + # ── Helper for tools that need compilation database ───────────── + + mkCompileDbReport = name: script: + pkgs.runCommand "xdp2-analysis-${name}" + { + nativeBuildInputs = [ script ]; + } + '' + mkdir -p $out + ${lib.getExe script} ${compilationDb} ${src} $out + ''; + + # ── Helper for tools that work on raw source ──────────────────── + + mkSourceReport = name: script: + pkgs.runCommand "xdp2-analysis-${name}" + { + nativeBuildInputs = [ script ]; + } + '' + mkdir -p $out + ${lib.getExe script} ${src} $out + ''; + + # ── Individual tool targets ──────────────────────────────────── + + clang-tidy = import ./clang-tidy.nix { + inherit pkgs mkCompileDbReport; + }; + + cppcheck = import ./cppcheck.nix { + inherit pkgs mkCompileDbReport; + }; + + flawfinder = import ./flawfinder.nix { + inherit pkgs mkSourceReport; + }; + + semgrep = import ./semgrep.nix { + inherit pkgs mkSourceReport; + }; + + gccTargets = import ./gcc.nix { + inherit lib pkgs src llvmConfig; + inherit (packagesModule) nativeBuildInputs buildInputs; + }; + + clang-analyzer = import ./clang-analyzer.nix { + inherit lib pkgs src llvmConfig; + inherit (packagesModule) nativeBuildInputs buildInputs; + }; + + sanitizers = import ./sanitizers.nix { + inherit lib pkgs src llvmConfig; + inherit (packagesModule) 
nativeBuildInputs buildInputs; + }; + + # ── Triage system path ────────────────────────────────────────── + + triagePath = "${src}/nix/analysis/triage"; + + # ── Combined targets ─────────────────────────────────────────── + + quick = pkgs.runCommand "xdp2-analysis-quick" { nativeBuildInputs = [ pkgs.python3 ]; } '' + mkdir -p $out + ln -s ${clang-tidy} $out/clang-tidy + ln -s ${cppcheck} $out/cppcheck + python3 ${triagePath} $out --output-dir $out/triage + { + echo "=== Analysis Summary (quick) ===" + echo "" + echo "clang-tidy: $(cat ${clang-tidy}/count.txt) findings" + echo "cppcheck: $(cat ${cppcheck}/count.txt) findings" + echo "triage: $(cat $out/triage/count.txt) high-confidence findings" + echo "" + echo "Run 'nix build .#analysis-standard' for more thorough analysis." + } > $out/summary.txt + cat $out/summary.txt + ''; + + standard = pkgs.runCommand "xdp2-analysis-standard" { nativeBuildInputs = [ pkgs.python3 ]; } '' + mkdir -p $out + ln -s ${clang-tidy} $out/clang-tidy + ln -s ${cppcheck} $out/cppcheck + ln -s ${flawfinder} $out/flawfinder + ln -s ${clang-analyzer} $out/clang-analyzer + ln -s ${gccTargets.gcc-warnings} $out/gcc-warnings + python3 ${triagePath} $out --output-dir $out/triage + { + echo "=== Analysis Summary (standard) ===" + echo "" + echo "clang-tidy: $(cat ${clang-tidy}/count.txt) findings" + echo "cppcheck: $(cat ${cppcheck}/count.txt) findings" + echo "flawfinder: $(cat ${flawfinder}/count.txt) findings" + echo "clang-analyzer: $(cat ${clang-analyzer}/count.txt) findings" + echo "gcc-warnings: $(cat ${gccTargets.gcc-warnings}/count.txt) findings" + echo "triage: $(cat $out/triage/count.txt) high-confidence findings" + echo "" + echo "Run 'nix build .#analysis-deep' for full analysis including" + echo "GCC -fanalyzer, semgrep, and sanitizer builds." 
+ } > $out/summary.txt + cat $out/summary.txt + ''; + + deep = pkgs.runCommand "xdp2-analysis-deep" { nativeBuildInputs = [ pkgs.python3 ]; } '' + mkdir -p $out + ln -s ${clang-tidy} $out/clang-tidy + ln -s ${cppcheck} $out/cppcheck + ln -s ${flawfinder} $out/flawfinder + ln -s ${clang-analyzer} $out/clang-analyzer + ln -s ${gccTargets.gcc-warnings} $out/gcc-warnings + ln -s ${gccTargets.gcc-analyzer} $out/gcc-analyzer + ln -s ${semgrep} $out/semgrep + ln -s ${sanitizers} $out/sanitizers + python3 ${triagePath} $out --output-dir $out/triage + { + echo "=== Analysis Summary (deep) ===" + echo "" + echo "clang-tidy: $(cat ${clang-tidy}/count.txt) findings" + echo "cppcheck: $(cat ${cppcheck}/count.txt) findings" + echo "flawfinder: $(cat ${flawfinder}/count.txt) findings" + echo "clang-analyzer: $(cat ${clang-analyzer}/count.txt) findings" + echo "gcc-warnings: $(cat ${gccTargets.gcc-warnings}/count.txt) findings" + echo "gcc-analyzer: $(cat ${gccTargets.gcc-analyzer}/count.txt) findings" + echo "semgrep: $(cat ${semgrep}/count.txt) findings" + echo "sanitizers: $(cat ${sanitizers}/count.txt) findings" + echo "triage: $(cat $out/triage/count.txt) high-confidence findings" + echo "" + echo "All analysis tools completed." + } > $out/summary.txt + cat $out/summary.txt + ''; + +in +{ + inherit + clang-tidy + cppcheck + flawfinder + clang-analyzer + semgrep + sanitizers + quick + standard + deep + ; + inherit (gccTargets) gcc-warnings gcc-analyzer; +} diff --git a/nix/analysis/flawfinder.nix b/nix/analysis/flawfinder.nix new file mode 100644 index 0000000..c867e0d --- /dev/null +++ b/nix/analysis/flawfinder.nix @@ -0,0 +1,40 @@ +# nix/analysis/flawfinder.nix +# +# flawfinder source scanner — works equally on C and C++. +# Identical to the reference implementation. 
+# + +{ + pkgs, + mkSourceReport, +}: + +let + runner = pkgs.writeShellApplication { + name = "run-flawfinder-analysis"; + runtimeInputs = with pkgs; [ + flawfinder + coreutils + gnugrep + ]; + text = '' + source_dir="$1" + output_dir="$2" + + echo "=== flawfinder Analysis ===" + + flawfinder \ + --minlevel=1 \ + --columns \ + --context \ + --singleline \ + "$source_dir/src" \ + > "$output_dir/report.txt" 2>&1 || true + + # Extract hit count from flawfinder's summary line: "Hits = N" + findings=$(grep -oP 'Hits = \K[0-9]+' "$output_dir/report.txt" || echo "0") + echo "$findings" > "$output_dir/count.txt" + ''; + }; +in +mkSourceReport "flawfinder" runner diff --git a/nix/analysis/gcc.nix b/nix/analysis/gcc.nix new file mode 100644 index 0000000..5e68edc --- /dev/null +++ b/nix/analysis/gcc.nix @@ -0,0 +1,215 @@ +# nix/analysis/gcc.nix +# +# GCC-based analysis: gcc-warnings and gcc-analyzer. +# +# Adapted from the reference C++ implementation: +# - Uses NIX_CFLAGS_COMPILE instead of NIX_CXXFLAGS_COMPILE +# - Adds C-specific flags: -Wstrict-prototypes, -Wold-style-definition, +# -Wmissing-prototypes, -Wbad-function-cast +# - Builds via Make instead of Meson+Ninja +# + +{ + lib, + pkgs, + src, + llvmConfig, + nativeBuildInputs, + buildInputs, +}: + +let + llvmPackages = llvmConfig.llvmPackages; + hostPkgs = pkgs.buildPackages; + hostCC = hostPkgs.stdenv.cc; + hostPython = hostPkgs.python3.withPackages (p: [ p.scapy ]); + + host-gcc = hostPkgs.writeShellScript "host-gcc" '' + exec ${hostCC}/bin/gcc \ + -I${hostPkgs.boost.dev}/include \ + -I${hostPkgs.libpcap}/include \ + -L${hostPkgs.boost}/lib \ + -L${hostPkgs.libpcap.lib}/lib \ + "$@" + ''; + + host-gxx = hostPkgs.writeShellScript "host-g++" '' + exec ${hostCC}/bin/g++ \ + -I${hostPkgs.boost.dev}/include \ + -I${hostPkgs.libpcap}/include \ + -I${hostPython}/include/python3.13 \ + -L${hostPkgs.boost}/lib \ + -L${hostPkgs.libpcap.lib}/lib \ + -L${hostPython}/lib \ + -Wl,-rpath,${hostPython}/lib \ + "$@" + ''; + + 
gccWarningFlags = [ + "-Wall" + "-Wextra" + "-Wpedantic" + "-Wformat=2" + "-Wformat-security" + "-Wshadow" + "-Wcast-qual" + "-Wcast-align" + "-Wwrite-strings" + "-Wpointer-arith" + "-Wconversion" + "-Wsign-conversion" + "-Wduplicated-cond" + "-Wduplicated-branches" + "-Wlogical-op" + "-Wnull-dereference" + "-Wdouble-promotion" + "-Wfloat-equal" + "-Walloca" + "-Wvla" + "-Werror=return-type" + "-Werror=format-security" + # C-specific warnings + "-Wstrict-prototypes" + "-Wold-style-definition" + "-Wmissing-prototypes" + "-Wbad-function-cast" + ]; + + mkGccAnalysisBuild = name: extraFlags: + pkgs.stdenv.mkDerivation { + pname = "xdp2-analysis-${name}"; + version = "0.1.0"; + inherit src; + + inherit nativeBuildInputs buildInputs; + + hardeningDisable = [ "all" ]; + + env.NIX_CFLAGS_COMPILE = lib.concatStringsSep " " extraFlags; + + HOST_CC = "${hostCC}/bin/gcc"; + HOST_CXX = "${hostCC}/bin/g++"; + HOST_LLVM_CONFIG = "${llvmConfig.llvm-config-wrapped}/bin/llvm-config"; + XDP2_CLANG_VERSION = llvmConfig.version; + XDP2_CLANG_RESOURCE_PATH = llvmConfig.paths.clangResourceDir; + + LD_LIBRARY_PATH = lib.makeLibraryPath [ + llvmPackages.llvm + llvmPackages.libclang.lib + hostPkgs.boost + ]; + + dontFixup = true; + doCheck = false; + + postPatch = '' + substituteInPlace thirdparty/cppfront/Makefile \ + --replace-fail 'include ../../src/config.mk' '# config.mk not needed for standalone build' + + sed -i '1i#include \n#include \n' thirdparty/cppfront/include/cpp2util.h + + substituteInPlace src/configure.sh \ + --replace-fail 'CC_GCC="gcc"' 'CC_GCC="''${CC_GCC:-gcc}"' \ + --replace-fail 'CC_CXX="g++"' 'CC_CXX="''${CC_CXX:-g++}"' + ''; + + configurePhase = '' + runHook preConfigure + + cd src + + export PATH="${hostCC}/bin:${hostPython}/bin:$PATH" + export CC_GCC="${host-gcc}" + export CC_CXX="${host-gxx}" + export CC="${host-gcc}" + export CXX="${host-gxx}" + export PKG_CONFIG_PATH="${hostPython}/lib/pkgconfig:$PKG_CONFIG_PATH" + export HOST_CC="$CC" + export HOST_CXX="$CXX" 
+ export HOST_LLVM_CONFIG="${llvmConfig.llvm-config-wrapped}/bin/llvm-config" + export XDP2_CLANG_VERSION="${llvmConfig.version}" + export XDP2_CLANG_RESOURCE_PATH="${llvmConfig.paths.clangResourceDir}" + export XDP2_C_INCLUDE_PATH="${llvmConfig.paths.clangResourceDir}/include" + export CONFIGURE_DEBUG_LEVEL=7 + + bash configure.sh --build-opt-parser + + if grep -q 'PATH_ARG="--with-path=' config.mk; then + sed -i 's|PATH_ARG="--with-path=.*"|PATH_ARG=""|' config.mk + fi + + sed -i 's|^HOST_CC := gcc$|HOST_CC := ${host-gcc}|' config.mk + sed -i 's|^HOST_CXX := g++$|HOST_CXX := ${host-gxx}|' config.mk + echo "HOST_LDFLAGS := -L${hostPkgs.boost}/lib -Wl,-rpath,${hostPkgs.boost}/lib" >> config.mk + + cd .. + + runHook postConfigure + ''; + + buildPhase = '' + runHook preBuild + + export HOST_CC="${hostCC}/bin/gcc" + export HOST_CXX="${hostCC}/bin/g++" + export HOST_LLVM_CONFIG="${llvmConfig.llvm-config-wrapped}/bin/llvm-config" + export XDP2_CLANG_VERSION="${llvmConfig.version}" + export XDP2_CLANG_RESOURCE_PATH="${llvmConfig.paths.clangResourceDir}" + export XDP2_C_INCLUDE_PATH="${llvmConfig.paths.clangResourceDir}/include" + export XDP2_GLIBC_INCLUDE_PATH="${hostPkgs.stdenv.cc.libc.dev}/include" + export XDP2_LINUX_HEADERS_PATH="${hostPkgs.linuxHeaders}/include" + + # Build cppfront first + echo "Building cppfront..." + cd thirdparty/cppfront + $HOST_CXX -std=c++20 source/cppfront.cpp -o cppfront-compiler + cd ../.. + + # Build xdp2-compiler + echo "Building xdp2-compiler..." + cd src/tools/compiler + make -j''${NIX_BUILD_CORES:-1} + cd ../../.. + + # Build xdp2 libraries and capture all compiler output + echo "Building xdp2 with ${name} flags..." + cd src + make -j''${NIX_BUILD_CORES:-1} 2>&1 | tee "$NIX_BUILD_TOP/build-output.log" || true + cd .. 
+ + runHook postBuild + ''; + + installPhase = '' + mkdir -p $out + # Extract warning/error lines from the build output + grep -E ': warning:|: error:' "$NIX_BUILD_TOP/build-output.log" > $out/report.txt || true + findings=$(wc -l < $out/report.txt) + echo "$findings" > $out/count.txt + + # Include full build log for reference + cp "$NIX_BUILD_TOP/build-output.log" $out/full-build.log + + { + echo "=== ${name} Analysis ===" + echo "" + echo "Flags: ${lib.concatStringsSep " " extraFlags}" + echo "Findings: $findings warnings/errors" + if [ "$findings" -gt 0 ]; then + echo "" + echo "=== Warnings ===" + cat $out/report.txt + fi + } > $out/summary.txt + ''; + }; + +in +{ + gcc-warnings = mkGccAnalysisBuild "gcc-warnings" gccWarningFlags; + + gcc-analyzer = mkGccAnalysisBuild "gcc-analyzer" [ + "-fanalyzer" + "-fdiagnostics-plain-output" + ]; +} diff --git a/nix/analysis/sanitizers.nix b/nix/analysis/sanitizers.nix new file mode 100644 index 0000000..1fd101e --- /dev/null +++ b/nix/analysis/sanitizers.nix @@ -0,0 +1,207 @@ +# nix/analysis/sanitizers.nix +# +# ASan + UBSan instrumented build and test execution. +# +# Unlike the reference (which uses nixComponents.overrideScope), XDP2 +# builds with Make. We build with sanitizer flags and run sample tests +# to detect runtime violations. 
+# + +{ + lib, + pkgs, + src, + llvmConfig, + nativeBuildInputs, + buildInputs, +}: + +let + llvmPackages = llvmConfig.llvmPackages; + hostPkgs = pkgs.buildPackages; + hostCC = hostPkgs.stdenv.cc; + hostPython = hostPkgs.python3.withPackages (p: [ p.scapy ]); + + host-gcc = hostPkgs.writeShellScript "host-gcc" '' + exec ${hostCC}/bin/gcc \ + -I${hostPkgs.boost.dev}/include \ + -I${hostPkgs.libpcap}/include \ + -L${hostPkgs.boost}/lib \ + -L${hostPkgs.libpcap.lib}/lib \ + "$@" + ''; + + host-gxx = hostPkgs.writeShellScript "host-g++" '' + exec ${hostCC}/bin/g++ \ + -I${hostPkgs.boost.dev}/include \ + -I${hostPkgs.libpcap}/include \ + -I${hostPython}/include/python3.13 \ + -L${hostPkgs.boost}/lib \ + -L${hostPkgs.libpcap.lib}/lib \ + -L${hostPython}/lib \ + -Wl,-rpath,${hostPython}/lib \ + "$@" + ''; + +in +pkgs.stdenv.mkDerivation { + pname = "xdp2-analysis-sanitizers"; + version = "0.1.0"; + inherit src; + + inherit nativeBuildInputs buildInputs; + + hardeningDisable = [ "all" ]; + + HOST_CC = "${hostCC}/bin/gcc"; + HOST_CXX = "${hostCC}/bin/g++"; + HOST_LLVM_CONFIG = "${llvmConfig.llvm-config-wrapped}/bin/llvm-config"; + XDP2_CLANG_VERSION = llvmConfig.version; + XDP2_CLANG_RESOURCE_PATH = llvmConfig.paths.clangResourceDir; + + LD_LIBRARY_PATH = lib.makeLibraryPath [ + llvmPackages.llvm + llvmPackages.libclang.lib + hostPkgs.boost + ]; + + # NOTE: Sanitizer flags are NOT applied via NIX_CFLAGS_COMPILE because + # that would break configure.sh's link tests. Instead, we inject them + # into config.mk CFLAGS/LDFLAGS after configure completes. 
+ + dontFixup = true; + + postPatch = '' + substituteInPlace thirdparty/cppfront/Makefile \ + --replace-fail 'include ../../src/config.mk' '# config.mk not needed for standalone build' + + sed -i '1i#include \n#include \n' thirdparty/cppfront/include/cpp2util.h + + substituteInPlace src/configure.sh \ + --replace-fail 'CC_GCC="gcc"' 'CC_GCC="''${CC_GCC:-gcc}"' \ + --replace-fail 'CC_CXX="g++"' 'CC_CXX="''${CC_CXX:-g++}"' + ''; + + configurePhase = '' + runHook preConfigure + + cd src + + export PATH="${hostCC}/bin:${hostPython}/bin:$PATH" + export CC_GCC="${host-gcc}" + export CC_CXX="${host-gxx}" + export CC="${host-gcc}" + export CXX="${host-gxx}" + export PKG_CONFIG_PATH="${hostPython}/lib/pkgconfig:$PKG_CONFIG_PATH" + export HOST_CC="$CC" + export HOST_CXX="$CXX" + export HOST_LLVM_CONFIG="${llvmConfig.llvm-config-wrapped}/bin/llvm-config" + export XDP2_CLANG_VERSION="${llvmConfig.version}" + export XDP2_CLANG_RESOURCE_PATH="${llvmConfig.paths.clangResourceDir}" + export XDP2_C_INCLUDE_PATH="${llvmConfig.paths.clangResourceDir}/include" + export CONFIGURE_DEBUG_LEVEL=7 + + bash configure.sh --build-opt-parser + + if grep -q 'PATH_ARG="--with-path=' config.mk; then + sed -i 's|PATH_ARG="--with-path=.*"|PATH_ARG=""|' config.mk + fi + + sed -i 's|^HOST_CC := gcc$|HOST_CC := ${host-gcc}|' config.mk + sed -i 's|^HOST_CXX := g++$|HOST_CXX := ${host-gxx}|' config.mk + echo "HOST_LDFLAGS := -L${hostPkgs.boost}/lib -Wl,-rpath,${hostPkgs.boost}/lib" >> config.mk + + # Inject sanitizer flags into config.mk AFTER configure completes + echo "EXTRA_CFLAGS += -fsanitize=address,undefined -fno-omit-frame-pointer" >> config.mk + echo "LDFLAGS += -fsanitize=address,undefined" >> config.mk + + cd .. 
+ + runHook postConfigure + ''; + + buildPhase = '' + runHook preBuild + + export HOST_CC="${hostCC}/bin/gcc" + export HOST_CXX="${hostCC}/bin/g++" + export HOST_LLVM_CONFIG="${llvmConfig.llvm-config-wrapped}/bin/llvm-config" + export XDP2_CLANG_VERSION="${llvmConfig.version}" + export XDP2_CLANG_RESOURCE_PATH="${llvmConfig.paths.clangResourceDir}" + export XDP2_C_INCLUDE_PATH="${llvmConfig.paths.clangResourceDir}/include" + export XDP2_GLIBC_INCLUDE_PATH="${hostPkgs.stdenv.cc.libc.dev}/include" + export XDP2_LINUX_HEADERS_PATH="${hostPkgs.linuxHeaders}/include" + + # Build cppfront (without sanitizers — host tool) + echo "Building cppfront..." + cd thirdparty/cppfront + $HOST_CXX -std=c++20 source/cppfront.cpp -o cppfront-compiler + cd ../.. + + # Build xdp2-compiler (host tool) + echo "Building xdp2-compiler..." + cd src/tools/compiler + make -j''${NIX_BUILD_CORES:-1} + cd ../../.. + + # Build xdp2 libraries with sanitizer instrumentation + echo "Building xdp2 with ASan+UBSan..." + cd src + make -j''${NIX_BUILD_CORES:-1} 2>&1 | tee "$NIX_BUILD_TOP/build-output.log" || true + cd .. + + runHook postBuild + ''; + + # Run sample tests — sanitizer violations cause non-zero exit + checkPhase = '' + echo "Running tests with sanitizer instrumentation..." + sanitizer_violations=0 + + # Run any built sample parsers against test pcaps + for test_bin in src/test/*/test_*; do + if [ -x "$test_bin" ]; then + echo " Running: $test_bin" + if ! 
"$test_bin" 2>&1 | tee -a "$NIX_BUILD_TOP/sanitizer-output.log"; then + echo " FAIL: $test_bin" + sanitizer_violations=$((sanitizer_violations + 1)) + fi + fi + done + + echo "$sanitizer_violations" > "$NIX_BUILD_TOP/sanitizer-violations.txt" + ''; + + doCheck = true; + + installPhase = '' + mkdir -p $out + + violations=0 + if [ -f "$NIX_BUILD_TOP/sanitizer-violations.txt" ]; then + violations=$(cat "$NIX_BUILD_TOP/sanitizer-violations.txt") + fi + + { + echo "=== ASan + UBSan Analysis ===" + echo "" + echo "Built with AddressSanitizer + UndefinedBehaviorSanitizer." + echo "Sample tests executed with sanitizer instrumentation." + echo "" + if [ "$violations" -gt 0 ]; then + echo "Result: $violations sanitizer violations detected." + else + echo "Result: All tests passed — no sanitizer violations detected." + fi + } > $out/report.txt + + echo "$violations" > $out/count.txt + + if [ -f "$NIX_BUILD_TOP/sanitizer-output.log" ]; then + cp "$NIX_BUILD_TOP/sanitizer-output.log" $out/sanitizer-output.log + fi + if [ -f "$NIX_BUILD_TOP/build-output.log" ]; then + cp "$NIX_BUILD_TOP/build-output.log" $out/build-output.log + fi + ''; +} diff --git a/nix/analysis/semgrep-rules.yaml b/nix/analysis/semgrep-rules.yaml new file mode 100644 index 0000000..261569d --- /dev/null +++ b/nix/analysis/semgrep-rules.yaml @@ -0,0 +1,204 @@ +rules: + # ── Category 1: Unsafe C String/Memory Functions ────────────── + - id: dangerous-system-call + pattern: system($ARG) + message: Use of system() is dangerous — consider execve() or posix_spawn() + languages: [c, cpp] + severity: WARNING + - id: unsafe-sprintf + pattern: sprintf($BUF, ...) 
+ message: sprintf() has no bounds checking — use snprintf() instead + languages: [c, cpp] + severity: WARNING + - id: unsafe-strcpy + pattern: strcpy($DST, $SRC) + message: strcpy() has no bounds checking — use strncpy() or strlcpy() + languages: [c, cpp] + severity: WARNING + - id: unsafe-strcat + pattern: strcat($DST, $SRC) + message: strcat() has no bounds checking — use strncat() or strlcat() + languages: [c, cpp] + severity: WARNING + - id: potential-format-string + patterns: + - pattern: printf($FMT) + - pattern-not: printf("...") + message: Potential format string vulnerability — ensure format string is a literal + languages: [c, cpp] + severity: WARNING + - id: unsafe-vsprintf + pattern: vsprintf($BUF, ...) + message: vsprintf() has no bounds checking — use vsnprintf() instead + languages: [c, cpp] + severity: WARNING + - id: unsafe-gets + pattern: gets($BUF) + message: gets() is always unsafe (unbounded read) — use fgets() instead + languages: [c, cpp] + severity: ERROR + - id: unsafe-strncpy-strlen + pattern: strncpy($D, $S, strlen($S)) + message: strncpy with strlen(src) as length defeats the purpose of bounds checking + languages: [c, cpp] + severity: WARNING + + # ── Category 2: Memory Management (C-specific) ───────────────── + - id: memset-zero-length + pattern: memset($B, $V, 0) + message: memset with length 0 is a no-op — check arguments + languages: [c, cpp] + severity: WARNING + - id: memcpy-sizeof-pointer + pattern: memcpy($D, $S, sizeof($PTR)) + message: memcpy with sizeof(pointer) likely copies only pointer size, not data + languages: [c, cpp] + severity: WARNING + - id: malloc-no-null-check + patterns: + - pattern: | + $PTR = malloc(...); + ... + *$PTR + - pattern-not: | + $PTR = malloc(...); + ... + if ($PTR == NULL) { ... } + ... 
+ *$PTR + message: malloc() result used without NULL check + languages: [c] + severity: WARNING + - id: realloc-self-assign + pattern: $PTR = realloc($PTR, $SIZE) + message: realloc self-assignment leaks memory on failure — use a temporary pointer + languages: [c, cpp] + severity: WARNING + + # ── Category 3: Race Conditions / TOCTOU ───────────────────── + - id: toctou-access + pattern: access($PATH, ...) + message: access() is prone to TOCTOU races — use faccessat() or open-then-check + languages: [c, cpp] + severity: WARNING + - id: chmod-on-pathname + pattern: chmod($PATH, $MODE) + message: chmod on pathname is TOCTOU-prone — prefer fchmod() on an open fd + languages: [c, cpp] + severity: INFO + - id: chown-on-pathname + patterns: + - pattern-either: + - pattern: chown($PATH, $UID, $GID) + - pattern: lchown($PATH, $UID, $GID) + message: chown/lchown on pathname is TOCTOU-prone — prefer fchown() on an open fd + languages: [c, cpp] + severity: INFO + - id: insecure-rand + patterns: + - pattern-either: + - pattern: rand() + - pattern: srand(...) 
+ message: rand()/srand() are not cryptographically secure — use getrandom() + languages: [c, cpp] + severity: WARNING + - id: toctou-stat + patterns: + - pattern-either: + - pattern: stat($PATH, $BUF) + - pattern: lstat($PATH, $BUF) + message: stat/lstat on pathname is TOCTOU-prone — prefer fstat() on an open fd + languages: [c, cpp] + severity: INFO + + # ── Category 5: Error Handling (C-specific) ──────────────────── + - id: strerror-thread-unsafe + pattern: strerror($E) + message: strerror() is not thread-safe — use strerror_r() + languages: [c, cpp] + severity: INFO + + # ── Category 6: Resource Management (C-specific) ────────────── + - id: fopen-no-close-check + pattern: fopen($P, $M) + message: Raw FILE* from fopen — ensure fclose() is called on all paths + languages: [c] + severity: INFO + - id: signal-not-sigaction + pattern: signal($SIG, $H) + message: signal() has portability issues — prefer sigaction() + languages: [c, cpp] + severity: WARNING + - id: vfork-usage + pattern: vfork() + message: vfork() shares address space with parent — prefer posix_spawn() or fork() + languages: [c, cpp] + severity: WARNING + - id: popen-usage + pattern: popen($CMD, $MODE) + message: popen() invokes shell — risk of command injection, prefer posix_spawn() + languages: [c, cpp] + severity: WARNING + + # ── Category 7: Privilege and Command Execution ─────────────── + - id: setuid-setgid + patterns: + - pattern-either: + - pattern: setuid(...) + - pattern: setgid(...) 
+ message: setuid/setgid changes process privileges — ensure proper error checking + languages: [c, cpp] + severity: WARNING + - id: chroot-usage + pattern: chroot($PATH) + message: chroot alone is not a security boundary — ensure chdir+drop privileges + languages: [c, cpp] + severity: WARNING + - id: getenv-unchecked + pattern: getenv($VAR) + message: getenv() returns nullable pointer — check for NULL before use + languages: [c, cpp] + severity: INFO + - id: exec-family + patterns: + - pattern-either: + - pattern: execvp(...) + - pattern: execv(...) + - pattern: execve(...) + message: exec-family call — ensure arguments are validated and paths are absolute + languages: [c, cpp] + severity: INFO + + # ── Category 9: Code Quality / Defensive Programming ───────── + - id: goto-usage + pattern: goto $LABEL + message: goto usage — consider structured control flow alternatives + languages: [c, cpp] + severity: INFO + - id: assert-side-effect + patterns: + - pattern-either: + - pattern: assert($X = $Y) + - pattern: assert($X++) + message: Side effect in assert() — expression is removed in release builds (NDEBUG) + languages: [c, cpp] + severity: ERROR + - id: fprintf-stderr + pattern: fprintf(stderr, ...) + message: Direct stderr output — consider using the project logging infrastructure + languages: [c, cpp] + severity: INFO + - id: atoi-atol-usage + patterns: + - pattern-either: + - pattern: atoi(...) + - pattern: atol(...) + - pattern: atof(...) 
# nix/analysis/semgrep.nix
#
# semgrep pattern-based code search with custom rules.
# Same structure as reference, uses C-filtered semgrep-rules.yaml.
#

{
  pkgs,
  mkSourceReport,
}:

let
  # Custom rule set shipped alongside this file; see semgrep-rules.yaml.
  rulesFile = ./semgrep-rules.yaml;

  # Runner contract (from mkSourceReport): $1 = source tree, $2 = output
  # directory; must produce report files plus count.txt.
  runner = pkgs.writeShellApplication {
    name = "run-semgrep-analysis";
    runtimeInputs = with pkgs; [
      semgrep
      coreutils
      gnugrep
      cacert
    ];
    text = ''
      source_dir="$1"
      output_dir="$2"

      echo "=== semgrep Analysis ==="

      export SEMGREP_ENABLE_VERSION_CHECK=0
      export SEMGREP_SEND_METRICS=off
      export SSL_CERT_FILE="${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt"
      export OTEL_TRACES_EXPORTER=none
      # semgrep needs a writable HOME for its config/cache
      HOME="$(mktemp -d)"
      export HOME

      semgrep \
        --config ${rulesFile} \
        --json \
        --metrics=off \
        --no-git-ignore \
        "$source_dir/src" \
        > "$output_dir/report.json" 2>&1 || true

      # Also produce a text report
      semgrep \
        --config ${rulesFile} \
        --metrics=off \
        --no-git-ignore \
        "$source_dir/src" \
        > "$output_dir/report.txt" 2>&1 || true

      # Count results from JSON output
      findings=$(grep -o '"check_id"' "$output_dir/report.json" | wc -l || echo "0")
      echo "$findings" > "$output_dir/count.txt"
    '';
  };
in
mkSourceReport "semgrep" runner
describes each exemption in the triage system and the +rationale for excluding it from high-confidence findings. + +## Excluded Check IDs (`filters.py`) + +These check IDs are removed entirely during filtering — their findings +never appear in triage output. + +### `syntaxError` +- **Tool:** cppcheck +- **Count:** 14 (all test code) +- **Reason:** cppcheck's parser cannot handle XDP2's complex macro + constructs (variadic macros, token pasting, nested expansion). These + are parser failures in the tool, not syntax errors in the code. The + code compiles successfully with GCC and Clang. + +### `preprocessorErrorDirective` +- **Tool:** cppcheck +- **Count:** 8 +- **Reason:** Two categories: + 1. **Intentional `#error` platform guards** — e.g., + `#error "Unsupported long size"` in `bitmap.h`, + `#error "Endianness not identified"` in `proto_geneve.h`. These are + compile-time assertions that fire only on unsupported platforms. + 2. **cppcheck macro expansion failures** — e.g., `pmacro.h` lines + where cppcheck fails to expand `XDP2_SELECT_START` / + `XDP2_VSTRUCT_VSCONST` due to complex `##` token pasting. The + macros work correctly with real compilers. + +### `unknownMacro` +- **Tool:** cppcheck +- **Count:** 2 +- **Reason:** cppcheck doesn't recognize project-specific macros: + - `LIST_FOREACH` (`dtable.c:789`) — standard BSD-style list traversal + macro, defined in project headers. + - `__XDP2_PMACRO_APPLYXDP2_PMACRO_NARGS` (`bitmap_word.h:544`) — + internal macro helper from the pmacro system. + These would require `--suppress=` or `--library=` configuration for + cppcheck, which is not worth the maintenance burden. + +### `arithOperationsOnVoidPointer` +- **Tool:** cppcheck +- **Count:** 25 +- **Reason:** Void pointer arithmetic (`void *p; p += n`) is a GNU C + extension where `sizeof(void) == 1`. 
  This is used intentionally
+  throughout the codebase:
+  - `siphash.c` — hash function byte-level pointer walks
+  - `obj_allocator.c` — memory pool object addressing
+  - `parser.c` — packet header pointer advancement
+  GCC, Clang, and the Linux kernel all rely on this extension.
+  The code is correct and compiles without warnings under `-std=gnu11`.
+
+### `subtractPointers`
+- **Tool:** cppcheck
+- **Count:** 3
+- **Reason:** Pointer subtraction in `cli.h:88,107` and `dtable.h:85`
+  implements `container_of`-style macros — computing the offset of a
+  member within a struct to recover the containing struct pointer. This
+  is a standard C idiom used throughout Linux kernel code and system
+  libraries. cppcheck flags it because the two pointers technically
+  point to "different objects" (member vs. container), but the operation
+  is well-defined in practice on all target platforms.
+
+## Generated File Patterns (`filters.py`)
+
+### `*.template.c`
+- **Reason:** Template files under `src/templates/xdp2/` are input to
+  `xdp2-compiler`, which processes them into final C source. They
+  contain placeholder identifiers and incomplete type references that
+  are resolved during code generation. Findings like
+  `clang-diagnostic-implicit-int` and
+  `clang-diagnostic-implicit-function-declaration` in these files are
+  expected and not actionable.
+
+## Scoring Adjustments (`scoring.py`)
+
+These checks still appear in the full triage summary but are excluded
+from the high-confidence list.
+
+### `bugprone-narrowing-conversions` → `STYLE_ONLY_CHECKS`
+- **Tool:** clang-tidy
+- **Count:** 56 (was the single largest category in high-confidence)
+- **Reason:** The vast majority are `size_t` → `ssize_t` and
+  `unsigned int` → `int` conversions in packet parsing code where sizes
+  and offsets are bounded by protocol constraints (e.g., packet length
+  fits in `int`). These narrowing conversions are intentional and
+  ubiquitous in C networking code.
Previously listed in + `BUG_CLASS_CHECKS`, which incorrectly elevated all 56 to + high-confidence. Moved to `STYLE_ONLY_CHECKS` so they remain visible + in the full report but don't overwhelm the actionable findings list. + +### `variableScope` → `STYLE_ONLY_CHECKS` +- **Tool:** cppcheck +- **Count:** 30 +- **Reason:** Suggestions to move variable declarations closer to first + use. This is a style preference — the existing code follows C89-style + declarations-at-top-of-block, which is a valid convention. Not a bug. + +### `constParameter`, `constParameterCallback` → `STYLE_ONLY_CHECKS` +- **Tool:** cppcheck +- **Count:** 14 +- **Reason:** Suggestions to add `const` to parameters that are not + modified. Valid style improvement but not a correctness issue, and + changing function signatures affects the public API. + +### Excluded from High-Confidence via `_HIGH_CONF_EXCLUDED_PREFIXES` + +#### `bugprone-reserved-identifier` +- **Count:** 642 (combined with `cert-dcl37-c,cert-dcl51-cpp`) +- **Reason:** XDP2 uses double-underscore prefixed identifiers + (`__XDP2_PMACRO_*`, `___XDP2_BITMAP_WORD_*`) as internal macro + helpers. This is the project's deliberate convention for namespace + separation. While technically reserved by the C standard, these + identifiers do not conflict with any compiler or library names. + +#### `bugprone-easily-swappable-parameters` +- **Count:** 201 +- **Reason:** Functions with multiple parameters of the same type (e.g., + `int offset, int length`). This is inherent to packet parsing APIs + where multiple integer parameters represent distinct protocol fields. + Cannot be changed without breaking the API. + +#### `bugprone-assignment-in-if-condition` +- **Count:** 79 +- **Reason:** `if ((x = func()))` is an intentional C idiom used + throughout the codebase for error-checked assignment. This is standard + practice in C systems code (Linux kernel, glibc, etc.). 
"""CLI entry point for static analysis triage.

Loads every tool report found under a results directory, filters noise,
deduplicates, and renders one of several views:

    python -m triage <results>                    # full prioritized report
    python -m triage <results> --summary          # category summary
    python -m triage <results> --high-confidence  # likely real bugs only
    python -m triage <results> --cross-ref        # multi-tool correlations
    python -m triage <results> --category <name>  # drill into one category
"""

import argparse
import os
import sys

from parsers import load_all_findings
from filters import filter_findings, deduplicate, is_test_code
from reports import (
    print_summary, print_high_confidence, print_cross_ref,
    print_category, print_full_report, write_all_reports,
)


def _parse_args():
    """Build the CLI and evaluate sys.argv."""
    ap = argparse.ArgumentParser(
        description='Triage static analysis findings across multiple tools.'
    )
    ap.add_argument('result_dir', help='Path to analysis results directory')
    ap.add_argument('--summary', action='store_true',
                    help='Show category summary')
    ap.add_argument('--high-confidence', action='store_true',
                    help='Show only likely-real-bug findings')
    ap.add_argument('--cross-ref', action='store_true',
                    help='Show multi-tool correlations')
    ap.add_argument('--category', type=str,
                    help='Drill into a specific check category')
    ap.add_argument('--output-dir', type=str,
                    help='Write all reports as files to this directory')
    ap.add_argument('--include-test', action='store_true',
                    help='Include test code findings (excluded by default in high-confidence)')
    return ap.parse_args()


def main():
    """Run the triage pipeline and dispatch to the requested report view."""
    opts = _parse_args()

    if not os.path.isdir(opts.result_dir):
        print(f'Error: {opts.result_dir} is not a directory', file=sys.stderr)
        sys.exit(1)

    # Pipeline: raw tool output -> noise filtered -> per-location dedup.
    loaded = load_all_findings(opts.result_dir)
    kept = filter_findings(loaded)
    unique = deduplicate(kept)

    print(f'Loaded {len(loaded)} raw -> {len(kept)} filtered -> {len(unique)} dedup')

    # --output-dir takes precedence over all view flags; the remaining
    # flags are mutually exclusive by first match, full report is default.
    if opts.output_dir:
        write_all_reports(unique, opts.output_dir)
        return
    if opts.summary:
        print_summary(unique)
        return
    if opts.high_confidence:
        # Test-code findings are dropped from the high-confidence view
        # unless the caller explicitly opts in.
        if not opts.include_test:
            unique = [f for f in unique if not is_test_code(f.file)]
        print_high_confidence(unique)
        return
    if opts.cross_ref:
        print_cross_ref(unique)
        return
    if opts.category:
        print_category(unique, opts.category)
        return
    print_full_report(unique)


if __name__ == '__main__':
    main()
+""" + +from finding import Finding + + +# Third-party code — findings are not actionable +# Note: /nix/store/ prefix is stripped by normalize_path before filtering +THIRD_PARTY_PATTERNS = [ + 'thirdparty/', 'cppfront/', +] + +# Generated files — findings are not actionable +GENERATED_FILE_PATTERNS = [ + 'parser_*.p.c', # xdp2-compiler generated parser code + '*.template.c', # Template files before xdp2-compiler processing + '_pmacro_gen.h', # Packet macro generator output + '_dtable.h', # Decision table output + '_stable.h', # State table output +] + +EXCLUDED_CHECK_IDS = { + # Known false positive categories + 'normalCheckLevelMaxBranches', + # Cppcheck noise — tool limitations, not code bugs + 'missingIncludeSystem', + 'missingInclude', + 'unmatchedSuppression', + 'checkersReport', + 'syntaxError', # Can't parse complex macro constructs + 'preprocessorErrorDirective', # Intentional #error guards / macro expansion failures + 'unknownMacro', # Doesn't understand project macros (LIST_FOREACH, pmacro) + # Cppcheck false positives in idiomatic C + 'arithOperationsOnVoidPointer', # GNU C extension (sizeof(void)==1), intentional in networking code + 'subtractPointers', # container_of style pointer arithmetic + # Clang-tidy build errors (not real findings) + 'clang-diagnostic-error', + # _FORTIFY_SOURCE warnings (build config, not code bugs) + '-W#warnings', + '-Wcpp', +} + +EXCLUDED_MESSAGE_PATTERNS = [ + '_FORTIFY_SOURCE', +] + +TEST_PATH_PATTERNS = ['src/test/', '/test/'] + +SECURITY_PATHS = ['src/lib/', 'src/include/xdp2/'] + + +def _match_generated(path: str) -> bool: + """Check if file matches a generated file pattern (supports * glob).""" + import fnmatch + name = path.rsplit('/', 1)[-1] if '/' in path else path + return any(fnmatch.fnmatch(name, pat) for pat in GENERATED_FILE_PATTERNS) + + +def is_generated(path: str) -> bool: + return _match_generated(path) + + +def is_third_party(path: str) -> bool: + for pat in THIRD_PARTY_PATTERNS: + if pat in path: + return 
"""Triage core: path filters, the Finding record, and the unified loader.

(This span covers the tail of filters.py, all of finding.py, and
parsers/__init__.py in the original tree; section markers below show the
original file boundaries.)
"""

import re
from dataclasses import dataclass
from pathlib import Path


# ---------------------------------------------------------------------------
# filters.py (continued) -- is_third_party/is_generated and the
# TEST_PATH_PATTERNS / SECURITY_PATHS / EXCLUDED_* constants are defined
# earlier in that module.
# ---------------------------------------------------------------------------

def is_test_code(path: str) -> bool:
    """Return True when *path* matches any known test-directory pattern."""
    return any(pat in path for pat in TEST_PATH_PATTERNS)


def is_security_sensitive(path: str) -> bool:
    """Return True when *path* lies in a security-sensitive source area."""
    return any(pat in path for pat in SECURITY_PATHS)


def filter_findings(findings: list) -> list:
    """Remove third-party code and known false positive categories.

    Drops findings that come from third-party or generated files, belong
    to an excluded check id, carry no usable line number, or match a
    known false-positive message pattern.
    """
    return [
        f for f in findings
        if not is_third_party(f.file)
        and not is_generated(f.file)
        and f.check_id not in EXCLUDED_CHECK_IDS
        and f.line > 0
        and not any(pat in f.message for pat in EXCLUDED_MESSAGE_PATTERNS)
    ]


def deduplicate(findings: list) -> list:
    """Deduplicate findings by (file, line, check_id).

    clang-tidy reports the same header finding once per translation unit;
    keep only the first occurrence, preserving input order.
    """
    seen = set()
    unique = []
    for f in findings:
        key = f.dedup_key()
        if key not in seen:
            seen.add(key)
            unique.append(f)
    return unique


# ---------------------------------------------------------------------------
# finding.py -- Finding dataclass and path/severity normalization.
# ---------------------------------------------------------------------------

@dataclass
class Finding:
    """One diagnostic emitted by a static-analysis tool."""

    tool: str       # producing tool, e.g. 'cppcheck', 'clang-tidy'
    check_id: str   # tool-specific check identifier
    severity: str   # "error", "warning", "style", "info"
    file: str       # normalized relative path
    line: int
    message: str

    def location_key(self):
        """Key identifying the source location only."""
        return (self.file, self.line)

    def dedup_key(self):
        """Key identifying a unique (location, check) pair."""
        return (self.file, self.line, self.check_id)


# Nix store paths look like /nix/store/<32-char base32 hash>-name/...
_NIX_STORE_RE = re.compile(r'/nix/store/[a-z0-9]{32}-[^/]*/')


def normalize_path(path: str) -> str:
    """Strip Nix store prefix to get relative source path."""
    path = _NIX_STORE_RE.sub('', path)
    # Collapse double slashes (from Makefile $(CURDIR)//include patterns)
    while '//' in path:
        path = path.replace('//', '/')
    return path


def normalize_severity(sev: str) -> str:
    """Map tool-specific severities to unified levels.

    Numeric values are flawfinder risk levels (0-5); word values come
    from cppcheck/clang/semgrep.  Unknown severities default to
    'warning' rather than being dropped.
    """
    sev = sev.lower().strip()
    if sev in ('error', 'high', '5', '4'):
        return 'error'
    if sev in ('warning', 'medium', '3', 'portability', 'performance'):
        return 'warning'
    if sev in ('style', 'low', '2', '1', '0', 'information', 'info'):
        return 'style'
    return 'warning'


# ---------------------------------------------------------------------------
# parsers/__init__.py -- unified finding loader across all tool parsers.
# ---------------------------------------------------------------------------

def load_all_findings(result_dir: str) -> list:
    """Load findings from all available tool reports under *result_dir*.

    A missing report is skipped silently: the corresponding tool simply
    did not run at this analysis level.
    """
    # Imported lazily so merely importing the package has no side effects;
    # the parser modules are only needed when a load is actually requested.
    from parsers import clang, cppcheck, flawfinder, semgrep

    # (subdirectory, report filename, parser callable) -- order matches
    # the original per-tool stanzas.
    sources = (
        ('cppcheck', 'report.xml', cppcheck.parse),
        ('semgrep', 'report.json', semgrep.parse),
        ('clang-tidy', 'report.txt', lambda p: clang.parse(p, 'clang-tidy')),
        ('gcc-warnings', 'report.txt', lambda p: clang.parse(p, 'gcc-warnings')),
        ('flawfinder', 'report.txt', flawfinder.parse),
        ('clang-analyzer', 'report.txt', lambda p: clang.parse(p, 'clang-analyzer')),
        ('gcc-analyzer', 'report.txt', lambda p: clang.parse(p, 'gcc-analyzer')),
    )

    findings = []
    rd = Path(result_dir)
    for subdir, fname, parse_fn in sources:
        report = rd / subdir / fname
        if report.exists():
            findings.extend(parse_fn(str(report)))
    return findings
"""parsers/clang.py -- parse clang-tidy, clang-analyzer, gcc-warnings,
and gcc-analyzer reports.

All four tools share the same text line format:
    /path/to/file.c:123:45: warning: message [check-name]
"""

import re

from finding import Finding, normalize_path, normalize_severity


# Groups: 1=path, 2=line, 3=severity, 4=message, 5=check id.  The column
# number is matched but not captured.
_LINE_RE = re.compile(
    r'^(.+?):(\d+):\d+:\s+(warning|error):\s+(.+?)\s+\[([^\]]+)\]$'
)


def parse(path: str, tool_name: str) -> list:
    """Parse a clang-style diagnostic text report.

    Non-matching lines (notes, code snippets, caret markers) are ignored.
    A missing report file yields an empty list.
    """
    findings = []
    try:
        with open(path) as f:
            for line in f:
                m = _LINE_RE.match(line.strip())
                if m:
                    findings.append(Finding(
                        tool=tool_name,
                        check_id=m.group(5),
                        severity=normalize_severity(m.group(3)),
                        file=normalize_path(m.group(1)),
                        line=int(m.group(2)),
                        message=m.group(4),
                    ))
    except FileNotFoundError:
        pass  # tool did not run at this analysis level
    return findings


# ===========================================================================
# parsers/cppcheck.py -- parse cppcheck XML reports.  (A separate module in
# the source tree; this hunk covers both files.)
# ===========================================================================

import xml.etree.ElementTree as ET


def parse(path: str) -> list:
    """Parse a cppcheck XML report into Finding objects.

    Only the first <location> of each <error> element is kept; cppcheck
    lists the full trace but the primary location is where the report
    should point.  Missing or malformed reports yield an empty list.
    """
    findings = []
    try:
        tree = ET.parse(path)
    except (ET.ParseError, FileNotFoundError):
        return findings

    for error in tree.iter('error'):
        check_id = error.get('id', '')
        severity = error.get('severity', 'warning')
        msg = error.get('msg', '')

        # First location only; also handles errors with no location at all.
        loc = next(error.iter('location'), None)
        if loc is None:
            continue

        filepath = normalize_path(loc.get('file', ''))
        try:
            line = int(loc.get('line', 0))
        except ValueError:
            # Defensive: a non-numeric line attribute previously aborted
            # the whole parse with an uncaught ValueError.
            continue

        if filepath and line > 0:
            findings.append(Finding(
                tool='cppcheck',
                check_id=check_id,
                severity=normalize_severity(severity),
                file=filepath,
                line=line,
                message=msg,
            ))

    return findings
"""parsers/flawfinder.py -- parse flawfinder text reports.

Flawfinder line format:
    /path/to/file.c:123:45: [5] (race) chmod:message
"""

import re

from finding import Finding, normalize_path, normalize_severity


# Groups: 1=path, 2=line, 3=risk level (0-5), 4=category, 5=function
# name, 6=message.  Column is matched but not captured.
_LINE_RE = re.compile(
    r'^(.+?):(\d+):\d+:\s+\[(\d+)\]\s+\((\w+)\)\s+(\w+):(.+)$'
)


def parse(path: str) -> list:
    """Parse flawfinder text report.

    check_id is synthesized as "<category>.<function>" (e.g. "race.chmod")
    so the triage layer can group and exempt by pattern.  A missing report
    yields an empty list.
    """
    findings = []
    try:
        with open(path) as f:
            for line in f:
                m = _LINE_RE.match(line.strip())
                if m:
                    category = m.group(4)
                    func_name = m.group(5)
                    findings.append(Finding(
                        tool='flawfinder',
                        check_id=f'{category}.{func_name}',
                        # Numeric risk level maps through normalize_severity
                        # (5/4 -> error, 3 -> warning, 0-2 -> style).
                        severity=normalize_severity(m.group(3)),
                        file=normalize_path(m.group(1)),
                        line=int(m.group(2)),
                        message=m.group(6).strip(),
                    ))
    except FileNotFoundError:
        pass
    return findings


# ===========================================================================
# parsers/semgrep.py -- parse semgrep JSON reports.
# ===========================================================================

"""Parse semgrep JSON reports."""

import json

from finding import Finding, normalize_path, normalize_severity


def parse(path: str) -> list:
    """Parse semgrep JSON report (may contain multiple JSON objects)."""
    findings = []
    try:
        with open(path) as f:
            content = f.read()
    except FileNotFoundError:
        return findings

    # Parse all JSON objects in the file (semgrep may output multiple).
    # raw_decode() consumes one object at a time; scanning for the next
    # '{' skips any separator text between objects.
    decoder = json.JSONDecoder()
    pos = 0
    while pos < len(content):
        try:
            idx = content.index('{', pos)
            data, end = decoder.raw_decode(content, idx)
            pos = end
        except (ValueError, json.JSONDecodeError):
            break  # no further JSON objects

        for result in data.get('results', []):
            findings.append(Finding(
                tool='semgrep',
                check_id=result.get('check_id', ''),
                severity=normalize_severity(result.get('extra', {}).get('severity', 'warning')),
                file=normalize_path(result.get('path', '')),
                line=result.get('start', {}).get('line', 0),
                message=result.get('extra', {}).get('message', ''),
            ))

    return findings


# ===========================================================================
# reports.py -- report formatting and output functions.
# ===========================================================================

"""Report formatting and output functions."""

import io
import os
from collections import defaultdict
from contextlib import redirect_stdout

from finding import Finding
from filters import is_test_code, is_security_sensitive
from scoring import priority_score, find_cross_tool_hits, get_high_confidence


def format_finding(f, score=None) -> str:
    # One-line rendering: "[score=N] tool: file:line: [sev] check -- msg".
    score_str = f'[score={score}] ' if score is not None else ''
    return f'{score_str}{f.tool}: {f.file}:{f.line}: [{f.severity}] {f.check_id} -- {f.message}'


def print_summary(findings: list):
    """Print category summary after filtering, sorted by priority."""
    # Count by (tool, check_id)
    category_counts = defaultdict(lambda: {
        'count': 0, 'tools': set(), 'severities': set(),
        'prod': 0, 'test': 0, 'security': 0
    })

    for f in findings:
        cat = category_counts[f.check_id]
        cat['count'] += 1
        cat['tools'].add(f.tool)
        cat['severities'].add(f.severity)
        if is_test_code(f.file):
            cat['test'] += 1
        else:
            cat['prod'] += 1
        if is_security_sensitive(f.file):
            cat['security'] += 1

    # Sort: error first, then by count ascending (anomalies first)
    def sort_key(item):
        name, cat = item
        has_error = 'error' in cat['severities']
        return (not has_error, cat['count'], name)

    print(f'\n{"Category":<50} {"Count":>6} {"Prod":>5} {"Test":>5} {"Sec":>4} {"Sev":<8} {"Tools"}')
    print('-' * 110)

    for name, cat in sorted(category_counts.items(), key=sort_key):
        sev = '/'.join(sorted(cat['severities']))
        tools = ','.join(sorted(cat['tools']))
        print(f'{name:<50} {cat["count"]:>6} {cat["prod"]:>5} {cat["test"]:>5} '
              f'{cat["security"]:>4} {sev:<8} {tools}')

    total = sum(c['count'] for c in category_counts.values())
    prod = sum(c['prod'] for c in category_counts.values())
    test = sum(c['test'] for c in category_counts.values())
    print(f'\nTotal: {total} findings ({prod} production, {test} test) '
          f'across {len(category_counts)} categories')


def print_high_confidence(findings: list):
    """Print only likely-real-bug findings."""
    high_conf = get_high_confidence(findings)

    if not high_conf:
        print('No high-confidence findings.')
        return

    print(f'\n=== High-Confidence Findings ({len(high_conf)}) ===\n')

    # Each entry is a (Finding, score, is_cross) tuple from scoring.
    for f, score, is_cross in high_conf:
        cross_marker = ' [CROSS-TOOL]' if is_cross else ''
        loc = 'security' if is_security_sensitive(f.file) else ('test' if is_test_code(f.file) else 'prod')
        print(f' [{score:>3}] [{loc:<8}]{cross_marker}')
        print(f' {f.tool}: {f.file}:{f.line}')
        print(f' {f.check_id} [{f.severity}]')
        print(f' {f.message}')
        print()


def print_cross_ref(findings: list):
    """Print multi-tool correlations."""
    clusters = find_cross_tool_hits(findings)

    if not clusters:
        print('No cross-tool correlations found.')
        return

    print(f'\n=== Cross-Tool Correlations ({len(clusters)} clusters) ===\n')

    for i, cluster in enumerate(clusters, 1):
        tools = sorted(set(f.tool for f in cluster))
        print(f' Cluster #{i} -- {cluster[0].file}:{cluster[0].line} ({", ".join(tools)})')
        for f in cluster:
            print(f' {f.tool}: [{f.severity}] {f.check_id} -- {f.message}')
        print()


def print_category(findings: list, category: str):
    """Print all findings for a specific category."""
    matches = [f for f in findings if f.check_id == category]

    if not matches:
        # Try partial match
        matches = [f for f in findings if category.lower() in f.check_id.lower()]

    if not matches:
        print(f'No findings matching category "{category}".')
        return

    # Group by check_id if partial match found multiple
    by_check = defaultdict(list)
    for f in matches:
        by_check[f.check_id].append(f)

    for check_id, check_findings in sorted(by_check.items()):
        print(f'\n=== {check_id} ({len(check_findings)} findings) ===\n')
        for f in sorted(check_findings, key=lambda x: (x.file, x.line)):
            loc = 'test' if is_test_code(f.file) else 'prod'
            print(f' [{loc}] {f.file}:{f.line}')
            print(f' {f.message}')
        print()


def print_full_report(findings: list):
    """Print all findings sorted by priority."""
    cat_counts = defaultdict(int)
    for f in findings:
        cat_counts[f.check_id] += 1

    scored = [(f, priority_score(f, cat_counts)) for f in findings]
    # Highest score first; ties broken by file then line for stable output.
    scored.sort(key=lambda x: (-x[1], x[0].file, x[0].line))

    print(f'\n=== All Findings ({len(scored)}) ===\n')

    for f, score in scored[:200]:  # Limit output
        print(format_finding(f, score))

    if len(scored) > 200:
        print(f'\n... and {len(scored) - 200} more (use --summary or --category to drill in)')


def capture_output(func, *args, **kwargs) -> str:
    """Capture stdout from a function call and return as string."""
    buf = io.StringIO()
    with redirect_stdout(buf):
        func(*args, **kwargs)
    return buf.getvalue()


def write_all_reports(findings: list, output_dir: str):
    """Write all report modes as files to output_dir."""
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, 'summary.txt'), 'w') as f:
        f.write(capture_output(print_summary, findings))

    # High-confidence report is restricted to production code; summary,
    # cross-ref, and full report still cover everything including tests.
    prod_findings = [f for f in findings if not is_test_code(f.file)]

    with open(os.path.join(output_dir, 'high-confidence.txt'), 'w') as f:
        f.write(capture_output(print_high_confidence, prod_findings))

    # count.txt holds just the high-confidence finding count as an integer
    # (presumably machine-read by the surrounding Nix build -- TODO confirm).
    high_conf = get_high_confidence(prod_findings)
    with open(os.path.join(output_dir, 'count.txt'), 'w') as f:
        f.write(str(len(high_conf)))

    with open(os.path.join(output_dir, 'cross-ref.txt'), 'w') as f:
        f.write(capture_output(print_cross_ref, findings))

    with open(os.path.join(output_dir, 'full-report.txt'), 'w') as f:
        f.write(capture_output(print_full_report, findings))
"""Priority scoring, cross-tool correlation, and high-confidence filtering.

Adapted for XDP2's C codebase — check IDs and noise patterns
are specific to C static analysis tools.
"""

from collections import defaultdict

from finding import Finding
from filters import is_security_sensitive, is_test_code


# flawfinder check_ids that are intentional patterns in system software
FLAWFINDER_NOISE = {
    'race.chmod', 'race.chown', 'race.access', 'race.vfork',
    'shell.system', 'shell.execl', 'shell.execlp', 'shell.execv', 'shell.execvp',
    'buffer.read', 'buffer.char', 'buffer.equal', 'buffer.memcpy',
    'buffer.strlen', 'buffer.getenv', 'buffer.wchar_t',
    'misc.open', 'random.random', 'tmpfile.mkstemp', 'access.umask',
    'format.snprintf', 'format.vsnprintf', 'misc.chroot',
}

# Bug-class check IDs that represent real correctness issues (not style)
BUG_CLASS_CHECKS = {
    # Correctness bugs
    'uninitMemberVar', 'unsignedLessThanZero',
    'core.UndefinedBinaryOperatorResult', 'core.NullDereference',
    'core.DivideZero', 'core.uninitialized',
    'core.uninitialized.Assign',
    # Bugprone clang-tidy checks
    'bugprone-use-after-move',
    'bugprone-sizeof-expression',
    'bugprone-integer-division',
    # Memory safety
    'unix.Malloc', 'unix.MismatchedDeallocator',
    'alpha.security.ArrayBoundV2',
}

# Style checks that shouldn't appear in high-confidence even in security code
STYLE_ONLY_CHECKS = {
    'constParameterPointer', 'constVariablePointer',
    'constParameter', 'constParameterCallback',
    'shadowVariable', 'shadowArgument', 'shadowFunction',
    'knownConditionTrueFalse', 'unusedStructMember',
    'variableScope',  # Moving declarations is style, not bugs
    'bugprone-narrowing-conversions',  # size_t→ssize_t, uint→int: intentional in C networking code
}

# Prefixes excluded from high-confidence output
_HIGH_CONF_EXCLUDED_PREFIXES = (
    'readability-',
    'misc-include-cleaner', 'misc-use-internal-linkage',
    'misc-use-anonymous-namespace', 'misc-unused-parameters',
    'misc-const-correctness', 'misc-header-include-cycle',
    'misc-no-recursion',
    'cert-',
    # High-volume bugprone checks that are style/convention, not bugs
    'bugprone-reserved-identifier',  # __XDP2_PMACRO_* is project convention
    'bugprone-easily-swappable-parameters',  # Style — can't change existing API
    'bugprone-assignment-in-if-condition',  # Intentional C pattern: if ((x = func()))
    'bugprone-macro-parentheses',  # Style — many macros are correct without extra parens
    'bugprone-implicit-widening-of-multiplication-result',  # False positives in packet offset math
)


# Points awarded per unified severity level ('style'/'info' add nothing).
_SEVERITY_POINTS = {'error': 40, 'warning': 20}


def priority_score(f, category_counts: dict) -> int:
    """Higher score = higher priority. Range roughly 0-100."""
    score = _SEVERITY_POINTS.get(f.severity, 0)

    # Location adjustments: security-sensitive paths are boosted, test
    # code is demoted.
    if is_security_sensitive(f.file):
        score += 20
    if is_test_code(f.file):
        score -= 15

    # Rare categories are anomalies and therefore more likely real bugs;
    # an unknown category (999) gets no rarity bonus.
    occurrences = category_counts.get(f.check_id, 999)
    if occurrences <= 3:
        score += 30
    elif occurrences <= 10:
        score += 20
    elif occurrences <= 30:
        score += 10

    return score


def find_cross_tool_hits(findings: list, tolerance: int = 3) -> list:
    """Find file:line pairs flagged by 2+ independent tools.

    Uses a tolerance of +/-N lines to account for minor line number differences.
    """
    per_file = defaultdict(list)
    for finding in findings:
        per_file[finding.file].append(finding)

    raw_clusters = []
    for group in per_file.values():
        group.sort(key=lambda g: g.line)

        # Anchor on each finding; gather nearby findings from other tools.
        for idx, anchor in enumerate(group):
            members = [anchor]
            for candidate in group[idx + 1:]:
                if candidate.line > anchor.line + tolerance:
                    break  # sorted by line, nothing closer remains
                if candidate.tool != anchor.tool:
                    members.append(candidate)
            if len({m.tool for m in members}) >= 2:
                raw_clusters.append(members)

    # Drop clusters that repeat an already-seen (tool, file, line) signature.
    seen_signatures = set()
    unique_clusters = []
    for members in raw_clusters:
        signature = frozenset((m.tool, m.file, m.line) for m in members)
        if signature not in seen_signatures:
            seen_signatures.add(signature)
            unique_clusters.append(members)

    return unique_clusters


def get_high_confidence(findings: list) -> list:
    """Return high-confidence findings as (Finding, score, is_cross) tuples, sorted by score."""
    tally = defaultdict(int)
    for item in findings:
        tally[item.check_id] += 1

    # Cross-tool correlation, computed with flawfinder noise removed so a
    # noisy category cannot promote its neighbours.
    correlatable = [item for item in findings if item.check_id not in FLAWFINDER_NOISE]
    flagged_locations = {
        (member.file, member.line)
        for cluster in find_cross_tool_hits(correlatable)
        for member in cluster
    }

    selected = []
    for item in findings:
        # Noise and style-only categories never make the cut.
        if item.check_id in FLAWFINDER_NOISE or item.check_id in STYLE_ONLY_CHECKS:
            continue
        # str.startswith accepts the whole prefix tuple at once.
        if item.check_id.startswith(_HIGH_CONF_EXCLUDED_PREFIXES):
            continue

        score = priority_score(item, tally)
        crossed = (item.file, item.line) in flagged_locations
        keep = (
            crossed
            or item.check_id in BUG_CLASS_CHECKS
            or (tally[item.check_id] <= 3 and score >= 60)
            # Analyzer-reported errors in security-sensitive code qualify
            # on their own (cppcheck / clang-analyzer / gcc-analyzer only).
            or (is_security_sensitive(item.file)
                and item.severity == 'error'
                and item.tool in ('cppcheck', 'clang-analyzer', 'gcc-analyzer'))
        )
        if keep:
            selected.append((item, score, crossed))

    selected.sort(key=lambda entry: entry[1], reverse=True)
    return selected