From f03bed1c8aeccd176ff0d8be7bd5822bbc7aa523 Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Fri, 5 Jun 2026 09:29:11 -0700 Subject: [PATCH 1/2] Add a `callgrind` measure This commit adds a new `callgrind` measure. It must always be run inside a child process that is running under Valgrind's Callgrind tool. It uses the `valgrind-requests` crate to communicate with Valgrind and record data from the simulated caches and branch predictor. Running under Callgrind is much slower than running natively, but also is much less noisy. Therefore we adjust the default numbers of processes and iterations per process accordingly. Fixes https://github.com/bytecodealliance/sightglass/issues/312 --- Cargo.lock | 152 +++++- README.md | 26 +- crates/cli/Cargo.toml | 6 +- crates/cli/src/benchmark.rs | 447 +++++++++++++++--- crates/recorder/Cargo.toml | 4 + crates/recorder/src/measure/callgrind.rs | 322 +++++++++++++ crates/recorder/src/measure/mod.rs | 31 ++ test/fixtures/callgrind.client-request.out | 18 + .../callgrind.program-termination.out | 18 + 9 files changed, 940 insertions(+), 84 deletions(-) create mode 100644 crates/recorder/src/measure/callgrind.rs create mode 100644 test/fixtures/callgrind.client-request.out create mode 100644 test/fixtures/callgrind.program-termination.out diff --git a/Cargo.lock b/Cargo.lock index 3d966a7d..4949682e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -99,6 +99,26 @@ dependencies = [ "special", ] +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags 2.10.0", + "cexpr", + "clang-sys", + "itertools", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.109", +] + [[package]] name = "bitflags" version = "0.7.0" @@ -197,6 +217,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -222,6 +251,17 @@ dependencies = [ "windows-link", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading 0.8.9", +] + [[package]] name = "clap" version = "2.34.0" @@ -369,6 +409,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "cty" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" + [[package]] name = "darling" version = "0.20.11" @@ -831,6 +877,12 @@ dependencies = [ "wasip2", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "h2" version = "0.3.27" @@ -870,6 +922,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -1241,6 +1299,16 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if 1.0.4", + "windows-link", +] + [[package]] name = "libloading" version = "0.9.0" @@ -1354,6 +1422,12 @@ dependencies = [ "serde", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "mio" version = "1.1.0" @@ -1404,6 +1478,16 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -1804,6 +1888,16 @@ dependencies = [ "log", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.109", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -2054,6 +2148,21 @@ dependencies = [ "winreg", ] +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.2" @@ -2298,7 +2407,7 @@ dependencies = [ "assert_cmd", "csv", "env_logger 0.8.4", - "libloading", + "libloading 0.9.0", "log", "minijinja", "predicates 1.0.8", @@ -2356,7 +2465,7 @@ dependencies = [ "ittapi", "lazy_static", "libc", - "libloading", + "libloading 0.9.0", "log", "perf-event", "precision", @@ -2365,6 +2474,7 @@ dependencies = [ "sightglass-build", "sightglass-data", "thiserror", + "valgrind-requests", "wat", ] @@ 
-2489,13 +2599,34 @@ version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro-error", "proc-macro2", "quote", "syn 1.0.109", ] +[[package]] +name = "strum" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9628de9b8791db39ceda2b119bbe13134770b56c138ec1d3af810d045c04f9bd" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.109", +] + [[package]] name = "subtle" version = "2.6.1" @@ -2819,6 +2950,21 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "valgrind-requests" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "796c44118a551cc842f2ce6fce4983b86e282ec99b6b5e80b09dfeab830e7755" +dependencies = [ + "bindgen", + "cc", + "cfg-if 1.0.4", + "cty", + "regex", + "rustc_version", + "strum", +] + [[package]] name = "vcpkg" version = "0.2.15" diff --git a/README.md b/README.md index 72043dd7..c24242ae 100644 --- a/README.md +++ b/README.md @@ -195,12 +195,20 @@ note that measuring using CPU cycles alone can be problematic (e.g., CPU frequency changes, context switches, etc.). 
Several _measures_ can be configured using the `--measure` option: -- `cycles`: the number of CPU cycles elapsed -- `perf-counters`: a selection of common `perf` counters (CPU cycles, - instructions retired, cache accesses, cache misses); only available on Linux -- `vtune`: record each phase as a VTune task for analysis; see [this help - documentation](docs/vtune.md) for more details -- `noop`: no measurement is performed + +- `cycles`: The number of CPU cycles elapsed. + +- `perf-counters`: A selection of common `perf` counters (CPU cycles, + instructions retired, cache accesses, cache misses); only available on Linux. + +- `callgrind`: Uses Valgrind's Callgrind to count instructions retired and + simulate caches and branch prediction. Mostly deterministic and very low + noise. Only available on Linux with the `callgrind` feature (on by default). + +- `vtune`: Record each phase as a VTune task for analysis; see [this help + documentation](docs/vtune.md) for more details. + +- `noop`: No measurement is performed. For example, run: @@ -208,6 +216,12 @@ For example, run: $ cargo run -- benchmark --measure perf-counters ... ``` +For `callgrind`, Sightglass runs benchmark children under `setarch -R valgrind` +with a fixed cache model and forces single-threaded Wasmtime compilation with +`RAYON_NUM_THREADS=1` so results stay stable across machines. Use the same +Valgrind version when comparing data recorded on different machines for best +results.
+ ### Getting Raw JSON or CSV Results If you don't want the results to be summarized and displayed in a human-readable diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index ecbf8f5b..a5a70c67 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" authors = ["Sightglass Project Developers"] edition = "2021" +[features] +default = ["callgrind"] +callgrind = ["sightglass-recorder/callgrind"] + [dependencies] anyhow = "1.0" libloading = "0.9" @@ -24,10 +28,10 @@ csv = "1.1.6" regex = "1.5.4" vega_lite_4 = { git = "https://github.com/procyon-rs/vega_lite_4.rs" } minijinja = "2.10" +tempfile = "3.2.0" [dev-dependencies] assert_cmd = "1.0.4" env_logger = "0.8.3" predicates = "1.0.8" -tempfile = "3.2.0" scraper = "0.24" diff --git a/crates/cli/src/benchmark.rs b/crates/cli/src/benchmark.rs index 548b61ff..2d388b4c 100644 --- a/crates/cli/src/benchmark.rs +++ b/crates/cli/src/benchmark.rs @@ -11,11 +11,191 @@ use std::{ fs, io::{self, BufWriter, Write}, path::{Path, PathBuf}, - process::Command, - process::Stdio, + process::{Command, Stdio}, }; use structopt::StructOpt; +const DEFAULT_PROCESSES: usize = 10; +const DEFAULT_ITERATIONS_PER_PROCESS: usize = 10; + +#[cfg(all(target_os = "linux", feature = "callgrind"))] +mod callgrind { + use super::*; + use sightglass_recorder::measure::callgrind::CALLGRIND_OUT_DIR_ENV_VAR; + use tempfile::TempDir; + + const DEFAULT_CALLGRIND_PROCESSES: usize = 3; + const DEFAULT_CALLGRIND_ITERATIONS_PER_PROCESS: usize = 1; + const CACHE_MODEL_I1: &str = "32768,8,64"; + const CACHE_MODEL_D1: &str = "32768,8,64"; + const CACHE_MODEL_LL: &str = "8388608,16,64"; + + impl PreparedCommand { + #[cfg(all(target_os = "linux", feature = "callgrind"))] + fn with_tempdir(mut self, tempdir: tempfile::TempDir) -> Self { + self.tempdir = Some(tempdir); + self + } + + #[cfg(all(target_os = "linux", feature = "callgrind"))] + fn tempdir(&self) -> Option<&tempfile::TempDir> { + self.tempdir.as_ref() + } + } 
+ + impl BenchmarkCommand { + pub(super) fn default_processes(&self) -> usize { + if self.uses_callgrind() { + DEFAULT_CALLGRIND_PROCESSES + } else { + DEFAULT_PROCESSES + } + } + + pub(super) fn default_iterations_per_process(&self) -> usize { + if self.uses_callgrind() { + DEFAULT_CALLGRIND_ITERATIONS_PER_PROCESS + } else { + DEFAULT_ITERATIONS_PER_PROCESS + } + } + + pub(super) fn validate(&self) -> Result<()> { + if self.uses_callgrind() && self.measures.len() > 1 { + anyhow::bail!( + "callgrind must be used by itself and cannot be combined with other measures" + ); + } + + Ok(()) + } + + pub(super) fn should_wrap_subprocesses(&self) -> bool { + self.uses_callgrind() && std::env::var_os(CALLGRIND_OUT_DIR_ENV_VAR).is_none() + } + + pub(super) fn prepare_command( + &self, + this_exe: &Path, + engine: &Path, + wasm: &Path, + ) -> Result { + ensure_tools_available()?; + + let callgrind_output = TempDir::new().context("failed to create callgrind tempdir")?; + let mut prepared = PreparedCommand::new( + Command::new("setarch"), + "callgrind benchmark subprocess", + "failed to run callgrind benchmark subprocess", + "failed to read callgrind benchmark subprocess's results", + ) + .with_tempdir(callgrind_output); + let output_dir = prepared.tempdir().unwrap().path().to_path_buf(); + + prepared + .command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .arg(this_arch()) + .arg("-R") + .arg("valgrind") + .arg("--tool=callgrind") + .arg("--cache-sim=yes") + .arg("--branch-sim=yes") + .arg(format!("--I1={CACHE_MODEL_I1}")) + .arg(format!("--D1={CACHE_MODEL_D1}")) + .arg(format!("--LL={CACHE_MODEL_LL}")) + .arg("--instr-atstart=no") + .arg(format!( + "--callgrind-out-file={}", + output_dir.join("callgrind.out.%p").display() + )) + .arg(this_exe); + prepared.command.env("RAYON_NUM_THREADS", "1"); + prepared.command.env(CALLGRIND_OUT_DIR_ENV_VAR, &output_dir); + self.add_benchmark_child_args( + &mut prepared.command, + engine, + wasm, + 1, + 
self.iterations_per_process(), + Format::Json, + ); + + Ok(prepared) + } + + fn uses_callgrind(&self) -> bool { + self.measures + .iter() + .any(|measure| matches!(measure, MeasureType::Callgrind)) + } + } + + fn ensure_tools_available() -> Result<()> { + ensure_command_succeeds( + "valgrind", + ["--version"], + "callgrind measurement requires `valgrind` on PATH", + )?; + ensure_command_succeeds( + "setarch", + [this_arch(), "-R", "true"], + "callgrind measurement requires `setarch -R` support to disable ASLR", + )?; + Ok(()) + } + + fn ensure_command_succeeds(program: &str, args: I, error_message: &str) -> Result<()> + where + I: IntoIterator, + S: AsRef, + { + let status = Command::new(program) + .args(args) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .with_context(|| format!("{error_message}: failed to spawn `{program}`"))?; + anyhow::ensure!(status.success(), "{error_message}"); + Ok(()) + } +} + +#[cfg(not(all(target_os = "linux", feature = "callgrind")))] +mod callgrind { + use super::*; + + impl BenchmarkCommand { + pub(super) fn default_processes(&self) -> usize { + DEFAULT_PROCESSES + } + + pub(super) fn default_iterations_per_process(&self) -> usize { + DEFAULT_ITERATIONS_PER_PROCESS + } + + pub(super) fn validate(&self) -> Result<()> { + Ok(()) + } + + pub(super) fn should_wrap_subprocesses(&self) -> bool { + false + } + + pub(super) fn prepare_command( + &self, + _this_exe: &Path, + _engine: &Path, + _wasm: &Path, + ) -> Result { + unreachable!() + } + } +} + /// Measure compilation, instantiation, and execution of a Wasm file. /// /// The total number of samples taken for each Wasm benchmark is `PROCESSES * @@ -58,8 +238,8 @@ pub struct BenchmarkCommand { engine_flags: Option, /// How many processes should we use for each Wasm benchmark? 
- #[structopt(long = "processes", default_value = "10", value_name = "PROCESSES")] - processes: usize, + #[structopt(long = "processes", value_name = "PROCESSES")] + processes: Option, /// Override the "engine" name; this is useful if running experiments that might /// not have a differentiating engine name (e.g. if customizing the flags). @@ -72,10 +252,9 @@ pub struct BenchmarkCommand { /// How many times should we run a benchmark in a single process? #[structopt( long = "iterations-per-process", - default_value = "10", value_name = "NUMBER_OF_ITERATIONS_PER_PROCESS" )] - iterations_per_process: usize, + iterations_per_process: Option, /// Output raw data, rather than the summarized, human-readable analysis /// results. @@ -92,10 +271,17 @@ pub struct BenchmarkCommand { #[structopt(short = "o", long = "output-file")] output_file: Option, - /// The type of measurement to use (cycles, insts-retired, perf-counters, noop, vtune) - /// when recording the benchmark performance. This option can be specified more than - /// once if to record multiple measurements. If no measures are specified, - /// the "cycles" measure will be used. + /// The type of measurement to use (cycles, insts-retired, perf-counters, + /// noop, vtune, callgrind) when recording benchmark performance. + /// + /// This option can be specified more than once to record multiple measures, + /// except for `callgrind`, which must be used by itself. + /// + /// If no measures are specified, the "cycles" measure is used. + /// + /// `callgrind` defaults to fewer processes and iterations per process + /// because it runs the benchmarking processes under Valgrind, which is + /// slower but also more deterministic and less noisy. 
#[structopt(long = "measure", short = "m", multiple = true)] measures: Vec, @@ -146,23 +332,41 @@ pub struct BenchmarkCommand { impl BenchmarkCommand { pub fn execute(&self) -> Result<()> { - anyhow::ensure!(self.processes > 0, "processes must be greater than zero"); + anyhow::ensure!(self.processes() > 0, "processes must be greater than zero"); anyhow::ensure!( - self.iterations_per_process > 0, + self.iterations_per_process() > 0, "iterations-per-process must be greater than zero" ); anyhow::ensure!( !self.engines.is_empty(), "must pass one or more engines to benchmark with -e/--engine" ); + self.validate()?; + + if self.should_wrap_subprocesses() { + let this_exe = + std::env::current_exe().context("failed to get the current executable's path")?; + return self.execute_in_subprocesses("callgrind iterations", |engine, wasm| { + self.prepare_command(&this_exe, engine, wasm) + }); + } - if self.processes == 1 { + if self.processes() == 1 { self.execute_in_current_process() } else { self.execute_in_multiple_processes() } } + fn processes(&self) -> usize { + self.processes.unwrap_or_else(|| self.default_processes()) + } + + fn iterations_per_process(&self) -> usize { + self.iterations_per_process + .unwrap_or_else(|| self.default_iterations_per_process()) + } + /// Execute benchmark(s) in the provided engine(s) using the current process. pub fn execute_in_current_process(&self) -> Result<()> { let mut output_file: Box = if let Some(file) = self.output_file.as_ref() { @@ -258,7 +462,7 @@ impl BenchmarkCommand { // Run the benchmark (compilation, instantiation, and execution) several times in // this process. - for _ in 0..self.iterations_per_process { + for _ in 0..self.iterations_per_process() { match self.benchmark_phase { None => { let new_engine = benchmark::all(engine.take().unwrap(), &bytes)?; @@ -374,23 +578,58 @@ impl BenchmarkCommand { /// Execute the benchmark(s) by spawning multiple processes. 
Each of the spawned processes will /// run the `execute_in_current_process` function above. fn execute_in_multiple_processes(&self) -> Result<()> { + let this_exe = + std::env::current_exe().context("failed to get the current executable's path")?; + self.execute_in_subprocesses("iterations", |engine, wasm| { + let mut prepared = PreparedCommand::new( + Command::new(&this_exe), + "benchmark subprocess", + "failed to run benchmark subprocess", + "failed to read benchmark subprocess's results", + ); + prepared + .command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::inherit()); + self.add_benchmark_child_args( + &mut prepared.command, + engine, + wasm, + 1, + self.iterations_per_process(), + Format::Json, + ); + Ok(prepared) + }) + } + + fn execute_in_subprocesses( + &self, + iteration_label: &str, + mut prepare_command: F, + ) -> Result<()> + where + F: FnMut(&Path, &Path) -> Result, + { let mut output_file: Box = if let Some(file) = self.output_file.as_ref() { Box::new(BufWriter::new(fs::File::create(file)?)) } else { Box::new(io::stdout()) }; - let this_exe = - std::env::current_exe().context("failed to get the current executable's path")?; - let wasm_files: Vec<_> = self.benchmarks.iter().flat_map(|b| b.paths()).collect(); eprintln!( - "\nRunning {} total iterations ({} engines * {} benchmarks * {} processes * {} iterations per process)", - self.engines.len() * wasm_files.len() * self.processes * self.iterations_per_process, + "\nRunning {} total {} ({} engines * {} benchmarks * {} processes * {} iterations per process)", + self.engines.len() + * wasm_files.len() + * self.processes() + * self.iterations_per_process(), + iteration_label, self.engines.len(), wasm_files.len(), - self.processes, - self.iterations_per_process + self.processes(), + self.iterations_per_process() ); eprint!("\n[Done] [Elapsed ] [Est. Rem. 
]"); @@ -398,7 +637,6 @@ impl BenchmarkCommand { // us avoid some measurement bias from CPU state transitions that aren't // constrained within the duration of process execution, like dynamic // CPU throttling due to overheating. - let mut rng = SmallRng::seed_from_u64(0x1337_4242); // Worklist that we randomly sample from. @@ -411,7 +649,7 @@ impl BenchmarkCommand { let engine = check_engine_path(engine)?; for wasm in wasm_files.iter().cloned() { - choices.push((engine.clone(), wasm, self.processes)); + choices.push((engine.clone(), wasm, self.processes())); } } @@ -419,65 +657,22 @@ impl BenchmarkCommand { let mut measurements = vec![]; let mut i = 0; - let n = choices.len() * self.processes; + let n = choices.len() * self.processes(); let start = std::time::Instant::now(); while !choices.is_empty() { let index = rng.gen_range(0, choices.len()); let (engine, wasm, procs_left) = &mut choices[index]; - - let mut command = Command::new(&this_exe); - command - .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::inherit()) - .arg("benchmark") - .arg("--processes") - .arg("1") - .arg("--iterations-per-process") - .arg(self.iterations_per_process.to_string()) - .arg("--engine") - .arg(&engine) - .args( - self.measures - .iter() - .flat_map(|m| ["--measure".to_string(), m.to_string()]), - ) - .arg("--raw") - .arg("--output-format") - // Always use JSON when privately communicating with a - // subprocess. 
- .arg(Format::Json.to_string()); - - if self.pin { - command.arg("--pin"); - } - - if self.keep_logs { - command.arg("--keep-logs"); - } - - if self.small_workloads { - command.env("WASM_BENCH_USE_SMALL_WORKLOAD", "1"); - } - - if let Some(phase) = self.benchmark_phase { - command.arg("--benchmark-phase").arg(phase.to_string()); - } - - if let Some(flags) = &self.engine_flags { - command.arg(format!("--engine-flags={flags}")); - } - - command.arg("--").arg(&wasm); - - let output = command + let mut prepared = prepare_command(engine, wasm)?; + let output = prepared + .command .output() - .context("failed to run benchmark subprocess")?; + .context(prepared.failure_context)?; anyhow::ensure!( output.status.success(), - "benchmark subprocess did not exit successfully: {}\nstderr: {}\nstdout: {}", + "{} did not exit successfully: {}\nstderr: {}\nstdout: {}", + prepared.status_label, output.status, String::from_utf8_lossy(&output.stderr), String::from_utf8_lossy(&output.stdout) @@ -515,7 +710,7 @@ impl BenchmarkCommand { // accumulation. 
measurements.extend( serde_json::from_slice::>>(&output.stdout) - .context("failed to read benchmark subprocess's results")?, + .context(prepared.result_context)?, ); *procs_left -= 1; @@ -563,6 +758,82 @@ impl BenchmarkCommand { }; Ok(working_dir) } + + fn add_benchmark_child_args( + &self, + command: &mut Command, + engine: &Path, + wasm: &Path, + processes: usize, + iterations_per_process: usize, + output_format: Format, + ) { + command + .arg("benchmark") + .arg("--processes") + .arg(processes.to_string()) + .arg("--iterations-per-process") + .arg(iterations_per_process.to_string()) + .arg("--engine") + .arg(engine) + .args( + self.measures + .iter() + .flat_map(|measure| ["--measure".to_string(), measure.to_string()]), + ) + .arg("--raw") + .arg("--output-format") + .arg(output_format.to_string()); + + if self.pin { + command.arg("--pin"); + } + + if self.keep_logs { + command.arg("--keep-logs"); + } + + if self.small_workloads { + command.env("WASM_BENCH_USE_SMALL_WORKLOAD", "1"); + } + + if let Some(phase) = self.benchmark_phase { + command.arg("--benchmark-phase").arg(phase.to_string()); + } + + if let Some(flags) = self.engine_flags.as_deref() { + command.arg(format!("--engine-flags={flags}")); + } + + command.arg("--").arg(wasm); + } +} + +struct PreparedCommand { + command: Command, + #[cfg(all(target_os = "linux", feature = "callgrind"))] + tempdir: Option, + status_label: &'static str, + failure_context: &'static str, + result_context: &'static str, +} + +impl PreparedCommand { + fn new( + command: Command, + status_label: &'static str, + failure_context: &'static str, + result_context: &'static str, + ) -> Self { + Self { + command, + #[cfg(all(target_os = "linux", feature = "callgrind"))] + tempdir: None, + status_label, + failure_context, + result_context, + } + } } fn this_arch() -> &'static str { @@ -758,4 +1029,32 @@ instantiation :: nanoseconds :: benchmarks/pulldown-cmark/benchmark.wasm assert_eq!(actual.trim(), expected.trim()); Ok(()) } + + 
#[cfg(all(target_os = "linux", feature = "callgrind"))] + #[test] + fn callgrind_must_be_exclusive() { + let command = BenchmarkCommand { + benchmarks: vec![], + engines: vec!["/tmp/engine.so".into()], + engine_flags: None, + processes: None, + names: None, + iterations_per_process: None, + raw: false, + output_format: Format::Json, + output_file: None, + measures: vec![MeasureType::Callgrind, MeasureType::Cycles], + small_workloads: false, + working_dir: None, + benchmark_phase: None, + significance_level: 0.01, + pin: false, + keep_logs: false, + }; + + assert_eq!( + command.validate().unwrap_err().to_string(), + "callgrind must be used by itself and cannot be combined with other measures" + ); + } } diff --git a/crates/recorder/Cargo.toml b/crates/recorder/Cargo.toml index cebef5b0..92c15444 100644 --- a/crates/recorder/Cargo.toml +++ b/crates/recorder/Cargo.toml @@ -5,6 +5,9 @@ description = "A measurement tool for compiling and running a single Wasm benchm authors = ["Sightglass Project Developers"] edition = "2021" +[features] +callgrind = ["dep:valgrind-requests"] + [dependencies] anyhow = "1.0" libloading = "0.9" @@ -19,6 +22,7 @@ ittapi = "0.3" [target.'cfg(target_os = "linux")'.dependencies] perf-event = "0.4" +valgrind-requests = { version = "1.1.0", optional = true } # On supported platforms, we use libc's `sched_getcpu` to log the processor ID. libc = "0.2" diff --git a/crates/recorder/src/measure/callgrind.rs b/crates/recorder/src/measure/callgrind.rs new file mode 100644 index 00000000..3eb450f6 --- /dev/null +++ b/crates/recorder/src/measure/callgrind.rs @@ -0,0 +1,322 @@ +//! Callgrind-backed measurement and dump parsing. +//! +//! This measure is active only when the benchmark child is already running +//! under Valgrind Callgrind. At each phase boundary it zeroes the current +//! counters, triggers a labeled dump, reads the matching `callgrind.out` part +//! file, and converts the recorded events into Sightglass measurements. 
+ +use super::{Measure, Measurements}; +use anyhow::{anyhow, ensure, Context, Result}; +use sightglass_data::Phase; +use std::{ + ffi::CString, + fs, + path::{Path, PathBuf}, +}; +use valgrind_requests::{callgrind, valgrind}; + +/// Environment variable used by the CLI parent to tell the child where +/// Callgrind dump files will be written. +pub const CALLGRIND_OUT_DIR_ENV_VAR: &str = "SIGHTGLASS_CALLGRIND_OUT_DIR"; + +const CLIENT_REQUEST_PREFIX: &str = "Client Request: "; +const EVENT_MAPPINGS: &[(&str, &str)] = &[ + ("Ir", "instructions-retired"), + ("Dr", "data-reads"), + ("Dw", "data-writes"), + ("I1mr", "l1-icache-misses"), + ("D1mr", "l1-dcache-read-misses"), + ("D1mw", "l1-dcache-write-misses"), + ("ILmr", "ll-icache-misses"), + ("DLmr", "ll-dcache-read-misses"), + ("DLmw", "ll-dcache-write-misses"), + ("Bc", "conditional-branches"), + ("Bcm", "conditional-branch-misses"), + ("Bi", "indirect-branches"), + ("Bim", "indirect-branch-misses"), +]; + +/// A `Measure` implementation that uses Callgrind to get low-noise +/// measurements. +pub struct CallgrindMeasure { + output_dir: Option, + next_dump_part: u32, +} + +impl Default for CallgrindMeasure { + fn default() -> Self { + Self::new() + } +} + +impl CallgrindMeasure { + /// Create a new callgrind measure for the current process. 
+ pub fn new() -> Self { + Self { + output_dir: std::env::var_os(CALLGRIND_OUT_DIR_ENV_VAR).map(PathBuf::from), + next_dump_part: 1, + } + } + + fn running_under_valgrind(&self) -> bool { + let running_under_valgrind = valgrind::running_on_valgrind() > 0; + assert!( + running_under_valgrind || self.output_dir.is_none(), + "callgrind measure requested but benchmark process is not running under Valgrind", + ); + running_under_valgrind + } + + fn parse_dump_for_phase( + &self, + phase: Phase, + iteration: u32, + part: u32, + ) -> Result { + let output_dir = self.output_dir.as_ref().ok_or_else(|| { + anyhow!( + "callgrind output directory is not configured; expected {CALLGRIND_OUT_DIR_ENV_VAR}" + ) + })?; + let dump_path = output_dir.join(format!("callgrind.out.{}.{}", std::process::id(), part)); + ensure!( + dump_path.exists(), + "no callgrind output found at {} — is valgrind installed and on PATH?", + dump_path.display() + ); + + let dump = parse_callgrind_dump_file(&dump_path)?; + ensure!( + dump.pid == std::process::id(), + "callgrind dump pid mismatch: expected {}, found {} in {}", + std::process::id(), + dump.pid, + dump_path.display() + ); + ensure!( + dump.part == part, + "callgrind dump part mismatch: expected {part}, found {} in {}", + dump.part, + dump_path.display() + ); + + let expected_label = format!("{phase}/{iteration}"); + let actual_label = dump.label.as_deref().ok_or_else(|| { + anyhow!( + "callgrind dump {} is missing a client-request label", + dump_path.display() + ) + })?; + ensure!( + actual_label == expected_label, + "callgrind dump mismatch: expected {expected_label}, got {actual_label}" + ); + + Ok(dump) + } +} + +impl Measure for CallgrindMeasure { + fn start(&mut self, _phase: Phase) { + if !self.running_under_valgrind() { + return; + } + + callgrind::start_instrumentation(); + callgrind::zero_stats(); + } + + fn end(&mut self, phase: Phase, measurements: &mut Measurements) { + if !self.running_under_valgrind() { + return; + } + + let label 
= CString::new(format!("{phase}/{}", measurements.iteration())).unwrap(); + callgrind::dump_stats_at(label.as_c_str()); + + let dump = self + .parse_dump_for_phase(phase, measurements.iteration(), self.next_dump_part) + .unwrap_or_else(|error| panic!("failed to read callgrind dump: {error:#}")); + self.next_dump_part += 1; + + measurements.reserve(dump.counts.len()); + for event in dump.counts { + measurements.add(phase, event.name.into(), event.count); + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct ParsedCallgrindDump { + pid: u32, + part: u32, + label: Option, + counts: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct CallgrindEventCount { + name: &'static str, + count: u64, +} + +fn parse_callgrind_dump_file(path: &Path) -> Result { + let contents = fs::read_to_string(path) + .with_context(|| format!("failed to read callgrind dump {}", path.display()))?; + parse_callgrind_dump(&contents) +} + +fn parse_callgrind_dump(contents: &str) -> Result { + let mut events = None; + let mut summary = None; + let mut totals = None; + let mut pid = None; + let mut part = None; + let mut label = None; + + for line in contents.lines() { + if let Some(raw_events) = line.strip_prefix("events: ") { + events = Some(raw_events.split_whitespace().collect::>()); + } else if let Some(raw_counts) = line.strip_prefix("summary: ") { + summary = Some(parse_counts(raw_counts)?); + } else if let Some(raw_counts) = line.strip_prefix("totals: ") { + totals = Some(parse_counts(raw_counts)?); + } else if let Some(raw_pid) = line.strip_prefix("pid: ") { + pid = Some(raw_pid.trim().parse().context("invalid callgrind pid")?); + } else if let Some(raw_part) = line.strip_prefix("part: ") { + part = Some(raw_part.trim().parse().context("invalid callgrind part")?); + } else if let Some(trigger) = line.strip_prefix("desc: Trigger: ") { + label = trigger + .strip_prefix(CLIENT_REQUEST_PREFIX) + .map(ToOwned::to_owned); + } + } + + let events = events.ok_or_else(|| 
anyhow!("callgrind dump is missing an events header"))?; + let counts = summary + .or(totals) + .ok_or_else(|| anyhow!("callgrind dump is missing a summary/totals line"))?; + ensure!( + events.len() == counts.len(), + "callgrind events/count mismatch: {} events, {} counts", + events.len(), + counts.len() + ); + + let mut parsed_counts = Vec::with_capacity(EVENT_MAPPINGS.len()); + for (event, count) in events.into_iter().zip(counts) { + if let Some(name) = event_name(event) { + parsed_counts.push(CallgrindEventCount { name, count }); + } + } + + Ok(ParsedCallgrindDump { + pid: pid.ok_or_else(|| anyhow!("callgrind dump is missing pid"))?, + part: part.ok_or_else(|| anyhow!("callgrind dump is missing part"))?, + label, + counts: parsed_counts, + }) +} + +fn parse_counts(raw_counts: &str) -> Result<Vec<u64>> { + raw_counts + .split_whitespace() + .map(|count| { + count + .parse::<u64>() + .with_context(|| format!("invalid callgrind count: {count}")) + }) + .collect() +} + +fn event_name(raw_event: &str) -> Option<&'static str> { + EVENT_MAPPINGS + .iter() + .find_map(|(event, name)| (*event == raw_event).then_some(*name)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_client_request_dump() -> Result<()> { + let dump = parse_callgrind_dump(include_str!( + "../../../../test/fixtures/callgrind.client-request.out" + ))?; + + assert_eq!(dump.pid, 773940); + assert_eq!(dump.part, 1); + assert_eq!(dump.label.as_deref(), Some("execution/7")); + assert_eq!( + dump.counts, + vec![ + CallgrindEventCount { + name: "instructions-retired", + count: 790139, + }, + CallgrindEventCount { + name: "data-reads", + count: 230038, + }, + CallgrindEventCount { + name: "data-writes", + count: 320058, + }, + CallgrindEventCount { + name: "l1-icache-misses", + count: 17, + }, + CallgrindEventCount { + name: "l1-dcache-read-misses", + count: 0, + }, + CallgrindEventCount { + name: "l1-dcache-write-misses", + count: 8, + }, + CallgrindEventCount { + name: "ll-icache-misses", + count: 
17, + }, + CallgrindEventCount { + name: "ll-dcache-read-misses", + count: 0, + }, + CallgrindEventCount { + name: "ll-dcache-write-misses", + count: 8, + }, + CallgrindEventCount { + name: "conditional-branches", + count: 30004, + }, + CallgrindEventCount { + name: "conditional-branch-misses", + count: 8, + }, + CallgrindEventCount { + name: "indirect-branches", + count: 1, + }, + CallgrindEventCount { + name: "indirect-branch-misses", + count: 1, + }, + ] + ); + + Ok(()) + } + + #[test] + fn ignores_program_termination_label() -> Result<()> { + let dump = parse_callgrind_dump(include_str!( + "../../../../test/fixtures/callgrind.program-termination.out" + ))?; + + assert_eq!(dump.label, None); + assert_eq!(dump.part, 2); + Ok(()) + } +} diff --git a/crates/recorder/src/measure/mod.rs b/crates/recorder/src/measure/mod.rs index c6b2e585..8675212f 100644 --- a/crates/recorder/src/measure/mod.rs +++ b/crates/recorder/src/measure/mod.rs @@ -38,6 +38,11 @@ impl<'a> Measurements<'a> { self.measurements.reserve(capacity); } + /// Get the current iteration index. + pub fn iteration(&self) -> u32 { + self.iteration + } + /// Add a measurement of the given event for the given phase to this /// `Measurements` collection. pub fn add(&mut self, phase: Phase, event: Cow<'a, str>, count: u64) { @@ -77,6 +82,8 @@ pub trait Measure: 'static { fn end(&mut self, phase: Phase, measurements: &mut Measurements); } +#[cfg(all(target_os = "linux", feature = "callgrind"))] +pub mod callgrind; #[cfg(target_os = "linux")] pub mod counters; #[cfg(target_os = "linux")] @@ -116,6 +123,11 @@ pub enum MeasureType { /// Measure instructions retired. #[cfg(target_os = "linux")] InstsRetired, + + /// Measure deterministic instruction, cache, and branch simulation events + /// under Valgrind Callgrind. 
+ #[cfg(all(target_os = "linux", feature = "callgrind"))] + Callgrind, } impl fmt::Display for MeasureType { @@ -129,6 +141,8 @@ impl fmt::Display for MeasureType { MeasureType::PerfCounters => write!(f, "perf-counters"), #[cfg(target_os = "linux")] MeasureType::InstsRetired => write!(f, "insts-retired"), + #[cfg(all(target_os = "linux", feature = "callgrind"))] + MeasureType::Callgrind => write!(f, "callgrind"), } } } @@ -141,10 +155,25 @@ impl FromStr for MeasureType { "time" => Ok(Self::Time), "cycles" => Ok(Self::Cycles), "vtune" => Ok(Self::VTune), + #[cfg(target_os = "linux")] "perf-counters" => Ok(Self::PerfCounters), + #[cfg(not(target_os = "linux"))] + "perf-counters" => Err("`perf-counters` measure is only available on Linux"), + #[cfg(target_os = "linux")] "insts-retired" => Ok(Self::InstsRetired), + #[cfg(not(target_os = "linux"))] + "insts-retired" => Err("`insts-retired` measure is only available on Linux"), + + #[cfg(all(target_os = "linux", feature = "callgrind"))] + "callgrind" => Ok(Self::Callgrind), + #[cfg(not(all(target_os = "linux", feature = "callgrind")))] + "callgrind" => Err( + "`callgrind` measure is only available on Linux and when the `callgrind` cargo \ + feature is enabled", + ), + _ => Err("unknown measure type"), } } @@ -164,6 +193,8 @@ impl MeasureType { Self::PerfCounters => Box::new(counters::CounterMeasure::new()), #[cfg(target_os = "linux")] Self::InstsRetired => Box::new(insts::InstsRetiredMeasure::new()), + #[cfg(all(target_os = "linux", feature = "callgrind"))] + Self::Callgrind => Box::new(callgrind::CallgrindMeasure::new()), } } } diff --git a/test/fixtures/callgrind.client-request.out b/test/fixtures/callgrind.client-request.out new file mode 100644 index 00000000..5a96f1a2 --- /dev/null +++ b/test/fixtures/callgrind.client-request.out @@ -0,0 +1,18 @@ +# callgrind format +version: 1 +creator: callgrind-3.20.0.GIT +pid: 773940 +cmd: target/debug/cgprobe +part: 1 + + +desc: I1 cache: 32768 B, 64 B, 8-way associative 
+desc: D1 cache: 32768 B, 64 B, 8-way associative +desc: LL cache: 8388608 B, 64 B, 16-way associative + +desc: Timerange: Basic block 0 - 140028 +desc: Trigger: Client Request: execution/7 + +positions: line +events: Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim +summary: 790139 230038 320058 17 0 8 17 0 8 30004 8 1 1 diff --git a/test/fixtures/callgrind.program-termination.out b/test/fixtures/callgrind.program-termination.out new file mode 100644 index 00000000..7346470b --- /dev/null +++ b/test/fixtures/callgrind.program-termination.out @@ -0,0 +1,18 @@ +# callgrind format +version: 1 +creator: callgrind-3.20.0.GIT +pid: 773940 +cmd: target/debug/cgprobe +part: 2 + + +desc: I1 cache: 32768 B, 64 B, 8-way associative +desc: D1 cache: 32768 B, 64 B, 8-way associative +desc: LL cache: 8388608 B, 64 B, 16-way associative + +desc: Timerange: Basic block 140028 - 150282 +desc: Trigger: Program termination + +positions: line +events: Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim +summary: 18446744073708761429 18446744073709321564 18446744073709231539 18446744073709551595 18446744073709551613 18446744073709551605 18446744073709551595 18446744073709551613 18446744073709551605 18446744073709521611 18446744073709551607 18446744073709551614 18446744073709551614 From 0724e1bfba0627e3c87a0fd9ae5f21800c68e0c1 Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Fri, 5 Jun 2026 15:46:59 -0700 Subject: [PATCH 2/2] Add help text for process/iteration defaults --- crates/cli/src/benchmark.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crates/cli/src/benchmark.rs b/crates/cli/src/benchmark.rs index 2d388b4c..c42c7550 100644 --- a/crates/cli/src/benchmark.rs +++ b/crates/cli/src/benchmark.rs @@ -238,6 +238,9 @@ pub struct BenchmarkCommand { engine_flags: Option, /// How many processes should we use for each Wasm benchmark? + /// + /// Defaults to `10`, unless using the `callgrind` measure, in which case the + /// default is `3`. 
#[structopt(long = "processes", value_name = "PROCESSES")] processes: Option, @@ -250,6 +253,9 @@ pub struct BenchmarkCommand { names: Option>, /// How many times should we run a benchmark in a single process? + /// + /// Defaults to `10`, unless using the `callgrind` measure, in which case the + /// default is `1`. #[structopt( long = "iterations-per-process", value_name = "NUMBER_OF_ITERATIONS_PER_PROCESS"