diff --git a/Cargo.lock b/Cargo.lock index 3d966a7d..4949682e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -99,6 +99,26 @@ dependencies = [ "special", ] +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags 2.10.0", + "cexpr", + "clang-sys", + "itertools", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.109", +] + [[package]] name = "bitflags" version = "0.7.0" @@ -197,6 +217,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -222,6 +251,17 @@ dependencies = [ "windows-link", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading 0.8.9", +] + [[package]] name = "clap" version = "2.34.0" @@ -369,6 +409,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "cty" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" + [[package]] name = "darling" version = "0.20.11" @@ -831,6 +877,12 @@ dependencies = [ "wasip2", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "h2" version = "0.3.27" @@ -870,6 +922,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -1241,6 +1299,16 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if 1.0.4", + "windows-link", +] + [[package]] name = "libloading" version = "0.9.0" @@ -1354,6 +1422,12 @@ dependencies = [ "serde", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "mio" version = "1.1.0" @@ -1404,6 +1478,16 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -1804,6 +1888,16 @@ dependencies = [ "log", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.109", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -2054,6 +2148,21 @@ dependencies = [ "winreg", ] +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + +[[package]] +name = "rustc_version" 
+version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.2" @@ -2298,7 +2407,7 @@ dependencies = [ "assert_cmd", "csv", "env_logger 0.8.4", - "libloading", + "libloading 0.9.0", "log", "minijinja", "predicates 1.0.8", @@ -2356,7 +2465,7 @@ dependencies = [ "ittapi", "lazy_static", "libc", - "libloading", + "libloading 0.9.0", "log", "perf-event", "precision", @@ -2365,6 +2474,7 @@ dependencies = [ "sightglass-build", "sightglass-data", "thiserror", + "valgrind-requests", "wat", ] @@ -2489,13 +2599,34 @@ version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro-error", "proc-macro2", "quote", "syn 1.0.109", ] +[[package]] +name = "strum" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9628de9b8791db39ceda2b119bbe13134770b56c138ec1d3af810d045c04f9bd" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.109", +] + [[package]] name = "subtle" version = "2.6.1" @@ -2819,6 +2950,21 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "valgrind-requests" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "796c44118a551cc842f2ce6fce4983b86e282ec99b6b5e80b09dfeab830e7755" +dependencies = [ + "bindgen", + "cc", + "cfg-if 1.0.4", + "cty", + "regex", + 
"rustc_version", + "strum", +] + [[package]] name = "vcpkg" version = "0.2.15" diff --git a/README.md b/README.md index 72043dd7..c24242ae 100644 --- a/README.md +++ b/README.md @@ -195,12 +195,20 @@ note that measuring using CPU cycles alone can be problematic (e.g., CPU frequency changes, context switches, etc.). Several _measures_ can be configured using the `--measure` option: -- `cycles`: the number of CPU cycles elapsed -- `perf-counters`: a selection of common `perf` counters (CPU cycles, - instructions retired, cache accesses, cache misses); only available on Linux -- `vtune`: record each phase as a VTune task for analysis; see [this help - documentation](docs/vtune.md) for more details -- `noop`: no measurement is performed + +- `cycles`: The number of CPU cycles elapsed. + +- `perf-counters`: A selection of common `perf` counters (CPU cycles, + instructions retired, cache accesses, cache misses); only available on Linux. + +- `callgrind`: Uses Valgrind's Callgrind to count instructions retired and + simulate caches and branch prediction. Mostly deterministic and very low + noise. Only available on Linux and when built with `--features callgrind`. + +- `vtune`: Record each phase as a VTune task for analysis; see [this help + documentation](docs/vtune.md) for more details. + +- `noop`: No measurement is performed. For example, run: @@ -208,6 +216,12 @@ For example, run: $ cargo run -- benchmark --measure perf-counters ... ``` +For `callgrind`, Sightglass runs benchmark children under `setarch -R valgrind` +with a fixed cache model and forces single-threaded Wasmtime compilation with +`RAYON_NUM_THREADS=1` so results stay stable across machines. Use the same +Valgrind version when comparing data recorded on different machines for best +results. 
+ ### Getting Raw JSON or CSV Results If you don't want the results to be summarized and displayed in a human-readable diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index ecbf8f5b..a5a70c67 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" authors = ["Sightglass Project Developers"] edition = "2021" +[features] +default = ["callgrind"] +callgrind = ["sightglass-recorder/callgrind"] + [dependencies] anyhow = "1.0" libloading = "0.9" @@ -24,10 +28,10 @@ csv = "1.1.6" regex = "1.5.4" vega_lite_4 = { git = "https://github.com/procyon-rs/vega_lite_4.rs" } minijinja = "2.10" +tempfile = "3.2.0" [dev-dependencies] assert_cmd = "1.0.4" env_logger = "0.8.3" predicates = "1.0.8" -tempfile = "3.2.0" scraper = "0.24" diff --git a/crates/cli/src/benchmark.rs b/crates/cli/src/benchmark.rs index 548b61ff..c42c7550 100644 --- a/crates/cli/src/benchmark.rs +++ b/crates/cli/src/benchmark.rs @@ -11,11 +11,191 @@ use std::{ fs, io::{self, BufWriter, Write}, path::{Path, PathBuf}, - process::Command, - process::Stdio, + process::{Command, Stdio}, }; use structopt::StructOpt; +const DEFAULT_PROCESSES: usize = 10; +const DEFAULT_ITERATIONS_PER_PROCESS: usize = 10; + +#[cfg(all(target_os = "linux", feature = "callgrind"))] +mod callgrind { + use super::*; + use sightglass_recorder::measure::callgrind::CALLGRIND_OUT_DIR_ENV_VAR; + use tempfile::TempDir; + + const DEFAULT_CALLGRIND_PROCESSES: usize = 3; + const DEFAULT_CALLGRIND_ITERATIONS_PER_PROCESS: usize = 1; + const CACHE_MODEL_I1: &str = "32768,8,64"; + const CACHE_MODEL_D1: &str = "32768,8,64"; + const CACHE_MODEL_LL: &str = "8388608,16,64"; + + impl PreparedCommand { + #[cfg(all(target_os = "linux", feature = "callgrind"))] + fn with_tempdir(mut self, tempdir: tempfile::TempDir) -> Self { + self.tempdir = Some(tempdir); + self + } + + #[cfg(all(target_os = "linux", feature = "callgrind"))] + fn tempdir(&self) -> Option<&tempfile::TempDir> { + self.tempdir.as_ref() + } + } 
+ + impl BenchmarkCommand { + pub(super) fn default_processes(&self) -> usize { + if self.uses_callgrind() { + DEFAULT_CALLGRIND_PROCESSES + } else { + DEFAULT_PROCESSES + } + } + + pub(super) fn default_iterations_per_process(&self) -> usize { + if self.uses_callgrind() { + DEFAULT_CALLGRIND_ITERATIONS_PER_PROCESS + } else { + DEFAULT_ITERATIONS_PER_PROCESS + } + } + + pub(super) fn validate(&self) -> Result<()> { + if self.uses_callgrind() && self.measures.len() > 1 { + anyhow::bail!( + "callgrind must be used by itself and cannot be combined with other measures" + ); + } + + Ok(()) + } + + pub(super) fn should_wrap_subprocesses(&self) -> bool { + self.uses_callgrind() && std::env::var_os(CALLGRIND_OUT_DIR_ENV_VAR).is_none() + } + + pub(super) fn prepare_command( + &self, + this_exe: &Path, + engine: &Path, + wasm: &Path, + ) -> Result { + ensure_tools_available()?; + + let callgrind_output = TempDir::new().context("failed to create callgrind tempdir")?; + let mut prepared = PreparedCommand::new( + Command::new("setarch"), + "callgrind benchmark subprocess", + "failed to run callgrind benchmark subprocess", + "failed to read callgrind benchmark subprocess's results", + ) + .with_tempdir(callgrind_output); + let output_dir = prepared.tempdir().unwrap().path().to_path_buf(); + + prepared + .command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .arg(this_arch()) + .arg("-R") + .arg("valgrind") + .arg("--tool=callgrind") + .arg("--cache-sim=yes") + .arg("--branch-sim=yes") + .arg(format!("--I1={CACHE_MODEL_I1}")) + .arg(format!("--D1={CACHE_MODEL_D1}")) + .arg(format!("--LL={CACHE_MODEL_LL}")) + .arg("--instr-atstart=no") + .arg(format!( + "--callgrind-out-file={}", + output_dir.join("callgrind.out.%p").display() + )) + .arg(this_exe); + prepared.command.env("RAYON_NUM_THREADS", "1"); + prepared.command.env(CALLGRIND_OUT_DIR_ENV_VAR, &output_dir); + self.add_benchmark_child_args( + &mut prepared.command, + engine, + wasm, + 1, + 
self.iterations_per_process(), + Format::Json, + ); + + Ok(prepared) + } + + fn uses_callgrind(&self) -> bool { + self.measures + .iter() + .any(|measure| matches!(measure, MeasureType::Callgrind)) + } + } + + fn ensure_tools_available() -> Result<()> { + ensure_command_succeeds( + "valgrind", + ["--version"], + "callgrind measurement requires `valgrind` on PATH", + )?; + ensure_command_succeeds( + "setarch", + [this_arch(), "-R", "true"], + "callgrind measurement requires `setarch -R` support to disable ASLR", + )?; + Ok(()) + } + + fn ensure_command_succeeds(program: &str, args: I, error_message: &str) -> Result<()> + where + I: IntoIterator, + S: AsRef, + { + let status = Command::new(program) + .args(args) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .with_context(|| format!("{error_message}: failed to spawn `{program}`"))?; + anyhow::ensure!(status.success(), "{error_message}"); + Ok(()) + } +} + +#[cfg(not(all(target_os = "linux", feature = "callgrind")))] +mod callgrind { + use super::*; + + impl BenchmarkCommand { + pub(super) fn default_processes(&self) -> usize { + DEFAULT_PROCESSES + } + + pub(super) fn default_iterations_per_process(&self) -> usize { + DEFAULT_ITERATIONS_PER_PROCESS + } + + pub(super) fn validate(&self) -> Result<()> { + Ok(()) + } + + pub(super) fn should_wrap_subprocesses(&self) -> bool { + false + } + + pub(super) fn prepare_command( + &self, + _this_exe: &Path, + _engine: &Path, + _wasm: &Path, + ) -> Result { + unreachable!() + } + } +} + /// Measure compilation, instantiation, and execution of a Wasm file. /// /// The total number of samples taken for each Wasm benchmark is `PROCESSES * @@ -58,8 +238,11 @@ pub struct BenchmarkCommand { engine_flags: Option, /// How many processes should we use for each Wasm benchmark? 
- #[structopt(long = "processes", default_value = "10", value_name = "PROCESSES")] - processes: usize, + /// + /// Defaults to `10`, unless using the `callgrind` measure, in which case the + /// default is `3`. + #[structopt(long = "processes", value_name = "PROCESSES")] + processes: Option, /// Override the "engine" name; this is useful if running experiments that might /// not have a differentiating engine name (e.g. if customizing the flags). @@ -70,12 +253,14 @@ pub struct BenchmarkCommand { names: Option>, /// How many times should we run a benchmark in a single process? + /// + /// Defaults to `10`, unless using the `callgrind` measure, in which case the + /// default is `1`. #[structopt( long = "iterations-per-process", - default_value = "10", value_name = "NUMBER_OF_ITERATIONS_PER_PROCESS" )] - iterations_per_process: usize, + iterations_per_process: Option, /// Output raw data, rather than the summarized, human-readable analysis /// results. @@ -92,10 +277,17 @@ pub struct BenchmarkCommand { #[structopt(short = "o", long = "output-file")] output_file: Option, - /// The type of measurement to use (cycles, insts-retired, perf-counters, noop, vtune) - /// when recording the benchmark performance. This option can be specified more than - /// once if to record multiple measurements. If no measures are specified, - /// the "cycles" measure will be used. + /// The type of measurement to use (cycles, insts-retired, perf-counters, + /// noop, vtune, callgrind) when recording benchmark performance. + /// + /// This option can be specified more than once to record multiple measures, + /// except for `callgrind`, which must be used by itself. + /// + /// If no measures are specified, the "cycles" measure is used. + /// + /// `callgrind` defaults to fewer processes and iterations per process + /// because it runs the benchmarking processes under Valgrind, which is + /// slower but also more deterministic and less noisy. 
#[structopt(long = "measure", short = "m", multiple = true)] measures: Vec, @@ -146,23 +338,41 @@ pub struct BenchmarkCommand { impl BenchmarkCommand { pub fn execute(&self) -> Result<()> { - anyhow::ensure!(self.processes > 0, "processes must be greater than zero"); + anyhow::ensure!(self.processes() > 0, "processes must be greater than zero"); anyhow::ensure!( - self.iterations_per_process > 0, + self.iterations_per_process() > 0, "iterations-per-process must be greater than zero" ); anyhow::ensure!( !self.engines.is_empty(), "must pass one or more engines to benchmark with -e/--engine" ); + self.validate()?; + + if self.should_wrap_subprocesses() { + let this_exe = + std::env::current_exe().context("failed to get the current executable's path")?; + return self.execute_in_subprocesses("callgrind iterations", |engine, wasm| { + self.prepare_command(&this_exe, engine, wasm) + }); + } - if self.processes == 1 { + if self.processes() == 1 { self.execute_in_current_process() } else { self.execute_in_multiple_processes() } } + fn processes(&self) -> usize { + self.processes.unwrap_or_else(|| self.default_processes()) + } + + fn iterations_per_process(&self) -> usize { + self.iterations_per_process + .unwrap_or_else(|| self.default_iterations_per_process()) + } + /// Execute benchmark(s) in the provided engine(s) using the current process. pub fn execute_in_current_process(&self) -> Result<()> { let mut output_file: Box = if let Some(file) = self.output_file.as_ref() { @@ -258,7 +468,7 @@ impl BenchmarkCommand { // Run the benchmark (compilation, instantiation, and execution) several times in // this process. - for _ in 0..self.iterations_per_process { + for _ in 0..self.iterations_per_process() { match self.benchmark_phase { None => { let new_engine = benchmark::all(engine.take().unwrap(), &bytes)?; @@ -374,23 +584,58 @@ impl BenchmarkCommand { /// Execute the benchmark(s) by spawning multiple processes. 
Each of the spawned processes will /// run the `execute_in_current_process` function above. fn execute_in_multiple_processes(&self) -> Result<()> { + let this_exe = + std::env::current_exe().context("failed to get the current executable's path")?; + self.execute_in_subprocesses("iterations", |engine, wasm| { + let mut prepared = PreparedCommand::new( + Command::new(&this_exe), + "benchmark subprocess", + "failed to run benchmark subprocess", + "failed to read benchmark subprocess's results", + ); + prepared + .command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::inherit()); + self.add_benchmark_child_args( + &mut prepared.command, + engine, + wasm, + 1, + self.iterations_per_process(), + Format::Json, + ); + Ok(prepared) + }) + } + + fn execute_in_subprocesses( + &self, + iteration_label: &str, + mut prepare_command: F, + ) -> Result<()> + where + F: FnMut(&Path, &Path) -> Result, + { let mut output_file: Box = if let Some(file) = self.output_file.as_ref() { Box::new(BufWriter::new(fs::File::create(file)?)) } else { Box::new(io::stdout()) }; - let this_exe = - std::env::current_exe().context("failed to get the current executable's path")?; - let wasm_files: Vec<_> = self.benchmarks.iter().flat_map(|b| b.paths()).collect(); eprintln!( - "\nRunning {} total iterations ({} engines * {} benchmarks * {} processes * {} iterations per process)", - self.engines.len() * wasm_files.len() * self.processes * self.iterations_per_process, + "\nRunning {} total {} ({} engines * {} benchmarks * {} processes * {} iterations per process)", + self.engines.len() + * wasm_files.len() + * self.processes() + * self.iterations_per_process(), + iteration_label, self.engines.len(), wasm_files.len(), - self.processes, - self.iterations_per_process + self.processes(), + self.iterations_per_process() ); eprint!("\n[Done] [Elapsed ] [Est. Rem. 
]"); @@ -398,7 +643,6 @@ impl BenchmarkCommand { // us avoid some measurement bias from CPU state transitions that aren't // constrained within the duration of process execution, like dynamic // CPU throttling due to overheating. - let mut rng = SmallRng::seed_from_u64(0x1337_4242); // Worklist that we randomly sample from. @@ -411,7 +655,7 @@ impl BenchmarkCommand { let engine = check_engine_path(engine)?; for wasm in wasm_files.iter().cloned() { - choices.push((engine.clone(), wasm, self.processes)); + choices.push((engine.clone(), wasm, self.processes())); } } @@ -419,65 +663,22 @@ impl BenchmarkCommand { let mut measurements = vec![]; let mut i = 0; - let n = choices.len() * self.processes; + let n = choices.len() * self.processes(); let start = std::time::Instant::now(); while !choices.is_empty() { let index = rng.gen_range(0, choices.len()); let (engine, wasm, procs_left) = &mut choices[index]; - - let mut command = Command::new(&this_exe); - command - .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::inherit()) - .arg("benchmark") - .arg("--processes") - .arg("1") - .arg("--iterations-per-process") - .arg(self.iterations_per_process.to_string()) - .arg("--engine") - .arg(&engine) - .args( - self.measures - .iter() - .flat_map(|m| ["--measure".to_string(), m.to_string()]), - ) - .arg("--raw") - .arg("--output-format") - // Always use JSON when privately communicating with a - // subprocess. 
- .arg(Format::Json.to_string()); - - if self.pin { - command.arg("--pin"); - } - - if self.keep_logs { - command.arg("--keep-logs"); - } - - if self.small_workloads { - command.env("WASM_BENCH_USE_SMALL_WORKLOAD", "1"); - } - - if let Some(phase) = self.benchmark_phase { - command.arg("--benchmark-phase").arg(phase.to_string()); - } - - if let Some(flags) = &self.engine_flags { - command.arg(format!("--engine-flags={flags}")); - } - - command.arg("--").arg(&wasm); - - let output = command + let mut prepared = prepare_command(engine, wasm)?; + let output = prepared + .command .output() - .context("failed to run benchmark subprocess")?; + .context(prepared.failure_context)?; anyhow::ensure!( output.status.success(), - "benchmark subprocess did not exit successfully: {}\nstderr: {}\nstdout: {}", + "{} did not exit successfully: {}\nstderr: {}\nstdout: {}", + prepared.status_label, output.status, String::from_utf8_lossy(&output.stderr), String::from_utf8_lossy(&output.stdout) @@ -515,7 +716,7 @@ impl BenchmarkCommand { // accumulation. 
measurements.extend( serde_json::from_slice::>>(&output.stdout) - .context("failed to read benchmark subprocess's results")?, + .context(prepared.result_context)?, ); *procs_left -= 1; @@ -563,6 +764,82 @@ impl BenchmarkCommand { }; Ok(working_dir) } + + fn add_benchmark_child_args( + &self, + command: &mut Command, + engine: &Path, + wasm: &Path, + processes: usize, + iterations_per_process: usize, + output_format: Format, + ) { + command + .arg("benchmark") + .arg("--processes") + .arg(processes.to_string()) + .arg("--iterations-per-process") + .arg(iterations_per_process.to_string()) + .arg("--engine") + .arg(engine) + .args( + self.measures + .iter() + .flat_map(|measure| ["--measure".to_string(), measure.to_string()]), + ) + .arg("--raw") + .arg("--output-format") + .arg(output_format.to_string()); + + if self.pin { + command.arg("--pin"); + } + + if self.keep_logs { + command.arg("--keep-logs"); + } + + if self.small_workloads { + command.env("WASM_BENCH_USE_SMALL_WORKLOAD", "1"); + } + + if let Some(phase) = self.benchmark_phase { + command.arg("--benchmark-phase").arg(phase.to_string()); + } + + if let Some(flags) = self.engine_flags.as_deref() { + command.arg(format!("--engine-flags={flags}")); + } + + command.arg("--").arg(wasm); + } +} + +struct PreparedCommand { + command: Command, + #[cfg(all(target_os = "linux", feature = "callgrind"))] + tempdir: Option, + status_label: &'static str, + failure_context: &'static str, + result_context: &'static str, +} + +impl PreparedCommand { + fn new( + command: Command, + status_label: &'static str, + failure_context: &'static str, + result_context: &'static str, + ) -> Self { + Self { + command, + #[cfg(all(target_os = "linux", feature = "callgrind"))] + tempdir: None, + status_label, + failure_context, + result_context, + } + } } fn this_arch() -> &'static str { @@ -758,4 +1035,32 @@ instantiation :: nanoseconds :: benchmarks/pulldown-cmark/benchmark.wasm assert_eq!(actual.trim(), expected.trim()); Ok(()) } + + 
#[cfg(all(target_os = "linux", feature = "callgrind"))] + #[test] + fn callgrind_must_be_exclusive() { + let command = BenchmarkCommand { + benchmarks: vec![], + engines: vec!["/tmp/engine.so".into()], + engine_flags: None, + processes: None, + names: None, + iterations_per_process: None, + raw: false, + output_format: Format::Json, + output_file: None, + measures: vec![MeasureType::Callgrind, MeasureType::Cycles], + small_workloads: false, + working_dir: None, + benchmark_phase: None, + significance_level: 0.01, + pin: false, + keep_logs: false, + }; + + assert_eq!( + command.validate().unwrap_err().to_string(), + "callgrind must be used by itself and cannot be combined with other measures" + ); + } } diff --git a/crates/recorder/Cargo.toml b/crates/recorder/Cargo.toml index cebef5b0..92c15444 100644 --- a/crates/recorder/Cargo.toml +++ b/crates/recorder/Cargo.toml @@ -5,6 +5,9 @@ description = "A measurement tool for compiling and running a single Wasm benchm authors = ["Sightglass Project Developers"] edition = "2021" +[features] +callgrind = ["dep:valgrind-requests"] + [dependencies] anyhow = "1.0" libloading = "0.9" @@ -19,6 +22,7 @@ ittapi = "0.3" [target.'cfg(target_os = "linux")'.dependencies] perf-event = "0.4" +valgrind-requests = { version = "1.1.0", optional = true } # On supported platforms, we use libc's `sched_getcpu` to log the processor ID. libc = "0.2" diff --git a/crates/recorder/src/measure/callgrind.rs b/crates/recorder/src/measure/callgrind.rs new file mode 100644 index 00000000..3eb450f6 --- /dev/null +++ b/crates/recorder/src/measure/callgrind.rs @@ -0,0 +1,322 @@ +//! Callgrind-backed measurement and dump parsing. +//! +//! This measure is active only when the benchmark child is already running +//! under Valgrind Callgrind. At each phase boundary it zeroes the current +//! counters, triggers a labeled dump, reads the matching `callgrind.out` part +//! file, and converts the recorded events into Sightglass measurements. 
+ +use super::{Measure, Measurements}; +use anyhow::{anyhow, ensure, Context, Result}; +use sightglass_data::Phase; +use std::{ + ffi::CString, + fs, + path::{Path, PathBuf}, +}; +use valgrind_requests::{callgrind, valgrind}; + +/// Environment variable used by the CLI parent to tell the child where +/// Callgrind dump files will be written. +pub const CALLGRIND_OUT_DIR_ENV_VAR: &str = "SIGHTGLASS_CALLGRIND_OUT_DIR"; + +const CLIENT_REQUEST_PREFIX: &str = "Client Request: "; +const EVENT_MAPPINGS: &[(&str, &str)] = &[ + ("Ir", "instructions-retired"), + ("Dr", "data-reads"), + ("Dw", "data-writes"), + ("I1mr", "l1-icache-misses"), + ("D1mr", "l1-dcache-read-misses"), + ("D1mw", "l1-dcache-write-misses"), + ("ILmr", "ll-icache-misses"), + ("DLmr", "ll-dcache-read-misses"), + ("DLmw", "ll-dcache-write-misses"), + ("Bc", "conditional-branches"), + ("Bcm", "conditional-branch-misses"), + ("Bi", "indirect-branches"), + ("Bim", "indirect-branch-misses"), +]; + +/// A `Measure` implementation that uses Callgrind to get low-noise +/// measurements. +pub struct CallgrindMeasure { + output_dir: Option, + next_dump_part: u32, +} + +impl Default for CallgrindMeasure { + fn default() -> Self { + Self::new() + } +} + +impl CallgrindMeasure { + /// Create a new callgrind measure for the current process. 
+ pub fn new() -> Self { + Self { + output_dir: std::env::var_os(CALLGRIND_OUT_DIR_ENV_VAR).map(PathBuf::from), + next_dump_part: 1, + } + } + + fn running_under_valgrind(&self) -> bool { + let running_under_valgrind = valgrind::running_on_valgrind() > 0; + assert!( + running_under_valgrind || self.output_dir.is_none(), + "callgrind measure requested but benchmark process is not running under Valgrind", + ); + running_under_valgrind + } + + fn parse_dump_for_phase( + &self, + phase: Phase, + iteration: u32, + part: u32, + ) -> Result { + let output_dir = self.output_dir.as_ref().ok_or_else(|| { + anyhow!( + "callgrind output directory is not configured; expected {CALLGRIND_OUT_DIR_ENV_VAR}" + ) + })?; + let dump_path = output_dir.join(format!("callgrind.out.{}.{}", std::process::id(), part)); + ensure!( + dump_path.exists(), + "no callgrind output found at {} — is valgrind installed and on PATH?", + dump_path.display() + ); + + let dump = parse_callgrind_dump_file(&dump_path)?; + ensure!( + dump.pid == std::process::id(), + "callgrind dump pid mismatch: expected {}, found {} in {}", + std::process::id(), + dump.pid, + dump_path.display() + ); + ensure!( + dump.part == part, + "callgrind dump part mismatch: expected {part}, found {} in {}", + dump.part, + dump_path.display() + ); + + let expected_label = format!("{phase}/{iteration}"); + let actual_label = dump.label.as_deref().ok_or_else(|| { + anyhow!( + "callgrind dump {} is missing a client-request label", + dump_path.display() + ) + })?; + ensure!( + actual_label == expected_label, + "callgrind dump mismatch: expected {expected_label}, got {actual_label}" + ); + + Ok(dump) + } +} + +impl Measure for CallgrindMeasure { + fn start(&mut self, _phase: Phase) { + if !self.running_under_valgrind() { + return; + } + + callgrind::start_instrumentation(); + callgrind::zero_stats(); + } + + fn end(&mut self, phase: Phase, measurements: &mut Measurements) { + if !self.running_under_valgrind() { + return; + } + + let label 
= CString::new(format!("{phase}/{}", measurements.iteration())).unwrap(); + callgrind::dump_stats_at(label.as_c_str()); + + let dump = self + .parse_dump_for_phase(phase, measurements.iteration(), self.next_dump_part) + .unwrap_or_else(|error| panic!("failed to read callgrind dump: {error:#}")); + self.next_dump_part += 1; + + measurements.reserve(dump.counts.len()); + for event in dump.counts { + measurements.add(phase, event.name.into(), event.count); + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct ParsedCallgrindDump { + pid: u32, + part: u32, + label: Option, + counts: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct CallgrindEventCount { + name: &'static str, + count: u64, +} + +fn parse_callgrind_dump_file(path: &Path) -> Result { + let contents = fs::read_to_string(path) + .with_context(|| format!("failed to read callgrind dump {}", path.display()))?; + parse_callgrind_dump(&contents) +} + +fn parse_callgrind_dump(contents: &str) -> Result { + let mut events = None; + let mut summary = None; + let mut totals = None; + let mut pid = None; + let mut part = None; + let mut label = None; + + for line in contents.lines() { + if let Some(raw_events) = line.strip_prefix("events: ") { + events = Some(raw_events.split_whitespace().collect::>()); + } else if let Some(raw_counts) = line.strip_prefix("summary: ") { + summary = Some(parse_counts(raw_counts)?); + } else if let Some(raw_counts) = line.strip_prefix("totals: ") { + totals = Some(parse_counts(raw_counts)?); + } else if let Some(raw_pid) = line.strip_prefix("pid: ") { + pid = Some(raw_pid.trim().parse().context("invalid callgrind pid")?); + } else if let Some(raw_part) = line.strip_prefix("part: ") { + part = Some(raw_part.trim().parse().context("invalid callgrind part")?); + } else if let Some(trigger) = line.strip_prefix("desc: Trigger: ") { + label = trigger + .strip_prefix(CLIENT_REQUEST_PREFIX) + .map(ToOwned::to_owned); + } + } + + let events = events.ok_or_else(|| 
anyhow!("callgrind dump is missing an events header"))?; + let counts = summary + .or(totals) + .ok_or_else(|| anyhow!("callgrind dump is missing a summary/totals line"))?; + ensure!( + events.len() == counts.len(), + "callgrind events/count mismatch: {} events, {} counts", + events.len(), + counts.len() + ); + + let mut parsed_counts = Vec::with_capacity(EVENT_MAPPINGS.len()); + for (event, count) in events.into_iter().zip(counts) { + if let Some(name) = event_name(event) { + parsed_counts.push(CallgrindEventCount { name, count }); + } + } + + Ok(ParsedCallgrindDump { + pid: pid.ok_or_else(|| anyhow!("callgrind dump is missing pid"))?, + part: part.ok_or_else(|| anyhow!("callgrind dump is missing part"))?, + label, + counts: parsed_counts, + }) +} + +fn parse_counts(raw_counts: &str) -> Result> { + raw_counts + .split_whitespace() + .map(|count| { + count + .parse::() + .with_context(|| format!("invalid callgrind count: {count}")) + }) + .collect() +} + +fn event_name(raw_event: &str) -> Option<&'static str> { + EVENT_MAPPINGS + .iter() + .find_map(|(event, name)| (*event == raw_event).then_some(*name)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_client_request_dump() -> Result<()> { + let dump = parse_callgrind_dump(include_str!( + "../../../../test/fixtures/callgrind.client-request.out" + ))?; + + assert_eq!(dump.pid, 773940); + assert_eq!(dump.part, 1); + assert_eq!(dump.label.as_deref(), Some("execution/7")); + assert_eq!( + dump.counts, + vec![ + CallgrindEventCount { + name: "instructions-retired", + count: 790139, + }, + CallgrindEventCount { + name: "data-reads", + count: 230038, + }, + CallgrindEventCount { + name: "data-writes", + count: 320058, + }, + CallgrindEventCount { + name: "l1-icache-misses", + count: 17, + }, + CallgrindEventCount { + name: "l1-dcache-read-misses", + count: 0, + }, + CallgrindEventCount { + name: "l1-dcache-write-misses", + count: 8, + }, + CallgrindEventCount { + name: "ll-icache-misses", + count: 
17, + }, + CallgrindEventCount { + name: "ll-dcache-read-misses", + count: 0, + }, + CallgrindEventCount { + name: "ll-dcache-write-misses", + count: 8, + }, + CallgrindEventCount { + name: "conditional-branches", + count: 30004, + }, + CallgrindEventCount { + name: "conditional-branch-misses", + count: 8, + }, + CallgrindEventCount { + name: "indirect-branches", + count: 1, + }, + CallgrindEventCount { + name: "indirect-branch-misses", + count: 1, + }, + ] + ); + + Ok(()) + } + + #[test] + fn ignores_program_termination_label() -> Result<()> { + let dump = parse_callgrind_dump(include_str!( + "../../../../test/fixtures/callgrind.program-termination.out" + ))?; + + assert_eq!(dump.label, None); + assert_eq!(dump.part, 2); + Ok(()) + } +} diff --git a/crates/recorder/src/measure/mod.rs b/crates/recorder/src/measure/mod.rs index c6b2e585..8675212f 100644 --- a/crates/recorder/src/measure/mod.rs +++ b/crates/recorder/src/measure/mod.rs @@ -38,6 +38,11 @@ impl<'a> Measurements<'a> { self.measurements.reserve(capacity); } + /// Get the current iteration index. + pub fn iteration(&self) -> u32 { + self.iteration + } + /// Add a measurement of the given event for the given phase to this /// `Measurements` collection. pub fn add(&mut self, phase: Phase, event: Cow<'a, str>, count: u64) { @@ -77,6 +82,8 @@ pub trait Measure: 'static { fn end(&mut self, phase: Phase, measurements: &mut Measurements); } +#[cfg(all(target_os = "linux", feature = "callgrind"))] +pub mod callgrind; #[cfg(target_os = "linux")] pub mod counters; #[cfg(target_os = "linux")] @@ -116,6 +123,11 @@ pub enum MeasureType { /// Measure instructions retired. #[cfg(target_os = "linux")] InstsRetired, + + /// Measure deterministic instruction, cache, and branch simulation events + /// under Valgrind Callgrind. 
+ #[cfg(all(target_os = "linux", feature = "callgrind"))] + Callgrind, } impl fmt::Display for MeasureType { @@ -129,6 +141,8 @@ impl fmt::Display for MeasureType { MeasureType::PerfCounters => write!(f, "perf-counters"), #[cfg(target_os = "linux")] MeasureType::InstsRetired => write!(f, "insts-retired"), + #[cfg(all(target_os = "linux", feature = "callgrind"))] + MeasureType::Callgrind => write!(f, "callgrind"), } } } @@ -141,10 +155,25 @@ impl FromStr for MeasureType { "time" => Ok(Self::Time), "cycles" => Ok(Self::Cycles), "vtune" => Ok(Self::VTune), + #[cfg(target_os = "linux")] "perf-counters" => Ok(Self::PerfCounters), + #[cfg(not(target_os = "linux"))] + "perf-counters" => Err("`perf-counters` measure is only available on Linux"), + #[cfg(target_os = "linux")] "insts-retired" => Ok(Self::InstsRetired), + #[cfg(not(target_os = "linux"))] + "insts-retired" => Err("`insts-retired` measure is only available on Linux"), + + #[cfg(all(target_os = "linux", feature = "callgrind"))] + "callgrind" => Ok(Self::Callgrind), + #[cfg(not(all(target_os = "linux", feature = "callgrind")))] + "callgrind" => Err( + "`callgrind` measure is only available on Linux and when the `callgrind` cargo \ + feature is enabled", + ), + _ => Err("unknown measure type"), } } @@ -164,6 +193,8 @@ impl MeasureType { Self::PerfCounters => Box::new(counters::CounterMeasure::new()), #[cfg(target_os = "linux")] Self::InstsRetired => Box::new(insts::InstsRetiredMeasure::new()), + #[cfg(all(target_os = "linux", feature = "callgrind"))] + Self::Callgrind => Box::new(callgrind::CallgrindMeasure::new()), } } } diff --git a/test/fixtures/callgrind.client-request.out b/test/fixtures/callgrind.client-request.out new file mode 100644 index 00000000..5a96f1a2 --- /dev/null +++ b/test/fixtures/callgrind.client-request.out @@ -0,0 +1,18 @@ +# callgrind format +version: 1 +creator: callgrind-3.20.0.GIT +pid: 773940 +cmd: target/debug/cgprobe +part: 1 + + +desc: I1 cache: 32768 B, 64 B, 8-way associative 
+desc: D1 cache: 32768 B, 64 B, 8-way associative +desc: LL cache: 8388608 B, 64 B, 16-way associative + +desc: Timerange: Basic block 0 - 140028 +desc: Trigger: Client Request: execution/7 + +positions: line +events: Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim +summary: 790139 230038 320058 17 0 8 17 0 8 30004 8 1 1 diff --git a/test/fixtures/callgrind.program-termination.out b/test/fixtures/callgrind.program-termination.out new file mode 100644 index 00000000..7346470b --- /dev/null +++ b/test/fixtures/callgrind.program-termination.out @@ -0,0 +1,18 @@ +# callgrind format +version: 1 +creator: callgrind-3.20.0.GIT +pid: 773940 +cmd: target/debug/cgprobe +part: 2 + + +desc: I1 cache: 32768 B, 64 B, 8-way associative +desc: D1 cache: 32768 B, 64 B, 8-way associative +desc: LL cache: 8388608 B, 64 B, 16-way associative + +desc: Timerange: Basic block 140028 - 150282 +desc: Trigger: Program termination + +positions: line +events: Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim +summary: 18446744073708761429 18446744073709321564 18446744073709231539 18446744073709551595 18446744073709551613 18446744073709551605 18446744073709551595 18446744073709551613 18446744073709551605 18446744073709521611 18446744073709551607 18446744073709551614 18446744073709551614