diff --git a/Cargo.lock b/Cargo.lock
index 3d966a7d..4949682e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -99,6 +99,26 @@ dependencies = [
  "special",
 ]
 
+[[package]]
+name = "bindgen"
+version = "0.72.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
+dependencies = [
+ "bitflags 2.10.0",
+ "cexpr",
+ "clang-sys",
+ "itertools",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn 2.0.109",
+]
+
 [[package]]
 name = "bitflags"
 version = "0.7.0"
@@ -197,6 +217,15 @@ dependencies = [
  "shlex",
 ]
 
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom",
+]
+
 [[package]]
 name = "cfg-if"
 version = "0.1.10"
@@ -222,6 +251,17 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading 0.8.9",
+]
+
 [[package]]
 name = "clap"
 version = "2.34.0"
@@ -369,6 +409,12 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "cty"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35"
+
 [[package]]
 name = "darling"
 version = "0.20.11"
@@ -831,6 +877,12 @@ dependencies = [
  "wasip2",
 ]
 
+[[package]]
+name = "glob"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
+
 [[package]]
 name = "h2"
 version = "0.3.27"
@@ -870,6 +922,12 @@ dependencies = [
  "unicode-segmentation",
 ]
 
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
 [[package]]
 name = "hermit-abi"
 version = "0.1.19"
@@ -1241,6 +1299,16 @@ dependencies = [
  "pkg-config",
 ]
 
+[[package]]
+name = "libloading"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
+dependencies = [
+ "cfg-if 1.0.4",
+ "windows-link",
+]
+
 [[package]]
 name = "libloading"
 version = "0.9.0"
@@ -1354,6 +1422,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
 [[package]]
 name = "mio"
 version = "1.1.0"
@@ -1404,6 +1478,16 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
 
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
 [[package]]
 name = "normalize-line-endings"
 version = "0.3.0"
@@ -1804,6 +1888,16 @@ dependencies = [
  "log",
 ]
 
+[[package]]
+name = "prettyplease"
+version = "0.2.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
+dependencies = [
+ "proc-macro2",
+ "syn 2.0.109",
+]
+
 [[package]]
 name = "proc-macro-error"
 version = "1.0.4"
@@ -2054,6 +2148,21 @@ dependencies = [
  "winreg",
 ]
 
+[[package]]
+name = "rustc-hash"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
+
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
 [[package]]
 name = "rustix"
 version = "1.1.2"
@@ -2298,7 +2407,7 @@ dependencies = [
  "assert_cmd",
  "csv",
  "env_logger 0.8.4",
- "libloading",
+ "libloading 0.9.0",
  "log",
  "minijinja",
  "predicates 1.0.8",
@@ -2356,7 +2465,7 @@ dependencies = [
  "ittapi",
  "lazy_static",
  "libc",
- "libloading",
+ "libloading 0.9.0",
  "log",
  "perf-event",
  "precision",
@@ -2365,6 +2474,7 @@ dependencies = [
  "sightglass-build",
  "sightglass-data",
  "thiserror",
+ "valgrind-requests",
  "wat",
 ]
 
@@ -2489,13 +2599,34 @@ version = "0.4.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0"
 dependencies = [
- "heck",
+ "heck 0.3.3",
  "proc-macro-error",
  "proc-macro2",
  "quote",
  "syn 1.0.109",
 ]
 
+[[package]]
+name = "strum"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9628de9b8791db39ceda2b119bbe13134770b56c138ec1d3af810d045c04f9bd"
+dependencies = [
+ "strum_macros",
+]
+
+[[package]]
+name = "strum_macros"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664"
+dependencies = [
+ "heck 0.5.0",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.109",
+]
+
 [[package]]
 name = "subtle"
 version = "2.6.1"
@@ -2819,6 +2950,21 @@ version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
 
+[[package]]
+name = "valgrind-requests"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "796c44118a551cc842f2ce6fce4983b86e282ec99b6b5e80b09dfeab830e7755"
+dependencies = [
+ "bindgen",
+ "cc",
+ "cfg-if 1.0.4",
+ "cty",
+ "regex",
+ "rustc_version",
+ "strum",
+]
+
 [[package]]
 name = "vcpkg"
 version = "0.2.15"
diff --git a/README.md b/README.md
index 72043dd7..c24242ae 100644
--- a/README.md
+++ b/README.md
@@ -195,12 +195,20 @@ note that measuring using CPU cycles alone can be problematic (e.g., CPU
 frequency changes, context switches, etc.).
 
 Several _measures_ can be configured using the `--measure` option:
-- `cycles`: the number of CPU cycles elapsed
-- `perf-counters`: a selection of common `perf` counters (CPU cycles,
-  instructions retired, cache accesses, cache misses); only available on Linux
-- `vtune`: record each phase as a VTune task for analysis; see [this help
-  documentation](docs/vtune.md) for more details
-- `noop`: no measurement is performed
+
+- `cycles`: The number of CPU cycles elapsed.
+
+- `perf-counters`: A selection of common `perf` counters (CPU cycles,
+  instructions retired, cache accesses, cache misses); only available on Linux.
+
+- `callgrind`: Uses Valgrind's Callgrind to count instructions retired and
+  simulate caches and branch prediction. Mostly deterministic and very low
+  noise. Linux only; requires the `callgrind` Cargo feature (on by default).
+
+- `vtune`: Record each phase as a VTune task for analysis; see [this help
+  documentation](docs/vtune.md) for more details.
+
+- `noop`: No measurement is performed.
 
 For example, run:
 
@@ -208,6 +216,12 @@ For example, run:
 $ cargo run -- benchmark --measure perf-counters ...
 ```
 
+For `callgrind`, Sightglass runs benchmark children under `setarch -R valgrind`
+(which disables ASLR) with a fixed cache model, and forces single-threaded
+Wasmtime compilation with `RAYON_NUM_THREADS=1`, so results stay stable across
+machines. For best results, use the same Valgrind version when comparing data
+recorded on different machines.
+
 ### Getting Raw JSON or CSV Results
 
 If you don't want the results to be summarized and displayed in a human-readable
diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml
index ecbf8f5b..a5a70c67 100644
--- a/crates/cli/Cargo.toml
+++ b/crates/cli/Cargo.toml
@@ -4,6 +4,10 @@ version = "0.1.0"
 authors = ["Sightglass Project Developers"]
 edition = "2021"
 
+[features]
+default = ["callgrind"]
+callgrind = ["sightglass-recorder/callgrind"]
+
 [dependencies]
 anyhow = "1.0"
 libloading = "0.9"
@@ -24,10 +28,10 @@ csv = "1.1.6"
 regex = "1.5.4"
 vega_lite_4 = { git = "https://github.com/procyon-rs/vega_lite_4.rs" }
 minijinja = "2.10"
+tempfile = "3.2.0"
 
 [dev-dependencies]
 assert_cmd = "1.0.4"
 env_logger = "0.8.3"
 predicates = "1.0.8"
-tempfile = "3.2.0"
 scraper = "0.24"
diff --git a/crates/cli/src/benchmark.rs b/crates/cli/src/benchmark.rs
index 548b61ff..c42c7550 100644
--- a/crates/cli/src/benchmark.rs
+++ b/crates/cli/src/benchmark.rs
@@ -11,11 +11,191 @@ use std::{
     fs,
     io::{self, BufWriter, Write},
     path::{Path, PathBuf},
-    process::Command,
-    process::Stdio,
+    process::{Command, Stdio},
 };
 use structopt::StructOpt;
 
+const DEFAULT_PROCESSES: usize = 10;
+const DEFAULT_ITERATIONS_PER_PROCESS: usize = 10;
+
+#[cfg(all(target_os = "linux", feature = "callgrind"))]
+mod callgrind {
+    use super::*;
+    use sightglass_recorder::measure::callgrind::CALLGRIND_OUT_DIR_ENV_VAR;
+    use tempfile::TempDir;
+
+    const DEFAULT_CALLGRIND_PROCESSES: usize = 3;
+    const DEFAULT_CALLGRIND_ITERATIONS_PER_PROCESS: usize = 1;
+    const CACHE_MODEL_I1: &str = "32768,8,64";
+    const CACHE_MODEL_D1: &str = "32768,8,64";
+    const CACHE_MODEL_LL: &str = "8388608,16,64";
+
+    impl PreparedCommand {
+        /// Stores `tempdir` in this prepared command and returns the command.
+        fn with_tempdir(mut self, tempdir: tempfile::TempDir) -> Self {
+            self.tempdir = Some(tempdir);
+            self
+        }
+
+        /// Returns the stored temporary directory, if one has been attached.
+        fn tempdir(&self) -> Option<&tempfile::TempDir> {
+            self.tempdir.as_ref()
+        }
+    }
+
+    impl BenchmarkCommand {
+        pub(super) fn default_processes(&self) -> usize {
+            if self.uses_callgrind() {
+                DEFAULT_CALLGRIND_PROCESSES
+            } else {
+                DEFAULT_PROCESSES
+            }
+        }
+
+        pub(super) fn default_iterations_per_process(&self) -> usize {
+            if self.uses_callgrind() {
+                DEFAULT_CALLGRIND_ITERATIONS_PER_PROCESS
+            } else {
+                DEFAULT_ITERATIONS_PER_PROCESS
+            }
+        }
+
+        pub(super) fn validate(&self) -> Result<()> {
+            if self.uses_callgrind() && self.measures.len() > 1 {
+                anyhow::bail!(
+                    "callgrind must be used by itself and cannot be combined with other measures"
+                );
+            }
+
+            Ok(())
+        }
+
+        pub(super) fn should_wrap_subprocesses(&self) -> bool {
+            self.uses_callgrind() && std::env::var_os(CALLGRIND_OUT_DIR_ENV_VAR).is_none()
+        }
+
+        pub(super) fn prepare_command(
+            &self,
+            this_exe: &Path,
+            engine: &Path,
+            wasm: &Path,
+        ) -> Result<PreparedCommand> {
+            ensure_tools_available()?;
+
+            let callgrind_output = TempDir::new().context("failed to create callgrind tempdir")?;
+            let mut prepared = PreparedCommand::new(
+                Command::new("setarch"),
+                "callgrind benchmark subprocess",
+                "failed to run callgrind benchmark subprocess",
+                "failed to read callgrind benchmark subprocess's results",
+            )
+            .with_tempdir(callgrind_output);
+            let output_dir = prepared.tempdir().unwrap().path().to_path_buf();
+
+            prepared
+                .command
+                .stdin(Stdio::null())
+                .stdout(Stdio::piped())
+                .stderr(Stdio::piped())
+                .arg(this_arch())
+                .arg("-R")
+                .arg("valgrind")
+                .arg("--tool=callgrind")
+                .arg("--cache-sim=yes")
+                .arg("--branch-sim=yes")
+                .arg(format!("--I1={CACHE_MODEL_I1}"))
+                .arg(format!("--D1={CACHE_MODEL_D1}"))
+                .arg(format!("--LL={CACHE_MODEL_LL}"))
+                .arg("--instr-atstart=no")
+                .arg(format!(
+                    "--callgrind-out-file={}",
+                    output_dir.join("callgrind.out.%p").display()
+                ))
+                .arg(this_exe);
+            prepared.command.env("RAYON_NUM_THREADS", "1");
+            prepared.command.env(CALLGRIND_OUT_DIR_ENV_VAR, &output_dir);
+            self.add_benchmark_child_args(
+                &mut prepared.command,
+                engine,
+                wasm,
+                1,
+                self.iterations_per_process(),
+                Format::Json,
+            );
+
+            Ok(prepared)
+        }
+
+        fn uses_callgrind(&self) -> bool {
+            self.measures
+                .iter()
+                .any(|measure| matches!(measure, MeasureType::Callgrind))
+        }
+    }
+
+    fn ensure_tools_available() -> Result<()> {
+        ensure_command_succeeds(
+            "valgrind",
+            ["--version"],
+            "callgrind measurement requires `valgrind` on PATH",
+        )?;
+        ensure_command_succeeds(
+            "setarch",
+            [this_arch(), "-R", "true"],
+            "callgrind measurement requires `setarch -R` support to disable ASLR",
+        )?;
+        Ok(())
+    }
+
+    fn ensure_command_succeeds<I, S>(program: &str, args: I, error_message: &str) -> Result<()>
+    where
+        I: IntoIterator<Item = S>,
+        S: AsRef<std::ffi::OsStr>,
+    {
+        let status = Command::new(program)
+            .args(args)
+            .stdin(Stdio::null())
+            .stdout(Stdio::null())
+            .stderr(Stdio::null())
+            .status()
+            .with_context(|| format!("{error_message}: failed to spawn `{program}`"))?;
+        anyhow::ensure!(status.success(), "{error_message}");
+        Ok(())
+    }
+}
+
+#[cfg(not(all(target_os = "linux", feature = "callgrind")))]
+mod callgrind {
+    use super::*;
+
+    impl BenchmarkCommand {
+        pub(super) fn default_processes(&self) -> usize {
+            DEFAULT_PROCESSES
+        }
+
+        pub(super) fn default_iterations_per_process(&self) -> usize {
+            DEFAULT_ITERATIONS_PER_PROCESS
+        }
+
+        pub(super) fn validate(&self) -> Result<()> {
+            Ok(())
+        }
+
+        pub(super) fn should_wrap_subprocesses(&self) -> bool {
+            false
+        }
+
+        pub(super) fn prepare_command(
+            &self,
+            _this_exe: &Path,
+            _engine: &Path,
+            _wasm: &Path,
+        ) -> Result<PreparedCommand> {
+            unreachable!()
+        }
+    }
+}
+
 /// Measure compilation, instantiation, and execution of a Wasm file.
 ///
 /// The total number of samples taken for each Wasm benchmark is `PROCESSES *
@@ -58,8 +238,11 @@ pub struct BenchmarkCommand {
     engine_flags: Option<String>,
 
     /// How many processes should we use for each Wasm benchmark?
-    #[structopt(long = "processes", default_value = "10", value_name = "PROCESSES")]
-    processes: usize,
+    ///
+    /// Defaults to `10`, unless using the `callgrind` measure, in which case the
+    /// default is `3`.
+    #[structopt(long = "processes", value_name = "PROCESSES")]
+    processes: Option<usize>,
 
     /// Override the "engine" name; this is useful if running experiments that might
     /// not have a differentiating engine name (e.g. if customizing the flags).
@@ -70,12 +253,14 @@ pub struct BenchmarkCommand {
     names: Option<Vec<String>>,
 
     /// How many times should we run a benchmark in a single process?
+    ///
+    /// Defaults to `10`, unless using the `callgrind` measure, in which case the
+    /// default is `1`.
     #[structopt(
         long = "iterations-per-process",
-        default_value = "10",
         value_name = "NUMBER_OF_ITERATIONS_PER_PROCESS"
     )]
-    iterations_per_process: usize,
+    iterations_per_process: Option<usize>,
 
     /// Output raw data, rather than the summarized, human-readable analysis
     /// results.
@@ -92,10 +277,17 @@ pub struct BenchmarkCommand {
     #[structopt(short = "o", long = "output-file")]
     output_file: Option<String>,
 
-    /// The type of measurement to use (cycles, insts-retired, perf-counters, noop, vtune)
-    /// when recording the benchmark performance.  This option can be specified more than
-    /// once if to record multiple measurements.  If no measures are specified,
-    /// the "cycles" measure will be used.
+    /// The type of measurement to use (cycles, insts-retired, perf-counters,
+    /// noop, vtune, callgrind) when recording benchmark performance.
+    ///
+    /// This option can be specified more than once to record multiple measures,
+    /// except for `callgrind`, which must be used by itself.
+    ///
+    /// If no measures are specified, the "cycles" measure is used.
+    ///
+    /// `callgrind` defaults to fewer processes and iterations per process
+    /// because it runs the benchmarking processes under Valgrind, which is
+    /// slower but also more deterministic and less noisy.
     #[structopt(long = "measure", short = "m", multiple = true)]
     measures: Vec<MeasureType>,
 
@@ -146,23 +338,41 @@ pub struct BenchmarkCommand {
 
 impl BenchmarkCommand {
     pub fn execute(&self) -> Result<()> {
-        anyhow::ensure!(self.processes > 0, "processes must be greater than zero");
+        anyhow::ensure!(self.processes() > 0, "processes must be greater than zero");
         anyhow::ensure!(
-            self.iterations_per_process > 0,
+            self.iterations_per_process() > 0,
             "iterations-per-process must be greater than zero"
         );
         anyhow::ensure!(
             !self.engines.is_empty(),
             "must pass one or more engines to benchmark with -e/--engine"
         );
+        self.validate()?;
+
+        if self.should_wrap_subprocesses() {
+            let this_exe =
+                std::env::current_exe().context("failed to get the current executable's path")?;
+            return self.execute_in_subprocesses("callgrind iterations", |engine, wasm| {
+                self.prepare_command(&this_exe, engine, wasm)
+            });
+        }
 
-        if self.processes == 1 {
+        if self.processes() == 1 {
             self.execute_in_current_process()
         } else {
             self.execute_in_multiple_processes()
         }
     }
 
+    fn processes(&self) -> usize {
+        self.processes.unwrap_or_else(|| self.default_processes())
+    }
+
+    fn iterations_per_process(&self) -> usize {
+        self.iterations_per_process
+            .unwrap_or_else(|| self.default_iterations_per_process())
+    }
+
     /// Execute benchmark(s) in the provided engine(s) using the current process.
     pub fn execute_in_current_process(&self) -> Result<()> {
         let mut output_file: Box<dyn Write> = if let Some(file) = self.output_file.as_ref() {
@@ -258,7 +468,7 @@ impl BenchmarkCommand {
 
                 // Run the benchmark (compilation, instantiation, and execution) several times in
                 // this process.
-                for _ in 0..self.iterations_per_process {
+                for _ in 0..self.iterations_per_process() {
                     match self.benchmark_phase {
                         None => {
                             let new_engine = benchmark::all(engine.take().unwrap(), &bytes)?;
@@ -374,23 +584,58 @@ impl BenchmarkCommand {
     /// Execute the benchmark(s) by spawning multiple processes. Each of the spawned processes will
     /// run the `execute_in_current_process` function above.
     fn execute_in_multiple_processes(&self) -> Result<()> {
+        let this_exe =
+            std::env::current_exe().context("failed to get the current executable's path")?;
+        self.execute_in_subprocesses("iterations", |engine, wasm| {
+            let mut prepared = PreparedCommand::new(
+                Command::new(&this_exe),
+                "benchmark subprocess",
+                "failed to run benchmark subprocess",
+                "failed to read benchmark subprocess's results",
+            );
+            prepared
+                .command
+                .stdin(Stdio::null())
+                .stdout(Stdio::piped())
+                .stderr(Stdio::inherit());
+            self.add_benchmark_child_args(
+                &mut prepared.command,
+                engine,
+                wasm,
+                1,
+                self.iterations_per_process(),
+                Format::Json,
+            );
+            Ok(prepared)
+        })
+    }
+
+    fn execute_in_subprocesses<F>(
+        &self,
+        iteration_label: &str,
+        mut prepare_command: F,
+    ) -> Result<()>
+    where
+        F: FnMut(&Path, &Path) -> Result<PreparedCommand>,
+    {
         let mut output_file: Box<dyn Write> = if let Some(file) = self.output_file.as_ref() {
             Box::new(BufWriter::new(fs::File::create(file)?))
         } else {
             Box::new(io::stdout())
         };
 
-        let this_exe =
-            std::env::current_exe().context("failed to get the current executable's path")?;
-
         let wasm_files: Vec<_> = self.benchmarks.iter().flat_map(|b| b.paths()).collect();
         eprintln!(
-            "\nRunning {} total iterations ({} engines * {} benchmarks * {} processes * {} iterations per process)",
-            self.engines.len() * wasm_files.len() * self.processes * self.iterations_per_process,
+            "\nRunning {} total {} ({} engines * {} benchmarks * {} processes * {} iterations per process)",
+            self.engines.len()
+                * wasm_files.len()
+                * self.processes()
+                * self.iterations_per_process(),
+            iteration_label,
             self.engines.len(),
             wasm_files.len(),
-            self.processes,
-            self.iterations_per_process
+            self.processes(),
+            self.iterations_per_process()
         );
         eprint!("\n[Done] [Elapsed    ] [Est. Rem.  ]");
 
@@ -398,7 +643,6 @@ impl BenchmarkCommand {
         // us avoid some measurement bias from CPU state transitions that aren't
         // constrained within the duration of process execution, like dynamic
         // CPU throttling due to overheating.
-
         let mut rng = SmallRng::seed_from_u64(0x1337_4242);
 
         // Worklist that we randomly sample from.
@@ -411,7 +655,7 @@ impl BenchmarkCommand {
             let engine = check_engine_path(engine)?;
 
             for wasm in wasm_files.iter().cloned() {
-                choices.push((engine.clone(), wasm, self.processes));
+                choices.push((engine.clone(), wasm, self.processes()));
             }
         }
 
@@ -419,65 +663,22 @@ impl BenchmarkCommand {
         let mut measurements = vec![];
 
         let mut i = 0;
-        let n = choices.len() * self.processes;
+        let n = choices.len() * self.processes();
         let start = std::time::Instant::now();
 
         while !choices.is_empty() {
             let index = rng.gen_range(0, choices.len());
             let (engine, wasm, procs_left) = &mut choices[index];
-
-            let mut command = Command::new(&this_exe);
-            command
-                .stdin(Stdio::null())
-                .stdout(Stdio::piped())
-                .stderr(Stdio::inherit())
-                .arg("benchmark")
-                .arg("--processes")
-                .arg("1")
-                .arg("--iterations-per-process")
-                .arg(self.iterations_per_process.to_string())
-                .arg("--engine")
-                .arg(&engine)
-                .args(
-                    self.measures
-                        .iter()
-                        .flat_map(|m| ["--measure".to_string(), m.to_string()]),
-                )
-                .arg("--raw")
-                .arg("--output-format")
-                // Always use JSON when privately communicating with a
-                // subprocess.
-                .arg(Format::Json.to_string());
-
-            if self.pin {
-                command.arg("--pin");
-            }
-
-            if self.keep_logs {
-                command.arg("--keep-logs");
-            }
-
-            if self.small_workloads {
-                command.env("WASM_BENCH_USE_SMALL_WORKLOAD", "1");
-            }
-
-            if let Some(phase) = self.benchmark_phase {
-                command.arg("--benchmark-phase").arg(phase.to_string());
-            }
-
-            if let Some(flags) = &self.engine_flags {
-                command.arg(format!("--engine-flags={flags}"));
-            }
-
-            command.arg("--").arg(&wasm);
-
-            let output = command
+            let mut prepared = prepare_command(engine, wasm)?;
+            let output = prepared
+                .command
                 .output()
-                .context("failed to run benchmark subprocess")?;
+                .context(prepared.failure_context)?;
 
             anyhow::ensure!(
                 output.status.success(),
-                "benchmark subprocess did not exit successfully: {}\nstderr: {}\nstdout: {}",
+                "{} did not exit successfully: {}\nstderr: {}\nstdout: {}",
+                prepared.status_label,
                 output.status,
                 String::from_utf8_lossy(&output.stderr),
                 String::from_utf8_lossy(&output.stdout)
@@ -515,7 +716,7 @@ impl BenchmarkCommand {
             // accumulation.
             measurements.extend(
                 serde_json::from_slice::<Vec<Measurement<'_>>>(&output.stdout)
-                    .context("failed to read benchmark subprocess's results")?,
+                    .context(prepared.result_context)?,
             );
 
             *procs_left -= 1;
@@ -563,6 +764,82 @@ impl BenchmarkCommand {
         };
         Ok(working_dir)
     }
+
+    fn add_benchmark_child_args(
+        &self,
+        command: &mut Command,
+        engine: &Path,
+        wasm: &Path,
+        processes: usize,
+        iterations_per_process: usize,
+        output_format: Format,
+    ) {
+        command
+            .arg("benchmark")
+            .arg("--processes")
+            .arg(processes.to_string())
+            .arg("--iterations-per-process")
+            .arg(iterations_per_process.to_string())
+            .arg("--engine")
+            .arg(engine)
+            .args(
+                self.measures
+                    .iter()
+                    .flat_map(|measure| ["--measure".to_string(), measure.to_string()]),
+            )
+            .arg("--raw")
+            .arg("--output-format")
+            .arg(output_format.to_string());
+
+        if self.pin {
+            command.arg("--pin");
+        }
+
+        if self.keep_logs {
+            command.arg("--keep-logs");
+        }
+
+        if self.small_workloads {
+            command.env("WASM_BENCH_USE_SMALL_WORKLOAD", "1");
+        }
+
+        if let Some(phase) = self.benchmark_phase {
+            command.arg("--benchmark-phase").arg(phase.to_string());
+        }
+
+        if let Some(flags) = self.engine_flags.as_deref() {
+            command.arg(format!("--engine-flags={flags}"));
+        }
+
+        command.arg("--").arg(wasm);
+    }
+}
+
+struct PreparedCommand {
+    command: Command,
+    #[cfg(all(target_os = "linux", feature = "callgrind"))]
+    tempdir: Option<tempfile::TempDir>,
+    status_label: &'static str,
+    failure_context: &'static str,
+    result_context: &'static str,
+}
+
+impl PreparedCommand {
+    fn new(
+        command: Command,
+        status_label: &'static str,
+        failure_context: &'static str,
+        result_context: &'static str,
+    ) -> Self {
+        Self {
+            command,
+            #[cfg(all(target_os = "linux", feature = "callgrind"))]
+            tempdir: None,
+            status_label,
+            failure_context,
+            result_context,
+        }
+    }
 }
 
 fn this_arch() -> &'static str {
@@ -758,4 +1035,32 @@ instantiation :: nanoseconds :: benchmarks/pulldown-cmark/benchmark.wasm
         assert_eq!(actual.trim(), expected.trim());
         Ok(())
     }
+
+    #[cfg(all(target_os = "linux", feature = "callgrind"))]
+    #[test]
+    fn callgrind_must_be_exclusive() {
+        let command = BenchmarkCommand {
+            benchmarks: vec![],
+            engines: vec!["/tmp/engine.so".into()],
+            engine_flags: None,
+            processes: None,
+            names: None,
+            iterations_per_process: None,
+            raw: false,
+            output_format: Format::Json,
+            output_file: None,
+            measures: vec![MeasureType::Callgrind, MeasureType::Cycles],
+            small_workloads: false,
+            working_dir: None,
+            benchmark_phase: None,
+            significance_level: 0.01,
+            pin: false,
+            keep_logs: false,
+        };
+
+        assert_eq!(
+            command.validate().unwrap_err().to_string(),
+            "callgrind must be used by itself and cannot be combined with other measures"
+        );
+    }
 }
diff --git a/crates/recorder/Cargo.toml b/crates/recorder/Cargo.toml
index cebef5b0..92c15444 100644
--- a/crates/recorder/Cargo.toml
+++ b/crates/recorder/Cargo.toml
@@ -5,6 +5,9 @@ description = "A measurement tool for compiling and running a single Wasm benchm
 authors = ["Sightglass Project Developers"]
 edition = "2021"
 
+[features]
+callgrind = ["dep:valgrind-requests"]
+
 [dependencies]
 anyhow = "1.0"
 libloading = "0.9"
@@ -19,6 +22,7 @@ ittapi = "0.3"
 
 [target.'cfg(target_os = "linux")'.dependencies]
 perf-event = "0.4"
+valgrind-requests = { version = "1.1.0", optional = true }
 # On supported platforms, we use libc's `sched_getcpu` to log the processor ID.
 libc = "0.2"
 
diff --git a/crates/recorder/src/measure/callgrind.rs b/crates/recorder/src/measure/callgrind.rs
new file mode 100644
index 00000000..3eb450f6
--- /dev/null
+++ b/crates/recorder/src/measure/callgrind.rs
@@ -0,0 +1,322 @@
+//! Callgrind-backed measurement and dump parsing.
+//!
+//! This measure is active only when the benchmark child is already running
+//! under Valgrind Callgrind. At phase start it zeroes the current counters; at
+//! phase end it triggers a labeled dump, reads the matching `callgrind.out`
+//! part file, and converts the recorded events into Sightglass measurements.
+
+use super::{Measure, Measurements};
+use anyhow::{anyhow, ensure, Context, Result};
+use sightglass_data::Phase;
+use std::{
+    ffi::CString,
+    fs,
+    path::{Path, PathBuf},
+};
+use valgrind_requests::{callgrind, valgrind};
+
+/// Environment variable used by the CLI parent to tell the child where
+/// Callgrind dump files will be written.
+pub const CALLGRIND_OUT_DIR_ENV_VAR: &str = "SIGHTGLASS_CALLGRIND_OUT_DIR";
+
+const CLIENT_REQUEST_PREFIX: &str = "Client Request: ";
+const EVENT_MAPPINGS: &[(&str, &str)] = &[
+    ("Ir", "instructions-retired"),
+    ("Dr", "data-reads"),
+    ("Dw", "data-writes"),
+    ("I1mr", "l1-icache-misses"),
+    ("D1mr", "l1-dcache-read-misses"),
+    ("D1mw", "l1-dcache-write-misses"),
+    ("ILmr", "ll-icache-misses"),
+    ("DLmr", "ll-dcache-read-misses"),
+    ("DLmw", "ll-dcache-write-misses"),
+    ("Bc", "conditional-branches"),
+    ("Bcm", "conditional-branch-misses"),
+    ("Bi", "indirect-branches"),
+    ("Bim", "indirect-branch-misses"),
+];
+
+/// A `Measure` implementation that uses Callgrind to get low-noise
+/// measurements.
+pub struct CallgrindMeasure {
+    output_dir: Option<PathBuf>,
+    next_dump_part: u32,
+}
+
+impl Default for CallgrindMeasure {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl CallgrindMeasure {
+    /// Create a new callgrind measure for the current process.
+    pub fn new() -> Self {
+        Self {
+            output_dir: std::env::var_os(CALLGRIND_OUT_DIR_ENV_VAR).map(PathBuf::from),
+            next_dump_part: 1,
+        }
+    }
+
+    fn running_under_valgrind(&self) -> bool {
+        let running_under_valgrind = valgrind::running_on_valgrind() > 0;
+        assert!(
+            running_under_valgrind || self.output_dir.is_none(),
+            "callgrind measure requested but benchmark process is not running under Valgrind",
+        );
+        running_under_valgrind
+    }
+
+    fn parse_dump_for_phase(
+        &self,
+        phase: Phase,
+        iteration: u32,
+        part: u32,
+    ) -> Result<ParsedCallgrindDump> {
+        let output_dir = self.output_dir.as_ref().ok_or_else(|| {
+            anyhow!(
+                "callgrind output directory is not configured; expected {CALLGRIND_OUT_DIR_ENV_VAR}"
+            )
+        })?;
+        let dump_path = output_dir.join(format!("callgrind.out.{}.{}", std::process::id(), part));
+        ensure!(
+            dump_path.exists(),
+            "no callgrind output found at {} — is valgrind installed and on PATH?",
+            dump_path.display()
+        );
+
+        let dump = parse_callgrind_dump_file(&dump_path)?;
+        ensure!(
+            dump.pid == std::process::id(),
+            "callgrind dump pid mismatch: expected {}, found {} in {}",
+            std::process::id(),
+            dump.pid,
+            dump_path.display()
+        );
+        ensure!(
+            dump.part == part,
+            "callgrind dump part mismatch: expected {part}, found {} in {}",
+            dump.part,
+            dump_path.display()
+        );
+
+        let expected_label = format!("{phase}/{iteration}");
+        let actual_label = dump.label.as_deref().ok_or_else(|| {
+            anyhow!(
+                "callgrind dump {} is missing a client-request label",
+                dump_path.display()
+            )
+        })?;
+        ensure!(
+            actual_label == expected_label,
+            "callgrind dump mismatch: expected {expected_label}, got {actual_label}"
+        );
+
+        Ok(dump)
+    }
+}
+
+impl Measure for CallgrindMeasure {
+    fn start(&mut self, _phase: Phase) {
+        if !self.running_under_valgrind() {
+            return;
+        }
+
+        callgrind::start_instrumentation();
+        callgrind::zero_stats();
+    }
+
+    fn end(&mut self, phase: Phase, measurements: &mut Measurements) {
+        if !self.running_under_valgrind() {
+            return;
+        }
+
+        let label = CString::new(format!("{phase}/{}", measurements.iteration())).unwrap();
+        callgrind::dump_stats_at(label.as_c_str());
+
+        let dump = self
+            .parse_dump_for_phase(phase, measurements.iteration(), self.next_dump_part)
+            .unwrap_or_else(|error| panic!("failed to read callgrind dump: {error:#}"));
+        self.next_dump_part += 1;
+
+        measurements.reserve(dump.counts.len());
+        for event in dump.counts {
+            measurements.add(phase, event.name.into(), event.count);
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct ParsedCallgrindDump {
+    pid: u32,
+    part: u32,
+    label: Option<String>,
+    counts: Vec<CallgrindEventCount>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct CallgrindEventCount {
+    name: &'static str,
+    count: u64,
+}
+
+fn parse_callgrind_dump_file(path: &Path) -> Result<ParsedCallgrindDump> {
+    let contents = fs::read_to_string(path)
+        .with_context(|| format!("failed to read callgrind dump {}", path.display()))?;
+    parse_callgrind_dump(&contents)
+}
+
+fn parse_callgrind_dump(contents: &str) -> Result<ParsedCallgrindDump> {
+    let mut events = None;
+    let mut summary = None;
+    let mut totals = None;
+    let mut pid = None;
+    let mut part = None;
+    let mut label = None;
+
+    for line in contents.lines() {
+        if let Some(raw_events) = line.strip_prefix("events: ") {
+            events = Some(raw_events.split_whitespace().collect::<Vec<_>>());
+        } else if let Some(raw_counts) = line.strip_prefix("summary: ") {
+            summary = Some(parse_counts(raw_counts)?);
+        } else if let Some(raw_counts) = line.strip_prefix("totals: ") {
+            totals = Some(parse_counts(raw_counts)?);
+        } else if let Some(raw_pid) = line.strip_prefix("pid: ") {
+            pid = Some(raw_pid.trim().parse().context("invalid callgrind pid")?);
+        } else if let Some(raw_part) = line.strip_prefix("part: ") {
+            part = Some(raw_part.trim().parse().context("invalid callgrind part")?);
+        } else if let Some(trigger) = line.strip_prefix("desc: Trigger: ") {
+            label = trigger
+                .strip_prefix(CLIENT_REQUEST_PREFIX)
+                .map(ToOwned::to_owned);
+        }
+    }
+
+    let events = events.ok_or_else(|| anyhow!("callgrind dump is missing an events header"))?;
+    let counts = summary
+        .or(totals)
+        .ok_or_else(|| anyhow!("callgrind dump is missing a summary/totals line"))?;
+    ensure!(
+        events.len() == counts.len(),
+        "callgrind events/count mismatch: {} events, {} counts",
+        events.len(),
+        counts.len()
+    );
+
+    let mut parsed_counts = Vec::with_capacity(EVENT_MAPPINGS.len());
+    for (event, count) in events.into_iter().zip(counts) {
+        if let Some(name) = event_name(event) {
+            parsed_counts.push(CallgrindEventCount { name, count });
+        }
+    }
+
+    Ok(ParsedCallgrindDump {
+        pid: pid.ok_or_else(|| anyhow!("callgrind dump is missing pid"))?,
+        part: part.ok_or_else(|| anyhow!("callgrind dump is missing part"))?,
+        label,
+        counts: parsed_counts,
+    })
+}
+
+fn parse_counts(raw_counts: &str) -> Result<Vec<u64>> {
+    raw_counts
+        .split_whitespace()
+        .map(|count| {
+            count
+                .parse::<u64>()
+                .with_context(|| format!("invalid callgrind count: {count}"))
+        })
+        .collect()
+}
+
+fn event_name(raw_event: &str) -> Option<&'static str> {
+    EVENT_MAPPINGS
+        .iter()
+        .find_map(|(event, name)| (*event == raw_event).then_some(*name))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parses_client_request_dump() -> Result<()> {
+        let dump = parse_callgrind_dump(include_str!(
+            "../../../../test/fixtures/callgrind.client-request.out"
+        ))?;
+
+        assert_eq!(dump.pid, 773940);
+        assert_eq!(dump.part, 1);
+        assert_eq!(dump.label.as_deref(), Some("execution/7"));
+        assert_eq!(
+            dump.counts,
+            vec![
+                CallgrindEventCount {
+                    name: "instructions-retired",
+                    count: 790139,
+                },
+                CallgrindEventCount {
+                    name: "data-reads",
+                    count: 230038,
+                },
+                CallgrindEventCount {
+                    name: "data-writes",
+                    count: 320058,
+                },
+                CallgrindEventCount {
+                    name: "l1-icache-misses",
+                    count: 17,
+                },
+                CallgrindEventCount {
+                    name: "l1-dcache-read-misses",
+                    count: 0,
+                },
+                CallgrindEventCount {
+                    name: "l1-dcache-write-misses",
+                    count: 8,
+                },
+                CallgrindEventCount {
+                    name: "ll-icache-misses",
+                    count: 17,
+                },
+                CallgrindEventCount {
+                    name: "ll-dcache-read-misses",
+                    count: 0,
+                },
+                CallgrindEventCount {
+                    name: "ll-dcache-write-misses",
+                    count: 8,
+                },
+                CallgrindEventCount {
+                    name: "conditional-branches",
+                    count: 30004,
+                },
+                CallgrindEventCount {
+                    name: "conditional-branch-misses",
+                    count: 8,
+                },
+                CallgrindEventCount {
+                    name: "indirect-branches",
+                    count: 1,
+                },
+                CallgrindEventCount {
+                    name: "indirect-branch-misses",
+                    count: 1,
+                },
+            ]
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn ignores_program_termination_label() -> Result<()> {
+        let dump = parse_callgrind_dump(include_str!(
+            "../../../../test/fixtures/callgrind.program-termination.out"
+        ))?;
+
+        assert_eq!(dump.label, None);
+        assert_eq!(dump.part, 2);
+        Ok(())
+    }
+}
diff --git a/crates/recorder/src/measure/mod.rs b/crates/recorder/src/measure/mod.rs
index c6b2e585..8675212f 100644
--- a/crates/recorder/src/measure/mod.rs
+++ b/crates/recorder/src/measure/mod.rs
@@ -38,6 +38,11 @@ impl<'a> Measurements<'a> {
         self.measurements.reserve(capacity);
     }
 
+    /// Get the current iteration index.
+    pub fn iteration(&self) -> u32 {
+        self.iteration
+    }
+
     /// Add a measurement of the given event for the given phase to this
     /// `Measurements` collection.
     pub fn add(&mut self, phase: Phase, event: Cow<'a, str>, count: u64) {
@@ -77,6 +82,8 @@ pub trait Measure: 'static {
     fn end(&mut self, phase: Phase, measurements: &mut Measurements);
 }
 
+#[cfg(all(target_os = "linux", feature = "callgrind"))]
+pub mod callgrind;
 #[cfg(target_os = "linux")]
 pub mod counters;
 #[cfg(target_os = "linux")]
@@ -116,6 +123,11 @@ pub enum MeasureType {
     /// Measure instructions retired.
     #[cfg(target_os = "linux")]
     InstsRetired,
+
+    /// Measure deterministic instruction, cache, and branch simulation events
+    /// under Valgrind Callgrind.
+    #[cfg(all(target_os = "linux", feature = "callgrind"))]
+    Callgrind,
 }
 
 impl fmt::Display for MeasureType {
@@ -129,6 +141,8 @@ impl fmt::Display for MeasureType {
             MeasureType::PerfCounters => write!(f, "perf-counters"),
             #[cfg(target_os = "linux")]
             MeasureType::InstsRetired => write!(f, "insts-retired"),
+            #[cfg(all(target_os = "linux", feature = "callgrind"))]
+            MeasureType::Callgrind => write!(f, "callgrind"),
         }
     }
 }
@@ -141,10 +155,25 @@ impl FromStr for MeasureType {
             "time" => Ok(Self::Time),
             "cycles" => Ok(Self::Cycles),
             "vtune" => Ok(Self::VTune),
+
             #[cfg(target_os = "linux")]
             "perf-counters" => Ok(Self::PerfCounters),
+            #[cfg(not(target_os = "linux"))]
+            "perf-counters" => Err("`perf-counters` measure is only available on Linux"),
+
             #[cfg(target_os = "linux")]
             "insts-retired" => Ok(Self::InstsRetired),
+            #[cfg(not(target_os = "linux"))]
+            "insts-retired" => Err("`insts-retired` measure is only available on Linux"),
+
+            #[cfg(all(target_os = "linux", feature = "callgrind"))]
+            "callgrind" => Ok(Self::Callgrind),
+            #[cfg(not(all(target_os = "linux", feature = "callgrind")))]
+            "callgrind" => Err(
+                "`callgrind` measure is only available on Linux and when the `callgrind` cargo \
+                 feature is enabled",
+            ),
+
             _ => Err("unknown measure type"),
         }
     }
@@ -164,6 +193,8 @@ impl MeasureType {
             Self::PerfCounters => Box::new(counters::CounterMeasure::new()),
             #[cfg(target_os = "linux")]
             Self::InstsRetired => Box::new(insts::InstsRetiredMeasure::new()),
+            #[cfg(all(target_os = "linux", feature = "callgrind"))]
+            Self::Callgrind => Box::new(callgrind::CallgrindMeasure::new()),
         }
     }
 }
diff --git a/test/fixtures/callgrind.client-request.out b/test/fixtures/callgrind.client-request.out
new file mode 100644
index 00000000..5a96f1a2
--- /dev/null
+++ b/test/fixtures/callgrind.client-request.out
@@ -0,0 +1,18 @@
+# callgrind format
+version: 1
+creator: callgrind-3.20.0.GIT
+pid: 773940
+cmd:  target/debug/cgprobe
+part: 1
+
+
+desc: I1 cache: 32768 B, 64 B, 8-way associative
+desc: D1 cache: 32768 B, 64 B, 8-way associative
+desc: LL cache: 8388608 B, 64 B, 16-way associative
+
+desc: Timerange: Basic block 0 - 140028
+desc: Trigger: Client Request: execution/7
+
+positions: line
+events: Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim
+summary: 790139 230038 320058 17 0 8 17 0 8 30004 8 1 1
diff --git a/test/fixtures/callgrind.program-termination.out b/test/fixtures/callgrind.program-termination.out
new file mode 100644
index 00000000..7346470b
--- /dev/null
+++ b/test/fixtures/callgrind.program-termination.out
@@ -0,0 +1,18 @@
+# callgrind format
+version: 1
+creator: callgrind-3.20.0.GIT
+pid: 773940
+cmd:  target/debug/cgprobe
+part: 2
+
+
+desc: I1 cache: 32768 B, 64 B, 8-way associative
+desc: D1 cache: 32768 B, 64 B, 8-way associative
+desc: LL cache: 8388608 B, 64 B, 16-way associative
+
+desc: Timerange: Basic block 140028 - 150282
+desc: Trigger: Program termination
+
+positions: line
+events: Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim
+summary: 18446744073708761429 18446744073709321564 18446744073709231539 18446744073709551595 18446744073709551613 18446744073709551605 18446744073709551595 18446744073709551613 18446744073709551605 18446744073709521611 18446744073709551607 18446744073709551614 18446744073709551614