diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ad18ccd8..f10517b3c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### Changed +- `zeph-bench`: `BenchRunner::new` no longer takes a `_no_deterministic: bool` parameter. + The flag was dead code — deterministic overrides are applied to the provider before + construction via `apply_deterministic_overrides`. All call sites updated (closes #3952). +- `zeph-bench`: scenario-filter validation and skip logic extracted into a shared + `filter_scenarios` helper, eliminating duplicate code in `run_dataset` and + `run_dataset_with_env_factory` (closes #3953). +- `zeph-bench`: `run_dataset`, `run_dataset_with_env_factory`, and `run_one_with_executor` + are now instrumented with `tracing::info_span!` blocks (`bench.run_dataset`, + `bench.run_dataset_with_env_factory`, `bench.scenario`, `bench.run_one`), making bench + runs visible in Perfetto and Jaeger traces (closes #3948). + - `zeph-skills`: skill embedding vectors are now typed as `SkillEmbedding(Vec)` instead of raw `Vec`. The newtype carries its dimension and requires explicit construction via `SkillEmbedding::new(vec, expected_dim)` (validated) or `SkillEmbedding::from_raw(vec)` diff --git a/crates/zeph-bench/README.md b/crates/zeph-bench/README.md index d8b625c15..48849abc5 100644 --- a/crates/zeph-bench/README.md +++ b/crates/zeph-bench/README.md @@ -76,7 +76,7 @@ use zeph_llm::{any::AnyProvider, mock::MockProvider}; # async fn example() -> Result<(), zeph_bench::BenchError> { let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["1945".into()])); -let runner = BenchRunner::new(provider, false); +let runner = BenchRunner::new(provider); let opts = RunOptions::default(); let run = runner .run_dataset(&GaiaLoader::all_levels(), &GaiaEvaluator, Path::new("gaia.jsonl"), opts) diff --git a/crates/zeph-bench/src/runner.rs b/crates/zeph-bench/src/runner.rs index 7aad2945f..8563a82a9 100644 --- a/crates/zeph-bench/src/runner.rs +++ b/crates/zeph-bench/src/runner.rs @@ -18,7 +18,7 @@ //! //! # async fn example() -> Result<(), zeph_bench::BenchError> { //! let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["1945".into()])); -//! let runner = BenchRunner::new(provider, false); +//! let runner = BenchRunner::new(provider); //! let opts = RunOptions::default(); //! let run = runner.run_dataset(&GaiaLoader::all_levels(), &GaiaEvaluator, Path::new("/data/gaia.jsonl"), opts).await?; //! println!("mean score: {:.4}", run.aggregate.mean_score); @@ -163,7 +163,7 @@ impl ToolExecutor for NoopExecutor { /// use zeph_llm::{any::AnyProvider, mock::MockProvider}; /// /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["Paris".into()])); -/// let runner = BenchRunner::new(provider, false); +/// let runner = BenchRunner::new(provider); /// ``` pub struct BenchRunner { provider: AnyProvider, @@ -188,10 +188,10 @@ impl BenchRunner { /// use zeph_llm::{any::AnyProvider, mock::MockProvider}; /// /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![])); - /// let runner = BenchRunner::new(provider, false); + /// let runner = BenchRunner::new(provider); /// ``` #[must_use] - pub fn new(provider: AnyProvider, _no_deterministic: bool) -> Self { + pub fn new(provider: AnyProvider) -> Self { Self { provider, memory_params: None, @@ -217,7 +217,7 @@ impl BenchRunner { /// run_id: "bench-abc".into(), /// dataset: "locomo".into(), /// }; - /// let runner = BenchRunner::new(provider, false).with_memory_params(params); + /// let runner = BenchRunner::new(provider).with_memory_params(params); /// ``` #[must_use] pub fn with_memory_params(mut self, params: BenchMemoryParams) -> Self { @@ -251,16 +251,14 @@ impl BenchRunner { E: Evaluator, { let scenarios = loader.load(path)?; + let filtered = filter_scenarios(&scenarios, &opts, loader.name())?; - if let Some(ref filter) = opts.scenario_filter - && !scenarios.iter().any(|s| &s.id == filter) - { - return Err(BenchError::InvalidFormat(format!( - "scenario '{}' not found in dataset '{}'", - filter, - loader.name() - ))); - } + let _span = tracing::info_span!( + "bench.run_dataset", + dataset = loader.name(), + scenarios = filtered.len(), + ) + .entered(); let model_id = self.provider.model_identifier().to_owned(); @@ -275,17 +273,8 @@ impl BenchRunner { aggregate: crate::results::Aggregate::default(), }; - for scenario in &scenarios { - // Skip if resume is active and scenario already completed. - if opts.completed_ids.contains(&scenario.id) { - continue; - } - // Skip if a single-scenario filter is active. - if let Some(ref filter) = opts.scenario_filter - && &scenario.id != filter - { - continue; - } + for scenario in filtered { + let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered(); let t0 = Instant::now(); let response_text = Box::pin(self.run_one(scenario, opts.memory_mode)).await?; @@ -332,16 +321,14 @@ impl BenchRunner { X: ToolExecutor + Send + Sync + 'static, { let scenarios = loader.load(path)?; + let filtered = filter_scenarios(&scenarios, &opts, loader.name())?; - if let Some(ref filter) = opts.scenario_filter - && !scenarios.iter().any(|s| &s.id == filter) - { - return Err(BenchError::InvalidFormat(format!( - "scenario '{}' not found in dataset '{}'", - filter, - loader.name() - ))); - } + let _span = tracing::info_span!( + "bench.run_dataset_with_env_factory", + dataset = loader.name(), + scenarios = filtered.len(), + ) + .entered(); let model_id = self.provider.model_identifier().to_owned(); @@ -356,15 +343,8 @@ impl BenchRunner { aggregate: crate::results::Aggregate::default(), }; - for scenario in &scenarios { - if opts.completed_ids.contains(&scenario.id) { - continue; - } - if let Some(ref filter) = opts.scenario_filter - && &scenario.id != filter - { - continue; - } + for scenario in filtered { + let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered(); let (executor, trace) = env_factory(scenario)?; let evaluator = TauBenchEvaluator::from_scenario(scenario, trace)?; @@ -436,6 +416,12 @@ impl BenchRunner { memory_mode: MemoryMode, mode: ResponseMode, ) -> Result { + let _span = tracing::info_span!( + "bench.run_one", + scenario_id = %scenario.id, + mode = ?mode, + ) + .entered(); let prompt = scenario.primary_prompt()?.to_owned(); let channel = BenchmarkChannel::new(vec![prompt]); // TODO(multi-turn-history): when loaders emit multiple user turns, push each in @@ -555,6 +541,42 @@ impl BenchRunner { } } +/// Return the subset of `scenarios` that should run given `opts`. +/// +/// Validates that when a `scenario_filter` is set, at least one matching scenario exists in +/// `scenarios`. Then filters out already-completed IDs and non-matching scenarios. +/// +/// # Errors +/// +/// Returns [`BenchError::InvalidFormat`] when `opts.scenario_filter` names a scenario that +/// does not appear in `scenarios`. +fn filter_scenarios<'a>( + scenarios: &'a [Scenario], + opts: &RunOptions, + loader_name: &str, +) -> Result, BenchError> { + if let Some(ref filter) = opts.scenario_filter + && !scenarios.iter().any(|s| &s.id == filter) + { + return Err(BenchError::InvalidFormat(format!( + "scenario '{filter}' not found in dataset '{loader_name}'" + ))); + } + + Ok(scenarios + .iter() + .filter(|s| { + if opts.completed_ids.contains(&s.id) { + return false; + } + if let Some(ref filter) = opts.scenario_filter { + return &s.id == filter; + } + true + }) + .collect()) +} + /// Post-process the raw agent response to extract a clean, terse answer. /// /// Applies these transformations in order: @@ -620,7 +642,7 @@ mod tests { run_id: "bench-abc".into(), dataset: "locomo".into(), }; - let runner = BenchRunner::new(provider, false).with_memory_params(params.clone()); + let runner = BenchRunner::new(provider).with_memory_params(params.clone()); assert!(runner.memory_params.is_some()); let stored = runner.memory_params.unwrap(); assert_eq!(stored.run_id, "bench-abc"); diff --git a/src/commands/bench.rs b/src/commands/bench.rs index 7aba62026..278e6ca87 100644 --- a/src/commands/bench.rs +++ b/src/commands/bench.rs @@ -134,7 +134,6 @@ async fn handle_run_baseline( scenario, base_completed_ids, provider.clone(), - no_deterministic, ) .await?; @@ -146,7 +145,6 @@ async fn handle_run_baseline( &data_dir, &config.llm.embedding_model.clone(), provider, - no_deterministic, ) .await?; @@ -176,7 +174,6 @@ async fn run_memory_off_pass( scenario: Option<&str>, completed_ids: std::collections::HashSet, provider: zeph_llm::any::AnyProvider, - no_deterministic: bool, ) -> anyhow::Result<(BenchRun, ResultWriter)> { let dir = output.join("baseline").join("memory-off"); let writer = ResultWriter::new(&dir)?; @@ -185,7 +182,7 @@ async fn run_memory_off_pass( completed_ids, memory_mode: MemoryMode::Off, }; - let runner = BenchRunner::new(provider, no_deterministic); + let runner = BenchRunner::new(provider); let mut run = dispatch_run(&runner, dataset, data_path, opts).await?; run.status = RunStatus::Completed; run.finished_at = finished_at_now(); @@ -208,7 +205,6 @@ async fn run_memory_on_pass( data_dir: &std::path::Path, embedding_model: &str, provider: zeph_llm::any::AnyProvider, - no_deterministic: bool, ) -> anyhow::Result<(BenchRun, ResultWriter)> { let run_id = format!("bench-on-{}", baseline_run_id_suffix()); let memory_params = BenchMemoryParams { @@ -224,7 +220,7 @@ async fn run_memory_on_pass( completed_ids: std::collections::HashSet::new(), memory_mode: MemoryMode::On, }; - let runner = BenchRunner::new(provider, no_deterministic).with_memory_params(memory_params); + let runner = BenchRunner::new(provider).with_memory_params(memory_params); let mut run = dispatch_run(&runner, dataset, data_path, opts).await?; run.status = RunStatus::Completed; run.finished_at = finished_at_now(); @@ -416,7 +412,7 @@ async fn handle_run( memory_mode: MemoryMode::Off, }; - let runner = BenchRunner::new(provider, no_deterministic); + let runner = BenchRunner::new(provider); let mut run = dispatch_run(&runner, dataset, &data_path, opts).await?; run.status = RunStatus::Completed;