Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

### Changed

- `zeph-bench`: `BenchRunner::new` no longer takes a `_no_deterministic: bool` parameter.
The flag was dead code — deterministic overrides are applied to the provider before
construction via `apply_deterministic_overrides`. All call sites updated (closes #3952).
- `zeph-bench`: scenario-filter validation and skip logic extracted into a shared
`filter_scenarios` helper, eliminating duplicate code in `run_dataset` and
`run_dataset_with_env_factory` (closes #3953).
- `zeph-bench`: `run_dataset`, `run_dataset_with_env_factory`, and `run_one_with_executor`
are now instrumented with `tracing::info_span!` blocks (`bench.run_dataset`,
`bench.run_dataset_with_env_factory`, `bench.scenario`, `bench.run_one`), making bench
runs visible in Perfetto and Jaeger traces (closes #3948).

- `zeph-skills`: skill embedding vectors are now typed as `SkillEmbedding(Vec<f32>)` instead
of raw `Vec<f32>`. The newtype carries its dimension and requires explicit construction via
`SkillEmbedding::new(vec, expected_dim)` (validated) or `SkillEmbedding::from_raw(vec)`
Expand Down
2 changes: 1 addition & 1 deletion crates/zeph-bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ use zeph_llm::{any::AnyProvider, mock::MockProvider};

# async fn example() -> Result<(), zeph_bench::BenchError> {
let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["1945".into()]));
let runner = BenchRunner::new(provider, false);
let runner = BenchRunner::new(provider);
let opts = RunOptions::default();
let run = runner
.run_dataset(&GaiaLoader::all_levels(), &GaiaEvaluator, Path::new("gaia.jsonl"), opts)
Expand Down
110 changes: 66 additions & 44 deletions crates/zeph-bench/src/runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
//!
//! # async fn example() -> Result<(), zeph_bench::BenchError> {
//! let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["1945".into()]));
//! let runner = BenchRunner::new(provider, false);
//! let runner = BenchRunner::new(provider);
//! let opts = RunOptions::default();
//! let run = runner.run_dataset(&GaiaLoader::all_levels(), &GaiaEvaluator, Path::new("/data/gaia.jsonl"), opts).await?;
//! println!("mean score: {:.4}", run.aggregate.mean_score);
Expand Down Expand Up @@ -163,7 +163,7 @@ impl ToolExecutor for NoopExecutor {
/// use zeph_llm::{any::AnyProvider, mock::MockProvider};
///
/// let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["Paris".into()]));
/// let runner = BenchRunner::new(provider, false);
/// let runner = BenchRunner::new(provider);
/// ```
pub struct BenchRunner {
provider: AnyProvider,
Expand All @@ -188,10 +188,10 @@ impl BenchRunner {
/// use zeph_llm::{any::AnyProvider, mock::MockProvider};
///
/// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
/// let runner = BenchRunner::new(provider, false);
/// let runner = BenchRunner::new(provider);
/// ```
#[must_use]
pub fn new(provider: AnyProvider, _no_deterministic: bool) -> Self {
pub fn new(provider: AnyProvider) -> Self {
Self {
provider,
memory_params: None,
Expand All @@ -217,7 +217,7 @@ impl BenchRunner {
/// run_id: "bench-abc".into(),
/// dataset: "locomo".into(),
/// };
/// let runner = BenchRunner::new(provider, false).with_memory_params(params);
/// let runner = BenchRunner::new(provider).with_memory_params(params);
/// ```
#[must_use]
pub fn with_memory_params(mut self, params: BenchMemoryParams) -> Self {
Expand Down Expand Up @@ -251,16 +251,14 @@ impl BenchRunner {
E: Evaluator,
{
let scenarios = loader.load(path)?;
let filtered = filter_scenarios(&scenarios, &opts, loader.name())?;

if let Some(ref filter) = opts.scenario_filter
&& !scenarios.iter().any(|s| &s.id == filter)
{
return Err(BenchError::InvalidFormat(format!(
"scenario '{}' not found in dataset '{}'",
filter,
loader.name()
)));
}
let _span = tracing::info_span!(
"bench.run_dataset",
dataset = loader.name(),
scenarios = filtered.len(),
)
.entered();

let model_id = self.provider.model_identifier().to_owned();

Expand All @@ -275,17 +273,8 @@ impl BenchRunner {
aggregate: crate::results::Aggregate::default(),
};

for scenario in &scenarios {
// Skip if resume is active and scenario already completed.
if opts.completed_ids.contains(&scenario.id) {
continue;
}
// Skip if a single-scenario filter is active.
if let Some(ref filter) = opts.scenario_filter
&& &scenario.id != filter
{
continue;
}
for scenario in filtered {
let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered();

let t0 = Instant::now();
let response_text = Box::pin(self.run_one(scenario, opts.memory_mode)).await?;
Expand Down Expand Up @@ -332,16 +321,14 @@ impl BenchRunner {
X: ToolExecutor + Send + Sync + 'static,
{
let scenarios = loader.load(path)?;
let filtered = filter_scenarios(&scenarios, &opts, loader.name())?;

if let Some(ref filter) = opts.scenario_filter
&& !scenarios.iter().any(|s| &s.id == filter)
{
return Err(BenchError::InvalidFormat(format!(
"scenario '{}' not found in dataset '{}'",
filter,
loader.name()
)));
}
let _span = tracing::info_span!(
"bench.run_dataset_with_env_factory",
dataset = loader.name(),
scenarios = filtered.len(),
)
.entered();

let model_id = self.provider.model_identifier().to_owned();

Expand All @@ -356,15 +343,8 @@ impl BenchRunner {
aggregate: crate::results::Aggregate::default(),
};

for scenario in &scenarios {
if opts.completed_ids.contains(&scenario.id) {
continue;
}
if let Some(ref filter) = opts.scenario_filter
&& &scenario.id != filter
{
continue;
}
for scenario in filtered {
let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered();

let (executor, trace) = env_factory(scenario)?;
let evaluator = TauBenchEvaluator::from_scenario(scenario, trace)?;
Expand Down Expand Up @@ -436,6 +416,12 @@ impl BenchRunner {
memory_mode: MemoryMode,
mode: ResponseMode,
) -> Result<String, BenchError> {
let _span = tracing::info_span!(
"bench.run_one",
scenario_id = %scenario.id,
mode = ?mode,
)
.entered();
let prompt = scenario.primary_prompt()?.to_owned();
let channel = BenchmarkChannel::new(vec![prompt]);
// TODO(multi-turn-history): when loaders emit multiple user turns, push each in
Expand Down Expand Up @@ -555,6 +541,42 @@ impl BenchRunner {
}
}

/// Return the subset of `scenarios` that should run given `opts`.
///
/// Validates that when a `scenario_filter` is set, at least one matching scenario exists in
/// `scenarios`. Then filters out already-completed IDs and non-matching scenarios.
///
/// # Errors
///
/// Returns [`BenchError::InvalidFormat`] when `opts.scenario_filter` names a scenario that
/// does not appear in `scenarios`.
fn filter_scenarios<'a>(
scenarios: &'a [Scenario],
opts: &RunOptions,
loader_name: &str,
) -> Result<Vec<&'a Scenario>, BenchError> {
if let Some(ref filter) = opts.scenario_filter
&& !scenarios.iter().any(|s| &s.id == filter)
{
return Err(BenchError::InvalidFormat(format!(
"scenario '{filter}' not found in dataset '{loader_name}'"
)));
}

Ok(scenarios
.iter()
.filter(|s| {
if opts.completed_ids.contains(&s.id) {
return false;
}
if let Some(ref filter) = opts.scenario_filter {
return &s.id == filter;
}
true
})
.collect())
}

/// Post-process the raw agent response to extract a clean, terse answer.
///
/// Applies these transformations in order:
Expand Down Expand Up @@ -620,7 +642,7 @@ mod tests {
run_id: "bench-abc".into(),
dataset: "locomo".into(),
};
let runner = BenchRunner::new(provider, false).with_memory_params(params.clone());
let runner = BenchRunner::new(provider).with_memory_params(params.clone());
assert!(runner.memory_params.is_some());
let stored = runner.memory_params.unwrap();
assert_eq!(stored.run_id, "bench-abc");
Expand Down
10 changes: 3 additions & 7 deletions src/commands/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,6 @@ async fn handle_run_baseline(
scenario,
base_completed_ids,
provider.clone(),
no_deterministic,
)
.await?;

Expand All @@ -146,7 +145,6 @@ async fn handle_run_baseline(
&data_dir,
&config.llm.embedding_model.clone(),
provider,
no_deterministic,
)
.await?;

Expand Down Expand Up @@ -176,7 +174,6 @@ async fn run_memory_off_pass(
scenario: Option<&str>,
completed_ids: std::collections::HashSet<String>,
provider: zeph_llm::any::AnyProvider,
no_deterministic: bool,
) -> anyhow::Result<(BenchRun, ResultWriter)> {
let dir = output.join("baseline").join("memory-off");
let writer = ResultWriter::new(&dir)?;
Expand All @@ -185,7 +182,7 @@ async fn run_memory_off_pass(
completed_ids,
memory_mode: MemoryMode::Off,
};
let runner = BenchRunner::new(provider, no_deterministic);
let runner = BenchRunner::new(provider);
let mut run = dispatch_run(&runner, dataset, data_path, opts).await?;
run.status = RunStatus::Completed;
run.finished_at = finished_at_now();
Expand All @@ -208,7 +205,6 @@ async fn run_memory_on_pass(
data_dir: &std::path::Path,
embedding_model: &str,
provider: zeph_llm::any::AnyProvider,
no_deterministic: bool,
) -> anyhow::Result<(BenchRun, ResultWriter)> {
let run_id = format!("bench-on-{}", baseline_run_id_suffix());
let memory_params = BenchMemoryParams {
Expand All @@ -224,7 +220,7 @@ async fn run_memory_on_pass(
completed_ids: std::collections::HashSet::new(),
memory_mode: MemoryMode::On,
};
let runner = BenchRunner::new(provider, no_deterministic).with_memory_params(memory_params);
let runner = BenchRunner::new(provider).with_memory_params(memory_params);
let mut run = dispatch_run(&runner, dataset, data_path, opts).await?;
run.status = RunStatus::Completed;
run.finished_at = finished_at_now();
Expand Down Expand Up @@ -416,7 +412,7 @@ async fn handle_run(
memory_mode: MemoryMode::Off,
};

let runner = BenchRunner::new(provider, no_deterministic);
let runner = BenchRunner::new(provider);
let mut run = dispatch_run(&runner, dataset, &data_path, opts).await?;

run.status = RunStatus::Completed;
Expand Down
Loading