From b1769d6087bb3ca438f0d6a7d75a8db0d275dd2f Mon Sep 17 00:00:00 2001 From: "Andrei G." Date: Wed, 13 May 2026 13:58:56 +0200 Subject: [PATCH] perf(orchestration,subagent): add tracing instrumentation to LLM-bound async paths Add #[tracing::instrument] spans to all primary LLM-bound async functions in zeph-orchestration and zeph-subagent so they appear in local Chrome JSON traces and Perfetto analysis. zeph-orchestration: - LlmPlanner::plan (orchestration.planner.plan, fields: goal_len) - LlmPlanner::plan_with_hint (orchestration.planner.plan_with_hint, fields: goal_len) - LlmAggregator::aggregate (orchestration.aggregator.aggregate, fields: task_count) zeph-subagent: - SubAgentManager::spawn (subagent.manager.spawn, fields: def_name) - SubAgentManager::collect (subagent.manager.collect, fields: task_id) - SubAgentManager::shutdown_all (subagent.manager.shutdown_all) - run_agent_loop (subagent.agent_loop.run, fields: task_id, agent_name) - run_turn (subagent.agent_loop.run_turn, fields: task_id, turn) Closes #3850, #3851 --- CHANGELOG.md | 11 +++++++++++ crates/zeph-orchestration/src/aggregator.rs | 1 + crates/zeph-orchestration/src/planner.rs | 2 ++ crates/zeph-subagent/src/agent_loop.rs | 2 ++ crates/zeph-subagent/src/manager.rs | 6 +++++- 5 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 086fde103..e5ec1f73f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - `zeph-orchestration`: add `Llm(#[from] zeph_llm::LlmError)` typed variant to `OrchestrationError` so callers can pattern-match on root LLM error kinds without string comparison (closes #3842). +### Changed + +- `zeph-orchestration`: add `#[tracing::instrument]` spans to `LlmPlanner::plan`, + `LlmPlanner::plan_with_hint`, and `LlmAggregator::aggregate`. 
Span names follow the + `orchestration.<component>.<operation>` convention with `goal_len` / `task_count` fields + for Perfetto trace analysis (closes #3850). +- `zeph-subagent`: add `#[tracing::instrument]` spans to `SubAgentManager::spawn`, + `SubAgentManager::collect`, `SubAgentManager::shutdown_all`, `run_agent_loop`, and + `run_turn`. Span names follow the `subagent.<component>.<operation>` convention with + `def_name` / `task_id` / `agent_name` / `turn` fields (closes #3851). + ### Performance - `zeph-memory`: replace serial `embed()` calls with a single `embed_batch()` call in diff --git a/crates/zeph-orchestration/src/aggregator.rs b/crates/zeph-orchestration/src/aggregator.rs index 4be556361..b95880b1b 100644 --- a/crates/zeph-orchestration/src/aggregator.rs +++ b/crates/zeph-orchestration/src/aggregator.rs @@ -76,6 +76,7 @@ impl LlmAggregator

{ } impl Aggregator for LlmAggregator

{ + #[tracing::instrument(name = "orchestration.aggregator.aggregate", skip_all, fields(task_count = graph.tasks.len()))] async fn aggregate( &self, graph: &TaskGraph, diff --git a/crates/zeph-orchestration/src/planner.rs b/crates/zeph-orchestration/src/planner.rs index 331de742e..2525d0f6f 100644 --- a/crates/zeph-orchestration/src/planner.rs +++ b/crates/zeph-orchestration/src/planner.rs @@ -139,6 +139,7 @@ pub(crate) struct PlannedTask { } impl Planner for LlmPlanner

{ + #[tracing::instrument(name = "orchestration.planner.plan_with_hint", skip_all, fields(goal_len = goal.len()))] async fn plan_with_hint( &self, goal: &str, @@ -171,6 +172,7 @@ impl Planner for LlmPlanner

{ Ok((graph, usage)) } + #[tracing::instrument(name = "orchestration.planner.plan", skip_all, fields(goal_len = goal.len()))] async fn plan( &self, goal: &str, diff --git a/crates/zeph-subagent/src/agent_loop.rs b/crates/zeph-subagent/src/agent_loop.rs index 1f6f1a4a4..c75c3da40 100644 --- a/crates/zeph-subagent/src/agent_loop.rs +++ b/crates/zeph-subagent/src/agent_loop.rs @@ -370,6 +370,7 @@ enum TurnOutcome { /// Returns a [`TurnOutcome`] that drives the loop control flow in /// [`run_agent_loop`]. #[allow(clippy::too_many_arguments)] +#[tracing::instrument(name = "subagent.agent_loop.run_turn", skip_all, fields(task_id = task_id, turn = *turns))] async fn run_turn( provider: &AnyProvider, executor: &FilteredToolExecutor, @@ -560,6 +561,7 @@ async fn handle_tool_step( } } +#[tracing::instrument(name = "subagent.agent_loop.run", skip_all, fields(task_id = %args.task_id, agent_name = %args.agent_name))] pub(super) async fn run_agent_loop( args: AgentLoopArgs, ) -> Result { diff --git a/crates/zeph-subagent/src/manager.rs b/crates/zeph-subagent/src/manager.rs index b820aa63e..9bf48679e 100644 --- a/crates/zeph-subagent/src/manager.rs +++ b/crates/zeph-subagent/src/manager.rs @@ -701,7 +701,9 @@ impl SubAgentManager { /// [`SubAgentError::ConcurrencyLimit`] if the concurrency limit is exceeded, or /// [`SubAgentError::Invalid`] if the agent requests `bypass_permissions` but the config /// does not allow it (`allow_bypass_permissions: false`). 
- #[allow(clippy::too_many_arguments, clippy::too_many_lines)] // complex algorithm function; both suppressions justified until the function is decomposed in a future refactor + #[allow(clippy::too_many_arguments, clippy::too_many_lines)] + // complex algorithm function; both suppressions justified until the function is decomposed in a future refactor + #[tracing::instrument(name = "subagent.manager.spawn", skip_all, fields(def_name = def_name))] pub fn spawn( &mut self, def_name: &str, @@ -938,6 +940,7 @@ impl SubAgentManager { /// Iterates every agent ID and calls [`cancel`][Self::cancel] on each. /// Unlike [`cancel_all`][Self::cancel_all], this method goes through the normal /// cancel path including hook firing. Prefer this during planned shutdown. + #[tracing::instrument(name = "subagent.manager.shutdown_all", skip_all)] pub fn shutdown_all(&mut self) { let ids: Vec = self.agents.keys().cloned().collect(); for id in ids { @@ -1096,6 +1099,7 @@ impl SubAgentManager { /// /// Returns [`SubAgentError::NotFound`] if the task ID is unknown, /// [`SubAgentError::Spawn`] if the task panicked. + #[tracing::instrument(name = "subagent.manager.collect", skip_all, fields(task_id = task_id))] pub async fn collect(&mut self, task_id: &str) -> Result { let mut handle = self .agents