From a24497642f38e8fa183df9ac38df35c601f8c346 Mon Sep 17 00:00:00 2001 From: DJ Majumdar Date: Tue, 24 Mar 2026 00:53:11 -0700 Subject: [PATCH 1/3] feat!: implement priority aging for anti-starvation dispatch (#37, phase 1) Add dispatch-time priority aging that gradually promotes tasks waiting longer than a configurable grace period. Effective priority is computed in SQL (no write amplification) and capped at `max_effective_priority`. - New `AgingConfig` type with grace_period, aging_interval, max_effective_priority, and urgent_threshold - Modified peek_next/pop_next/pop_next_batch with aging ORDER BY clause - Schema: pause_duration_ms and paused_at_ms columns for clock freezing - All pause/resume paths accumulate pause_duration_ms correctly - Crash recovery accumulates stale pause duration - TaskEventHeader carries base_priority and effective_priority - SchedulerSnapshot exposes aging_config - Child tasks inherit parent's effective priority when aging enabled - SchedulerBuilder::priority_aging() for opt-in configuration - Zero overhead when aging is disabled (original query preserved) --- benches/history.rs | 2 +- benches/scheduler.rs | 2 +- migrations/001_tasks.sql | 2 + src/lib.rs | 6 +- src/registry/context.rs | 3 + src/registry/domain_context.rs | 15 ++ src/scheduler/aging.rs | 214 +++++++++++++++ src/scheduler/builder.rs | 22 ++ src/scheduler/event.rs | 11 + src/scheduler/mod.rs | 5 + src/scheduler/queries.rs | 1 + src/scheduler/run_loop.rs | 24 +- src/scheduler/spawn.rs | 4 +- src/scheduler/spawn/context.rs | 5 +- src/scheduler/submit.rs | 7 + src/store/dependencies.rs | 10 +- src/store/hierarchy.rs | 34 +-- src/store/lifecycle/cancel_expire.rs | 63 ++++- src/store/lifecycle/tests.rs | 385 ++++++++++++++++++++++++--- src/store/lifecycle/transitions.rs | 274 ++++++++++++++----- src/store/mod.rs | 22 +- src/store/query/tests.rs | 10 +- src/store/row_mapping.rs | 4 + src/store/submit/mod.rs | 5 +- src/store/submit/tests.rs | 12 +- src/task/mod.rs | 39 +++ 
src/task/tests.rs | 2 + tests/integration.rs | 2 + tests/integration/aging.rs | 177 ++++++++++++ tests/integration/dependencies.rs | 26 +- tests/integration/scheduler_core.rs | 18 +- 31 files changed, 1216 insertions(+), 190 deletions(-) create mode 100644 src/scheduler/aging.rs create mode 100644 tests/integration/aging.rs diff --git a/benches/history.rs b/benches/history.rs index 13053a1..b2a9e2e 100644 --- a/benches/history.rs +++ b/benches/history.rs @@ -28,7 +28,7 @@ async fn store_with_history(n: usize) -> TaskStore { .submit(&TaskSubmission::new(task_type).key(format!("h-{i}"))) .await .unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store.complete(task.id, &budget).await.unwrap(); } store diff --git a/benches/scheduler.rs b/benches/scheduler.rs index eee4e6b..f4d02dd 100644 --- a/benches/scheduler.rs +++ b/benches/scheduler.rs @@ -211,7 +211,7 @@ fn bench_peek_next_varying_depth(c: &mut Criterion) { async move { let start = Instant::now(); for _ in 0..iters { - let _ = store.peek_next().await.unwrap(); + let _ = store.peek_next(None).await.unwrap(); } start.elapsed() } diff --git a/migrations/001_tasks.sql b/migrations/001_tasks.sql index 45df5a6..73fe14c 100644 --- a/migrations/001_tasks.sql +++ b/migrations/001_tasks.sql @@ -36,6 +36,8 @@ CREATE TABLE IF NOT EXISTS tasks ( recurring_execution_count INTEGER NOT NULL DEFAULT 0, recurring_paused INTEGER NOT NULL DEFAULT 0, pause_reasons INTEGER NOT NULL DEFAULT 0, + pause_duration_ms INTEGER NOT NULL DEFAULT 0, + paused_at_ms INTEGER DEFAULT NULL, UNIQUE(key) ); diff --git a/src/lib.rs b/src/lib.rs index d2af21b..2da1c19 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -808,9 +808,9 @@ pub use resource::network_pressure::NetworkPressure; pub use resource::sampler::SamplerConfig; pub use resource::{ResourceReader, ResourceSampler, ResourceSnapshot}; pub use scheduler::{ - EstimatedProgress, GroupLimits, PausedGroupInfo, ProgressReporter, 
RateLimit, RateLimitInfo, - Scheduler, SchedulerBuilder, SchedulerConfig, SchedulerEvent, SchedulerSnapshot, ShutdownMode, - TaskEventHeader, TaskProgress, + AgingConfig, EstimatedProgress, GroupLimits, PausedGroupInfo, ProgressReporter, RateLimit, + RateLimitInfo, Scheduler, SchedulerBuilder, SchedulerConfig, SchedulerEvent, SchedulerSnapshot, + ShutdownMode, TaskEventHeader, TaskProgress, }; pub use store::{RetentionPolicy, StoreConfig, StoreError, TaskStore}; pub use task::{ diff --git a/src/registry/context.rs b/src/registry/context.rs index 6a3d466..042de31 100644 --- a/src/registry/context.rs +++ b/src/registry/context.rs @@ -42,6 +42,9 @@ pub struct TaskContext { /// Name of the module that owns this task (e.g. `"media"`). Empty string for /// tasks running outside the module system (via `Scheduler::new`). pub(crate) owning_module: String, + /// Aging config from the scheduler, used for child priority inheritance. + /// `None` = aging disabled. + pub(crate) aging_config: Option>, } impl TaskContext { diff --git a/src/registry/domain_context.rs b/src/registry/domain_context.rs index 3d96405..d5ee8b1 100644 --- a/src/registry/domain_context.rs +++ b/src/registry/domain_context.rs @@ -268,6 +268,10 @@ impl<'a, D: DomainKey, T: TypedTask> ChildSpawnBuilder<'a, D, T> { } /// Submit the child task. + /// + /// When aging is enabled and no explicit priority override is set, + /// the child inherits the higher of the parent's effective priority + /// and the child's configured priority (lower numeric value wins). 
pub async fn submit(self) -> Result { let mut sub = TaskSubmission::from_typed(&self.task); if let Some(k) = self.override_key { @@ -275,6 +279,17 @@ impl<'a, D: DomainKey, T: TypedTask> ChildSpawnBuilder<'a, D, T> { } if let Some(p) = self.override_priority { sub = sub.priority(p); + } else if let Some(ref config) = self.ctx.aging_config { + let parent = self.ctx.record(); + let parent_effective = parent.effective_priority(Some(config)); + // Take the higher priority (lower numeric value) of parent's + // effective and child's configured priority. + let child_config = ::config() + .priority + .unwrap_or(crate::priority::Priority::NORMAL); + let inherited = + crate::priority::Priority::new(parent_effective.value().min(child_config.value())); + sub = sub.priority(inherited); } if let Some(d) = self.override_ttl { sub = sub.ttl(d); diff --git a/src/scheduler/aging.rs b/src/scheduler/aging.rs new file mode 100644 index 0000000..ec82fb0 --- /dev/null +++ b/src/scheduler/aging.rs @@ -0,0 +1,214 @@ +//! Priority aging — anti-starvation for low-priority tasks. +//! +//! When enabled, the scheduler computes an *effective priority* for each +//! pending task at dispatch time. Tasks waiting longer than `grace_period` +//! are gradually promoted, preventing starvation when high-priority tasks +//! arrive continuously. +//! +//! The stored priority is never mutated — effective priority is a +//! dispatch-time computation visible in snapshots and events. + +use std::time::Duration; + +use serde::{Deserialize, Serialize}; + +use crate::priority::Priority; + +/// Configuration for priority aging (anti-starvation). 
+/// +/// When enabled, the scheduler computes an *effective priority* for each +/// pending task at dispatch time: +/// +/// ```text +/// age = now - created_at - pause_duration +/// promotions = max(0, (age - grace_period) / aging_interval) +/// effective = max(base_priority - promotions, max_effective_priority) +/// ``` +/// +/// Lower numeric value = higher priority. Aging *decreases* the numeric +/// value (promotes the task). `max_effective_priority` is the floor +/// (highest priority a task can age into). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgingConfig { + /// Time a task must wait before aging begins. Default: 5 minutes. + pub grace_period: Duration, + /// Interval between each one-step priority promotion. Default: 60 seconds. + pub aging_interval: Duration, + /// Priority ceiling — tasks cannot age above this level. + /// Default: `Priority::HIGH` (64). Use `Priority::REALTIME` to allow + /// aging to the absolute highest level. + pub max_effective_priority: Priority, + /// When effective priority reaches this level, the task may bypass + /// group weight allocation (dispatched from the global pool). + /// `None` disables the urgent override. Default: `None`. + /// + /// **Invariant:** Must be `>= max_effective_priority` numerically + /// (i.e. no higher priority than the ceiling), since tasks cannot age past + /// `max_effective_priority`. Validated at build time. + pub urgent_threshold: Option, +} + +impl Default for AgingConfig { + fn default() -> Self { + Self { + grace_period: Duration::from_secs(300), + aging_interval: Duration::from_secs(60), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + } + } +} + +impl AgingConfig { + /// Validate configuration invariants. Called by `SchedulerBuilder::build()`.
+ pub fn validate(&self) -> Result<(), &'static str> { + if self.aging_interval.is_zero() { + return Err("aging_interval must be non-zero"); + } + if let Some(urgent) = self.urgent_threshold { + // Lower numeric value = higher priority. urgent_threshold must + // be reachable: its numeric value >= max_effective_priority. + if urgent.value() < self.max_effective_priority.value() { + return Err( + "urgent_threshold is higher priority than max_effective_priority — \ + tasks can never age past max_effective_priority, so the \ + urgent threshold would never trigger", + ); + } + } + Ok(()) + } +} + +/// Bind parameters for the aging ORDER BY clause. +/// Computed once per dispatch cycle, passed to peek/pop queries. +pub struct AgingParams { + pub now_ms: i64, + pub grace_period_ms: i64, + pub aging_interval_ms: i64, + pub max_effective_priority: i64, +} + +impl AgingParams { + pub fn from_config(config: &AgingConfig) -> Self { + Self { + now_ms: chrono::Utc::now().timestamp_millis(), + grace_period_ms: config.grace_period.as_millis() as i64, + aging_interval_ms: config.aging_interval.as_millis().max(1) as i64, + max_effective_priority: config.max_effective_priority.value() as i64, + } + } +} + +/// Compute effective priority for a task record (used in events, snapshots, +/// not in the dispatch hot path which uses SQL). 
+pub fn effective_priority( + base: Priority, + created_at_ms: i64, + pause_duration_ms: i64, + config: &AgingConfig, +) -> Priority { + let now_ms = chrono::Utc::now().timestamp_millis(); + let age_ms = now_ms - created_at_ms - pause_duration_ms; + let grace_ms = config.grace_period.as_millis() as i64; + let interval_ms = config.aging_interval.as_millis().max(1) as i64; + let promotions = ((age_ms - grace_ms).max(0) / interval_ms).min(255) as u8; + let effective = base.value().saturating_sub(promotions); + let floor = config.max_effective_priority.value().min(base.value()); // never demote the base + Priority::new(effective.max(floor)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn effective_priority_within_grace() { + let config = AgingConfig { + grace_period: Duration::from_secs(300), + aging_interval: Duration::from_secs(60), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + }; + // Task created just now — within grace period. + let now_ms = chrono::Utc::now().timestamp_millis(); + let result = effective_priority(Priority::IDLE, now_ms, 0, &config); + assert_eq!(result.value(), Priority::IDLE.value()); + } + + #[test] + fn effective_priority_one_promotion() { + let config = AgingConfig { + grace_period: Duration::from_secs(300), + aging_interval: Duration::from_secs(60), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + }; + // Task created 361 seconds ago (grace 300 + 1 interval of 60 + 1). + let now_ms = chrono::Utc::now().timestamp_millis(); + let created_at_ms = now_ms - 361_000; + let result = effective_priority(Priority::IDLE, created_at_ms, 0, &config); + assert_eq!(result.value(), Priority::IDLE.value() - 1); + } + + #[test] + fn effective_priority_capped_at_max() { + let config = AgingConfig { + grace_period: Duration::from_secs(0), + aging_interval: Duration::from_secs(1), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + }; + // Task created long ago — many promotions, but capped.
+ let now_ms = chrono::Utc::now().timestamp_millis(); + let created_at_ms = now_ms - 1_000_000_000; // ~11.6 days + let result = effective_priority(Priority::IDLE, created_at_ms, 0, &config); + assert_eq!(result.value(), Priority::HIGH.value()); + } + + #[test] + fn effective_priority_with_pause_duration() { + let config = AgingConfig { + grace_period: Duration::from_secs(300), + aging_interval: Duration::from_secs(60), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + }; + // Task created 400s ago but paused for 200s → effective age = 200s < grace. + let now_ms = chrono::Utc::now().timestamp_millis(); + let created_at_ms = now_ms - 400_000; + let result = effective_priority(Priority::IDLE, created_at_ms, 200_000, &config); + assert_eq!(result.value(), Priority::IDLE.value()); + } + + #[test] + fn default_config_values() { + let config = AgingConfig::default(); + assert_eq!(config.grace_period, Duration::from_secs(300)); + assert_eq!(config.aging_interval, Duration::from_secs(60)); + assert_eq!( + config.max_effective_priority.value(), + Priority::HIGH.value() + ); + assert!(config.urgent_threshold.is_none()); + } + + #[test] + fn validate_rejects_unreachable_urgent() { + let config = AgingConfig { + urgent_threshold: Some(Priority::REALTIME), // value 0 < HIGH value 64 + max_effective_priority: Priority::HIGH, + ..Default::default() + }; + assert!(config.validate().is_err()); + } + + #[test] + fn validate_rejects_zero_interval() { + let config = AgingConfig { + aging_interval: Duration::ZERO, + ..Default::default() + }; + assert!(config.validate().is_err()); + } +} diff --git a/src/scheduler/builder.rs b/src/scheduler/builder.rs index 99709bc..da3b145 100644 --- a/src/scheduler/builder.rs +++ b/src/scheduler/builder.rs @@ -14,6 +14,7 @@ use crate::resource::sampler::{SamplerConfig, SmoothedReader}; use crate::resource::{ResourceReader, ResourceSampler}; use crate::store::{StoreConfig, StoreError, TaskStore}; +use super::aging::AgingConfig; use
super::rate_limit::RateLimit; use super::event::{SchedulerConfig, ShutdownMode}; @@ -131,6 +132,20 @@ impl SchedulerBuilder { self } + /// Enable priority aging with the given configuration. + /// + /// When enabled, tasks that wait longer than `grace_period` in the + /// pending queue are gradually promoted in effective priority, up to + /// `max_effective_priority`. This prevents starvation of low-priority + /// work when high-priority tasks arrive continuously. + /// + /// Effective priority is computed at dispatch time — the stored priority + /// is never mutated. Aging is visible in snapshots and events. + pub fn priority_aging(mut self, config: AgingConfig) -> Self { + self.config.aging_config = Some(config); + self + } + /// Set the poll interval. Default: 500ms. pub fn poll_interval(mut self, interval: Duration) -> Self { self.config.poll_interval = interval; @@ -450,6 +465,13 @@ impl SchedulerBuilder { self.app_state_entries, )); + // Validate aging config if present. + if let Some(ref aging) = self.config.aging_config { + aging + .validate() + .map_err(|e| StoreError::Database(e.into()))?; + } + let scheduler = Scheduler::with_gate( store, self.config, diff --git a/src/scheduler/event.rs b/src/scheduler/event.rs index 369e652..02ab93e 100644 --- a/src/scheduler/event.rs +++ b/src/scheduler/event.rs @@ -54,6 +54,8 @@ pub struct SchedulerSnapshot { pub paused_groups: Vec, /// Configured rate limits with current utilization. pub rate_limits: Vec, + /// Priority aging configuration (if enabled). + pub aging_config: Option, } /// Information about a paused group for snapshot/dashboard display. @@ -88,6 +90,12 @@ pub struct TaskEventHeader { pub label: String, /// Key-value metadata tags from the task record. pub tags: HashMap, + /// Stored (base) priority. + pub base_priority: Priority, + /// Effective priority at the time the event was emitted. 
+ /// Computed by the scheduler from the `AgingConfig`; equals + /// `base_priority` when aging is disabled or the task hasn't aged. + pub effective_priority: Priority, } // ── Events ────────────────────────────────────────────────────────── @@ -288,6 +296,8 @@ pub struct SchedulerConfig { /// How often to sweep for expired tasks. `None` disables periodic sweeps /// (dispatch-time checks still apply). Default: `Some(30s)`. pub expiry_sweep_interval: Option, + /// Priority aging configuration. `None` (default) disables aging. + pub aging_config: Option, } impl Default for SchedulerConfig { @@ -303,6 +313,7 @@ impl Default for SchedulerConfig { cancel_hook_timeout: Duration::from_secs(30), default_ttl: None, expiry_sweep_interval: Some(Duration::from_secs(30)), + aging_config: None, } } } diff --git a/src/scheduler/mod.rs b/src/scheduler/mod.rs index b8c0c73..2962876 100644 --- a/src/scheduler/mod.rs +++ b/src/scheduler/mod.rs @@ -21,6 +21,7 @@ //! See the [crate-level docs](crate) for a full walkthrough of the task //! lifecycle, common patterns, and how the dispatch loop works. +pub mod aging; mod builder; mod control; pub(crate) mod dispatch; @@ -77,6 +78,7 @@ pub(crate) struct FailureMsg { pub retryable: bool, pub metrics: IoBudget, } +pub use aging::AgingConfig; pub use event::{ PausedGroupInfo, SchedulerConfig, SchedulerEvent, SchedulerSnapshot, ShutdownMode, TaskEventHeader, @@ -195,6 +197,8 @@ pub(crate) struct SchedulerInner { pub(crate) failure_tx: tokio::sync::mpsc::UnboundedSender, /// Receive side (leader election + run loop drain). pub(crate) failure_rx: std::sync::Arc>>, + /// Priority aging configuration. `None` = aging disabled. + pub(crate) aging_config: Option>, } /// IO-aware priority scheduler. 
@@ -342,6 +346,7 @@ impl Scheduler { completion_rx: std::sync::Arc::new(Mutex::new(completion_rx)), failure_tx, failure_rx: std::sync::Arc::new(Mutex::new(failure_rx)), + aging_config: config.aging_config.map(Arc::new), }), } } diff --git a/src/scheduler/queries.rs b/src/scheduler/queries.rs index 2bc3e2b..e029590 100644 --- a/src/scheduler/queries.rs +++ b/src/scheduler/queries.rs @@ -187,6 +187,7 @@ impl Scheduler { blocked_count, paused_groups, rate_limits, + aging_config: self.inner.aging_config.as_ref().map(|arc| (**arc).clone()), }) } } diff --git a/src/scheduler/run_loop.rs b/src/scheduler/run_loop.rs index c2d0788..e41d7bf 100644 --- a/src/scheduler/run_loop.rs +++ b/src/scheduler/run_loop.rs @@ -6,6 +6,7 @@ use std::time::Duration; use tokio_util::sync::CancellationToken; +use crate::scheduler::aging::AgingParams; use crate::store::StoreError; use crate::task::IoBudget; @@ -35,6 +36,7 @@ impl Scheduler { completion_rx: self.inner.completion_rx.clone(), failure_tx: self.inner.failure_tx.clone(), failure_rx: self.inner.failure_rx.clone(), + aging_config: self.inner.aging_config.clone(), } } @@ -50,11 +52,18 @@ impl Scheduler { return Ok(false); } + // Compute aging params once per dispatch attempt. + let aging = self + .inner + .aging_config + .as_ref() + .map(|c| AgingParams::from_config(c)); + // Fast path: no gate checks needed, use pop_next() (single SQL) // instead of peek_next() + gate.admit() + claim_task() (2 SQL). // pop_next() skips expired tasks via its WHERE clause. if self.inner.fast_dispatch.load(AtomicOrdering::Relaxed) { - let Some(mut task) = self.inner.store.pop_next().await? else { + let Some(mut task) = self.inner.store.pop_next(aging.as_ref()).await? else { return Ok(false); }; self.inner @@ -65,7 +74,7 @@ impl Scheduler { } // Slow path: peek → gate check → claim. - let Some(mut candidate) = self.inner.store.peek_next().await? else { + let Some(mut candidate) = self.inner.store.peek_next(aging.as_ref()).await? 
else { return Ok(false); }; self.inner @@ -264,6 +273,11 @@ impl Scheduler { /// checks), falling back to one-at-a-time dispatch on the slow path. async fn dispatch_pending(&self) -> Result<(), StoreError> { if self.inner.fast_dispatch.load(AtomicOrdering::Relaxed) { + let aging = self + .inner + .aging_config + .as_ref() + .map(|c| AgingParams::from_config(c)); loop { let active_count = self.inner.active.count(); let max = self.inner.max_concurrency.load(AtomicOrdering::Relaxed); @@ -271,7 +285,11 @@ impl Scheduler { break; } let available = max - active_count; - let mut tasks = self.inner.store.pop_next_batch(available).await?; + let mut tasks = self + .inner + .store + .pop_next_batch(available, aging.as_ref()) + .await?; if tasks.is_empty() { break; } diff --git a/src/scheduler/spawn.rs b/src/scheduler/spawn.rs index df681c7..6d7a016 100644 --- a/src/scheduler/spawn.rs +++ b/src/scheduler/spawn.rs @@ -71,10 +71,10 @@ pub(crate) async fn spawn_task( } } - // Emit dispatched event. + // Emit dispatched event with aging-aware effective priority. emit_event( &ctx.event_tx, - SchedulerEvent::Dispatched(task.event_header()), + SchedulerEvent::Dispatched(task.event_header_with_aging(ctx.aging_config.as_deref())), ); // Build deps for handlers (cloned from SpawnContext since they move into the spawned future). diff --git a/src/scheduler/spawn/context.rs b/src/scheduler/spawn/context.rs index beb5bd3..6c2ae71 100644 --- a/src/scheduler/spawn/context.rs +++ b/src/scheduler/spawn/context.rs @@ -45,6 +45,8 @@ pub(crate) struct SpawnContext { pub failure_rx: std::sync::Arc< tokio::sync::Mutex>, >, + /// Priority aging configuration. `None` = aging disabled. 
+ pub aging_config: Option>, } /// Output of task context construction — everything needed to insert into the @@ -86,7 +88,7 @@ pub(crate) fn build_task_context(task: &TaskRecord, spawn_ctx: &SpawnContext) -> record: task.clone(), token: token.clone(), progress: ProgressReporter::new( - task.event_header(), + task.event_header_with_aging(spawn_ctx.aging_config.as_deref()), spawn_ctx.event_tx.clone(), spawn_ctx.active.clone(), io.clone(), @@ -98,6 +100,7 @@ pub(crate) fn build_task_context(task: &TaskRecord, spawn_ctx: &SpawnContext) -> io: io.clone(), module_registry: spawn_ctx.module_registry.clone(), owning_module: owning_module.clone(), + aging_config: spawn_ctx.aging_config.clone(), }; PreparedTask { ctx, io, token } diff --git a/src/scheduler/submit.rs b/src/scheduler/submit.rs index ac40314..bde4c72 100644 --- a/src/scheduler/submit.rs +++ b/src/scheduler/submit.rs @@ -52,6 +52,7 @@ impl Scheduler { // Emit superseded event — we need the old record's header. // The old task is already in history, so build header from // submission info. 
+ let priority = sub.priority; let old_header = super::event::TaskEventHeader { task_id: *replaced_task_id, module: sub @@ -63,6 +64,8 @@ impl Scheduler { key: sub.effective_key(), label: sub.label.clone(), tags: sub.tags.clone(), + base_priority: priority, + effective_priority: priority, }; emit_event( &self.inner.event_tx, @@ -120,6 +123,7 @@ impl Scheduler { } = outcome { self.handle_superseded_active(*replaced_task_id).await; + let priority = sub.priority; let old_header = super::event::TaskEventHeader { task_id: *replaced_task_id, module: sub @@ -131,6 +135,8 @@ impl Scheduler { key: sub.effective_key(), label: sub.label.clone(), tags: sub.tags.clone(), + base_priority: priority, + effective_priority: priority, }; emit_event( &self.inner.event_tx, @@ -548,6 +554,7 @@ impl Scheduler { io, module_registry, owning_module, + aging_config: None, }; match tokio::time::timeout(timeout, executor.on_cancel_erased(&ctx)).await { diff --git a/src/store/dependencies.rs b/src/store/dependencies.rs index 4b756c2..75d5d39 100644 --- a/src/store/dependencies.rs +++ b/src/store/dependencies.rs @@ -60,13 +60,16 @@ impl TaskStore { // Step 3: If any newly-unblocked task's group is paused, downgrade to // paused with the GROUP reason bit instead of leaving it as pending. + let now_ms = chrono::Utc::now().timestamp_millis(); for (task_id,) in &unblocked { sqlx::query( "UPDATE tasks SET status = 'paused', - pause_reasons = pause_reasons | 8 + pause_reasons = pause_reasons | 8, + paused_at_ms = CASE WHEN paused_at_ms IS NULL THEN ? ELSE paused_at_ms END WHERE id = ? AND status = 'pending' AND group_key IN (SELECT group_key FROM paused_groups)", ) + .bind(now_ms) .bind(task_id) .execute(&mut **conn) .await?; @@ -186,12 +189,15 @@ impl TaskStore { .await?; if result.rows_affected() > 0 { // If the task's group is paused, downgrade to paused. 
+ let pause_now_ms = chrono::Utc::now().timestamp_millis(); sqlx::query( "UPDATE tasks SET status = 'paused', - pause_reasons = pause_reasons | 8 + pause_reasons = pause_reasons | 8, + paused_at_ms = CASE WHEN paused_at_ms IS NULL THEN ? ELSE paused_at_ms END WHERE id = ? AND status = 'pending' AND group_key IN (SELECT group_key FROM paused_groups)", ) + .bind(pause_now_ms) .bind(dep_id) .execute(&mut **conn) .await?; diff --git a/src/store/hierarchy.rs b/src/store/hierarchy.rs index 7c1fe75..b677425 100644 --- a/src/store/hierarchy.rs +++ b/src/store/hierarchy.rs @@ -244,7 +244,7 @@ mod tests { let sub = make_submission("waiter", Priority::NORMAL); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store.set_waiting(task.id, None).await.unwrap(); @@ -262,13 +262,13 @@ mod tests { let parent_sub = make_submission("parent", Priority::NORMAL); let parent_id = store.submit(&parent_sub).await.unwrap().id().unwrap(); - store.pop_next().await.unwrap(); + store.pop_next(None).await.unwrap(); store.set_waiting(parent_id, None).await.unwrap(); let mut child_sub = make_submission("child", Priority::NORMAL); child_sub.parent_id = Some(parent_id); store.submit(&child_sub).await.unwrap(); - let child = store.pop_next().await.unwrap().unwrap(); + let child = store.pop_next(None).await.unwrap().unwrap(); store .complete(child.id, &IoBudget::default()) .await @@ -284,7 +284,7 @@ mod tests { let parent_sub = make_submission("parent", Priority::NORMAL); let parent_id = store.submit(&parent_sub).await.unwrap().id().unwrap(); - store.pop_next().await.unwrap(); + store.pop_next(None).await.unwrap(); store.set_waiting(parent_id, None).await.unwrap(); for i in 0..2 { @@ -292,7 +292,7 @@ mod tests { sub.parent_id = Some(parent_id); store.submit(&sub).await.unwrap(); } - let child = store.pop_next().await.unwrap().unwrap(); + let child = store.pop_next(None).await.unwrap().unwrap(); store 
.complete(child.id, &IoBudget::default()) .await @@ -308,13 +308,13 @@ mod tests { let parent_sub = make_submission("parent", Priority::NORMAL); let parent_id = store.submit(&parent_sub).await.unwrap().id().unwrap(); - store.pop_next().await.unwrap(); + store.pop_next(None).await.unwrap(); store.set_waiting(parent_id, None).await.unwrap(); let mut child_sub = make_submission("child", Priority::NORMAL); child_sub.parent_id = Some(parent_id); store.submit(&child_sub).await.unwrap(); - let child = store.pop_next().await.unwrap().unwrap(); + let child = store.pop_next(None).await.unwrap().unwrap(); store .fail( child.id, @@ -341,7 +341,7 @@ mod tests { let parent_sub = make_submission("parent", Priority::NORMAL); let parent_id = store.submit(&parent_sub).await.unwrap().id().unwrap(); - let _parent = store.pop_next().await.unwrap().unwrap(); + let _parent = store.pop_next(None).await.unwrap().unwrap(); for i in 0..3 { let mut sub = make_submission(&format!("child-{i}"), Priority::NORMAL); @@ -349,7 +349,7 @@ mod tests { store.submit(&sub).await.unwrap(); } - let running_child = store.pop_next().await.unwrap().unwrap(); + let running_child = store.pop_next(None).await.unwrap().unwrap(); let running_ids = store.cancel_children(parent_id).await.unwrap(); assert_eq!(running_ids.len(), 1); @@ -365,12 +365,12 @@ mod tests { let parent_sub = make_submission("parent", Priority::NORMAL); let parent_id = store.submit(&parent_sub).await.unwrap().id().unwrap(); - let _parent = store.pop_next().await.unwrap().unwrap(); + let _parent = store.pop_next(None).await.unwrap().unwrap(); let mut child_sub = make_submission("child", Priority::NORMAL); child_sub.parent_id = Some(parent_id); store.submit(&child_sub).await.unwrap(); - let child = store.pop_next().await.unwrap().unwrap(); + let child = store.pop_next(None).await.unwrap().unwrap(); store .complete(child.id, &IoBudget::default()) @@ -390,7 +390,7 @@ mod tests { sub.fail_fast = false; store.submit(&sub).await.unwrap(); - let task = 
store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert!(!task.fail_fast); store.complete(task.id, &IoBudget::default()).await.unwrap(); @@ -405,7 +405,7 @@ mod tests { let sub = make_submission("fin", Priority::NORMAL); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store.set_waiting(task.id, None).await.unwrap(); store.set_running_for_finalize(task.id).await.unwrap(); @@ -428,7 +428,7 @@ mod tests { .tag("env", "prod") .tag("region", "us-east"); let parent_id = store.submit(&parent_sub).await.unwrap().id().unwrap(); - let mut parent = store.pop_next().await.unwrap().unwrap(); + let mut parent = store.pop_next(None).await.unwrap().unwrap(); store .populate_tags(std::slice::from_mut(&mut parent)) .await @@ -466,7 +466,7 @@ mod tests { .tag("env", "prod") .tag("region", "us-east"); let parent_id = store.submit(&parent_sub).await.unwrap().id().unwrap(); - let mut parent = store.pop_next().await.unwrap().unwrap(); + let mut parent = store.pop_next(None).await.unwrap().unwrap(); store .populate_tags(std::slice::from_mut(&mut parent)) .await @@ -501,13 +501,13 @@ mod tests { let parent_sub = make_submission("parent", Priority::NORMAL); let parent_id = store.submit(&parent_sub).await.unwrap().id().unwrap(); - store.pop_next().await.unwrap(); + store.pop_next(None).await.unwrap(); store.set_waiting(parent_id, None).await.unwrap(); let mut child_sub = make_submission("child", Priority::NORMAL); child_sub.parent_id = Some(parent_id); store.submit(&child_sub).await.unwrap(); - let child = store.pop_next().await.unwrap().unwrap(); + let child = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(child.status, TaskStatus::Running); store.recover_running().await.unwrap(); diff --git a/src/store/lifecycle/cancel_expire.rs b/src/store/lifecycle/cancel_expire.rs index b8ffa6f..2af391d 100644 --- 
a/src/store/lifecycle/cancel_expire.rs +++ b/src/store/lifecycle/cancel_expire.rs @@ -14,13 +14,17 @@ use super::{compute_duration_ms, insert_history, HistoryStatus}; impl TaskStore { /// Pause a running task. ORs the reason bit into `pause_reasons`. + /// Sets `paused_at_ms` if not already set (task not already paused with another reason). pub async fn pause(&self, id: i64, reason: PauseReasons) -> Result<(), StoreError> { + let now_ms = chrono::Utc::now().timestamp_millis(); sqlx::query( "UPDATE tasks SET status = 'paused', started_at = NULL, - pause_reasons = pause_reasons | ? + pause_reasons = pause_reasons | ?, + paused_at_ms = CASE WHEN paused_at_ms IS NULL THEN ? ELSE paused_at_ms END WHERE id = ?", ) .bind(reason.bits()) + .bind(now_ms) .bind(id) .execute(&self.pool) .await?; @@ -28,11 +32,16 @@ impl TaskStore { } /// Resume a paused task back to pending. Clears all pause reasons. + /// Accumulates `pause_duration_ms` from `paused_at_ms`. pub async fn resume(&self, id: i64) -> Result<(), StoreError> { + let now_ms = chrono::Utc::now().timestamp_millis(); sqlx::query( - "UPDATE tasks SET status = 'pending', pause_reasons = 0 + "UPDATE tasks SET status = 'pending', pause_reasons = 0, + pause_duration_ms = pause_duration_ms + COALESCE(? - paused_at_ms, 0), + paused_at_ms = NULL WHERE id = ? AND status = 'paused'", ) + .bind(now_ms) .bind(id) .execute(&self.pool) .await?; @@ -55,18 +64,24 @@ impl TaskStore { /// Clear the PREEMPTION bit from a paused task. If no other pause reasons /// remain, the task transitions back to pending. + /// Accumulates `pause_duration_ms` on full resume. pub async fn resume_preempted(&self, id: i64) -> Result<(), StoreError> { + let now_ms = chrono::Utc::now().timestamp_millis(); // Fully resume if PREEMPTION is the only reason. let result = sqlx::query( - "UPDATE tasks SET status = 'pending', pause_reasons = 0 + "UPDATE tasks SET status = 'pending', pause_reasons = 0, + pause_duration_ms = pause_duration_ms + COALESCE(? 
- paused_at_ms, 0), + paused_at_ms = NULL WHERE id = ? AND status = 'paused' AND pause_reasons = 1", ) + .bind(now_ms) .bind(id) .execute(&self.pool) .await?; if result.rows_affected() == 0 { // Clear PREEMPTION bit but stay paused (other reasons remain). + // Don't accumulate — task is still paused. sqlx::query( "UPDATE tasks SET pause_reasons = pause_reasons & ~1 WHERE id = ? AND status = 'paused' AND (pause_reasons & 1) != 0", @@ -80,21 +95,27 @@ impl TaskStore { /// Clear a specific pause-reason bit from all paused tasks. Tasks whose /// `pause_reasons` becomes 0 transition back to pending. + /// Accumulates `pause_duration_ms` on full resume. /// Returns the count of tasks that fully resumed. pub async fn clear_pause_bit(&self, reason: PauseReasons) -> Result { let bit = reason.bits(); + let now_ms = chrono::Utc::now().timestamp_millis(); // Fully resume tasks where this is the sole reason. let fully_resumed = sqlx::query( - "UPDATE tasks SET status = 'pending', pause_reasons = 0 + "UPDATE tasks SET status = 'pending', pause_reasons = 0, + pause_duration_ms = pause_duration_ms + COALESCE(? - paused_at_ms, 0), + paused_at_ms = NULL WHERE status = 'paused' AND pause_reasons = ?", ) + .bind(now_ms) .bind(bit) .execute(&self.pool) .await? .rows_affected(); // Clear the bit from multi-reason tasks (stays paused). + // Don't accumulate — task is still paused. sqlx::query( "UPDATE tasks SET pause_reasons = pause_reasons & ~? WHERE status = 'paused' AND (pause_reasons & ?) != 0", @@ -173,13 +194,14 @@ impl TaskStore { /// Pause tasks in a group. ORs the GROUP bit into all pending and /// already-paused tasks in the group: - /// - Pending tasks → paused with GROUP bit. + /// - Pending tasks → paused with GROUP bit, sets `paused_at_ms`. /// - Already-paused tasks → GROUP bit added (prevents premature resume - /// by other mechanisms clearing their own bit). + /// by other mechanisms clearing their own bit). `paused_at_ms` kept as-is. 
/// /// Returns the count of newly paused tasks (status changed from pending). pub async fn pause_tasks_in_group(&self, group_key: &str) -> Result { // Add GROUP bit to already-paused tasks (no status change, just adds the bit). + // Don't touch paused_at_ms — they already have one. sqlx::query( "UPDATE tasks SET pause_reasons = pause_reasons | 8 WHERE group_key = ? AND status = 'paused' AND (pause_reasons & 8) = 0", @@ -188,12 +210,15 @@ impl TaskStore { .execute(&self.pool) .await?; - // Pause pending tasks with GROUP bit. + // Pause pending tasks with GROUP bit and set paused_at_ms. + let now_ms = chrono::Utc::now().timestamp_millis(); let result = sqlx::query( "UPDATE tasks SET status = 'paused', - pause_reasons = pause_reasons | 8 + pause_reasons = pause_reasons | 8, + paused_at_ms = CASE WHEN paused_at_ms IS NULL THEN ? ELSE paused_at_ms END WHERE group_key = ? AND status = 'pending'", ) + .bind(now_ms) .bind(group_key) .execute(&self.pool) .await?; @@ -203,22 +228,28 @@ impl TaskStore { /// Resume group-paused tasks. Two-step process: /// /// 1. **Fully resume** tasks where GROUP is the sole reason (pause_reasons = 8). + /// Accumulates `pause_duration_ms` and clears `paused_at_ms`. /// 2. **Clear GROUP bit** from multi-reason tasks (they stay paused under - /// their remaining reasons). + /// their remaining reasons). `paused_at_ms` left as-is. /// /// Returns the count of tasks that fully resumed (became pending). pub async fn resume_paused_by_group(&self, group_key: &str) -> Result { + let now_ms = chrono::Utc::now().timestamp_millis(); // 1. Fully resume tasks where GROUP is the sole reason. let fully_resumed = sqlx::query( - "UPDATE tasks SET status = 'pending', pause_reasons = 0 + "UPDATE tasks SET status = 'pending', pause_reasons = 0, + pause_duration_ms = pause_duration_ms + COALESCE(? - paused_at_ms, 0), + paused_at_ms = NULL WHERE group_key = ? 
AND status = 'paused' AND pause_reasons = 8", ) + .bind(now_ms) .bind(group_key) .execute(&self.pool) .await? .rows_affected(); // 2. Clear GROUP bit from multi-reason tasks (stays paused). + // Don't accumulate — task is still paused. sqlx::query( "UPDATE tasks SET pause_reasons = pause_reasons & ~8 WHERE group_key = ? AND status = 'paused' AND (pause_reasons & 8) != 0", @@ -333,11 +364,14 @@ impl TaskStore { /// Returns the number of tasks whose status changed to paused. pub async fn pause_pending_by_type_prefix(&self, prefix: &str) -> Result { let pattern = format!("{prefix}%"); + let now_ms = chrono::Utc::now().timestamp_millis(); let result = sqlx::query( "UPDATE tasks SET status = 'paused', - pause_reasons = pause_reasons | 2 + pause_reasons = pause_reasons | 2, + paused_at_ms = CASE WHEN paused_at_ms IS NULL THEN ? ELSE paused_at_ms END WHERE task_type LIKE ? AND status IN ('pending', 'paused')", ) + .bind(now_ms) .bind(&pattern) .execute(&self.pool) .await?; @@ -346,15 +380,20 @@ impl TaskStore { /// Resume module-paused tasks by type prefix. Clears the MODULE bit. /// Tasks fully resume (become pending) only if no other pause reasons remain. + /// Accumulates `pause_duration_ms` on full resume. /// Returns the count of tasks that fully resumed. pub async fn resume_paused_by_type_prefix(&self, prefix: &str) -> Result { let pattern = format!("{prefix}%"); + let now_ms = chrono::Utc::now().timestamp_millis(); // Fully resume tasks where MODULE is the only reason. let fully_resumed = sqlx::query( - "UPDATE tasks SET status = 'pending', pause_reasons = 0 + "UPDATE tasks SET status = 'pending', pause_reasons = 0, + pause_duration_ms = pause_duration_ms + COALESCE(? - paused_at_ms, 0), + paused_at_ms = NULL WHERE task_type LIKE ? AND status = 'paused' AND pause_reasons = 2", ) + .bind(now_ms) .bind(&pattern) .execute(&self.pool) .await? 
diff --git a/src/store/lifecycle/tests.rs b/src/store/lifecycle/tests.rs index e85b26d..b5d9f66 100644 --- a/src/store/lifecycle/tests.rs +++ b/src/store/lifecycle/tests.rs @@ -36,13 +36,13 @@ async fn priority_ordering() { store.submit(&rt).await.unwrap(); store.submit(&normal).await.unwrap(); - let first = store.pop_next().await.unwrap().unwrap(); + let first = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(first.key, rt_key); - let second = store.pop_next().await.unwrap().unwrap(); + let second = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(second.key, normal_key); - let third = store.pop_next().await.unwrap().unwrap(); + let third = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(third.key, bg_key); } @@ -52,7 +52,7 @@ async fn complete_moves_to_history() { let sub = make_submission("done", Priority::NORMAL); let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .complete(task.id, &IoBudget::disk(2000, 1000)) @@ -73,7 +73,7 @@ async fn fail_retryable_requeues() { let sub = make_submission("retry-me", Priority::HIGH); let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .fail( @@ -99,7 +99,7 @@ async fn fail_exhausted_retries_moves_to_history() { let sub = make_submission("permanent", Priority::NORMAL); let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .fail( @@ -112,7 +112,7 @@ async fn fail_exhausted_retries_moves_to_history() { ) .await .unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(task.retry_count, 1); store .fail( @@ -140,7 +140,7 @@ async fn 
pause_and_resume() { .submit(&make_submission("pausable", Priority::NORMAL)) .await .unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .pause(task.id, PauseReasons::PREEMPTION) @@ -163,7 +163,7 @@ async fn pause_reasons_accumulate_across_sources() { let store = test_store().await; let sub = TaskSubmission::new("mod1.work").key("multi-pause"); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); // Preemption pause. store @@ -200,7 +200,7 @@ async fn preemption_paused_tasks_filters_by_bit() { // Task A: preemption-paused. let sub_a = TaskSubmission::new("test").key("a"); store.submit(&sub_a).await.unwrap(); - let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); store.pause(a.id, PauseReasons::PREEMPTION).await.unwrap(); // Task B: module-paused only. @@ -222,13 +222,13 @@ async fn clear_pause_bit_resumes_sole_reason_tasks() { // Task with only GLOBAL reason. let sub_a = TaskSubmission::new("test").key("global-only"); store.submit(&sub_a).await.unwrap(); - let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); store.pause(a.id, PauseReasons::GLOBAL).await.unwrap(); // Task with GLOBAL + PREEMPTION reasons. 
let sub_b = TaskSubmission::new("test").key("multi"); store.submit(&sub_b).await.unwrap(); - let b = store.pop_next().await.unwrap().unwrap(); + let b = store.pop_next(None).await.unwrap().unwrap(); store.pause(b.id, PauseReasons::PREEMPTION).await.unwrap(); store.pause(b.id, PauseReasons::GLOBAL).await.unwrap(); @@ -293,8 +293,8 @@ async fn running_io_totals() { .expected_io(IoBudget::disk(3000, 1000)); store.submit(&sub2).await.unwrap(); - store.pop_next().await.unwrap(); - store.pop_next().await.unwrap(); + store.pop_next(None).await.unwrap(); + store.pop_next(None).await.unwrap(); let (read, write) = store.running_io_totals().await.unwrap(); assert_eq!(read, 8000); @@ -306,7 +306,7 @@ async fn key_freed_after_completion() { let store = test_store().await; let sub = make_submission("reuse", Priority::NORMAL); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store.complete(task.id, &IoBudget::default()).await.unwrap(); let outcome = store.submit(&sub).await.unwrap(); @@ -319,7 +319,7 @@ async fn requeue_running_task() { let sub = make_submission("rq", Priority::NORMAL); let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(task.status, TaskStatus::Running); store.requeue(task.id).await.unwrap(); @@ -335,7 +335,7 @@ async fn peek_next_does_not_modify_status() { let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let peeked = store.peek_next().await.unwrap().unwrap(); + let peeked = store.peek_next(None).await.unwrap().unwrap(); assert_eq!(peeked.key, key); assert_eq!(peeked.status, TaskStatus::Pending); @@ -343,14 +343,14 @@ async fn peek_next_does_not_modify_status() { assert_eq!(t.status, TaskStatus::Pending); assert!(t.started_at.is_none()); - let peeked2 = store.peek_next().await.unwrap().unwrap(); + let peeked2 
= store.peek_next(None).await.unwrap().unwrap(); assert_eq!(peeked2.id, peeked.id); } #[tokio::test] async fn peek_next_empty_queue() { let store = test_store().await; - assert!(store.peek_next().await.unwrap().is_none()); + assert!(store.peek_next(None).await.unwrap().is_none()); } #[tokio::test] @@ -372,7 +372,7 @@ async fn pop_by_id_returns_none_if_already_running() { let sub = make_submission("already-taken", Priority::NORMAL); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert!(store.pop_by_id(task.id).await.unwrap().is_none()); } @@ -390,12 +390,12 @@ async fn peek_then_pop_by_id_workflow() { let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let peeked = store.peek_next().await.unwrap().unwrap(); + let peeked = store.peek_next(None).await.unwrap().unwrap(); let claimed = store.pop_by_id(peeked.id).await.unwrap().unwrap(); assert_eq!(claimed.key, key); assert_eq!(claimed.status, TaskStatus::Running); - assert!(store.peek_next().await.unwrap().is_none()); + assert!(store.peek_next(None).await.unwrap().is_none()); } // ── Tag lifecycle tests ─────────────────────────────────────────── @@ -409,7 +409,7 @@ async fn tags_copied_to_history_on_complete() { .tag("owner", "alice"); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store.complete(task.id, &IoBudget::default()).await.unwrap(); let mut hist = store.history_by_key(&sub.effective_key()).await.unwrap(); @@ -427,7 +427,7 @@ async fn tags_copied_to_history_on_fail() { .tag("region", "us-west"); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .fail( task.id, @@ -499,7 +499,7 @@ async fn tags_preserved_on_recurring_requeue() { .recurring(Duration::from_secs(3600)); 
store.submit(&sub).await.unwrap(); - let mut task = store.pop_next().await.unwrap().unwrap(); + let mut task = store.pop_next(None).await.unwrap().unwrap(); store .populate_tags(std::slice::from_mut(&mut task)) .await @@ -525,7 +525,7 @@ async fn tags_in_pop_next() { .tag("color", "blue"); store.submit(&sub).await.unwrap(); - let mut task = store.pop_next().await.unwrap().unwrap(); + let mut task = store.pop_next(None).await.unwrap().unwrap(); store .populate_tags(std::slice::from_mut(&mut task)) .await @@ -565,7 +565,7 @@ async fn max_retries_preserved_in_history_on_complete() { .max_retries(7); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(task.max_retries, Some(7)); store.complete(task.id, &IoBudget::default()).await.unwrap(); @@ -584,7 +584,7 @@ async fn max_retries_preserved_in_history_on_fail() { .max_retries(3); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); // Permanent failure (non-retryable). store @@ -612,7 +612,7 @@ async fn max_retries_null_reads_back_as_none() { // Submit without max_retries (NULL in DB). let sub = TaskSubmission::new("test").key("mr-null"); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(task.max_retries, None); // Complete it and verify history also has None. 
@@ -633,7 +633,7 @@ async fn backoff_constant_sets_run_after() { let sub = make_submission("const-backoff", Priority::NORMAL); let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); let strategy = BackoffStrategy::Constant { delay: Duration::from_secs(60), @@ -683,7 +683,7 @@ async fn backoff_exponential_increases_across_retries() { }; // First failure (retry_count=0): delay = 10s - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(task.retry_count, 0); store .fail( @@ -715,7 +715,7 @@ async fn backoff_exponential_increases_across_retries() { .unwrap(); // Second failure (retry_count=1): delay = 20s - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(task.retry_count, 1); store .fail( @@ -749,7 +749,7 @@ async fn executor_retry_after_overrides_strategy() { let sub = make_submission("override-backoff", Priority::NORMAL); let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); // Strategy says 10s, but executor override says 120s. let strategy = BackoffStrategy::Constant { @@ -786,7 +786,7 @@ async fn no_backoff_requeues_immediately() { let sub = make_submission("no-backoff", Priority::NORMAL); let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); // No strategy, no executor override → immediate retry. 
store @@ -820,7 +820,7 @@ async fn permanent_error_skips_retry_moves_to_history() { let sub = make_submission("permanent-err", Priority::NORMAL); let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); // Even with a backoff strategy, non-retryable errors go straight to history. let strategy = BackoffStrategy::Constant { @@ -861,7 +861,7 @@ async fn exhausted_retries_produce_dead_letter_status() { store.submit(&sub).await.unwrap(); // First failure: retry_count=0, max_retries=1 → requeue. - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(task.retry_count, 0); store .fail( @@ -876,7 +876,7 @@ async fn exhausted_retries_produce_dead_letter_status() { .unwrap(); // Second failure: retry_count=1, max_retries=1 → exhausted → dead_letter. - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(task.retry_count, 1); store .fail( @@ -907,7 +907,7 @@ async fn non_retryable_error_still_produces_failed_status() { let sub = make_submission("dl-permanent", Priority::NORMAL); let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); // Non-retryable error with remaining retries → should be "failed", not "dead_letter". store @@ -940,7 +940,7 @@ async fn dead_letter_tasks_query_returns_only_dead_lettered() { // Create a dead-lettered task (retryable, exhausted). 
let sub_dl = make_submission("dl-query-dl", Priority::NORMAL); store.submit(&sub_dl).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .fail( task.id, @@ -956,7 +956,7 @@ async fn dead_letter_tasks_query_returns_only_dead_lettered() { // Create a failed task (non-retryable). let sub_fail = make_submission("dl-query-fail", Priority::NORMAL); store.submit(&sub_fail).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .fail( task.id, @@ -972,7 +972,7 @@ async fn dead_letter_tasks_query_returns_only_dead_lettered() { // Create a completed task. let sub_ok = make_submission("dl-query-ok", Priority::NORMAL); store.submit(&sub_ok).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store.complete(task.id, &IoBudget::default()).await.unwrap(); // dead_letter_tasks should return only the dead-lettered one. @@ -1077,7 +1077,7 @@ async fn pause_tasks_in_group_adds_bit_to_already_paused() { let store = test_store().await; let sub = TaskSubmission::new("test").key("t1").group("g1"); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); // Preemption pause first. store @@ -1115,7 +1115,7 @@ async fn resume_paused_by_group_clears_bit_but_stays_paused_with_other_reasons() let store = test_store().await; let sub = TaskSubmission::new("test").key("t1").group("g1"); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); // Preemption pause + group pause. 
store @@ -1180,3 +1180,302 @@ async fn pending_and_paused_count_for_group() { assert_eq!(store.pending_count_for_group("g2").await.unwrap(), 1); assert_eq!(store.paused_count_for_group("g2").await.unwrap(), 0); } + +// ── Priority Aging (Phase 1) ───────────────────────────────────── + +use crate::scheduler::aging::{AgingConfig, AgingParams}; + +#[tokio::test] +async fn peek_next_with_aging_promotes_old_task() { + let store = test_store().await; + + // Submit an IDLE task with a very old created_at (via raw SQL). + let old_ms = chrono::Utc::now().timestamp_millis() - 600_000; // 10 min ago + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at) VALUES ('test', 'old', 'old', ?, 'pending', ?)", + ) + .bind(Priority::IDLE.value() as i32) + .bind(old_ms) + .execute(store.pool()) + .await + .unwrap(); + + // Submit a fresh NORMAL task. + let now_ms = chrono::Utc::now().timestamp_millis(); + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at) VALUES ('test', 'new', 'new', ?, 'pending', ?)", + ) + .bind(Priority::NORMAL.value() as i32) + .bind(now_ms) + .execute(store.pool()) + .await + .unwrap(); + + // Without aging: NORMAL (128) pops before IDLE (255). + let peeked = store.peek_next(None).await.unwrap().unwrap(); + assert_eq!(peeked.label, "new"); + + // With aging (grace=0, interval=1s, max=HIGH(64)): IDLE aged to 64 < NORMAL(128). 
+ let config = AgingConfig { + grace_period: Duration::from_secs(0), + aging_interval: Duration::from_secs(1), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + }; + let aging = AgingParams::from_config(&config); + let peeked = store.peek_next(Some(&aging)).await.unwrap().unwrap(); + assert_eq!(peeked.label, "old"); +} + +#[tokio::test] +async fn pop_next_with_aging_selects_aged() { + let store = test_store().await; + + let old_ms = chrono::Utc::now().timestamp_millis() - 600_000; + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at) VALUES ('test', 'old', 'old', ?, 'pending', ?)", + ) + .bind(Priority::IDLE.value() as i32) + .bind(old_ms) + .execute(store.pool()) + .await + .unwrap(); + + let now_ms = chrono::Utc::now().timestamp_millis(); + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at) VALUES ('test', 'new', 'new', ?, 'pending', ?)", + ) + .bind(Priority::NORMAL.value() as i32) + .bind(now_ms) + .execute(store.pool()) + .await + .unwrap(); + + let config = AgingConfig { + grace_period: Duration::from_secs(0), + aging_interval: Duration::from_secs(1), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + }; + let aging = AgingParams::from_config(&config); + let popped = store.pop_next(Some(&aging)).await.unwrap().unwrap(); + assert_eq!(popped.label, "old"); + assert_eq!(popped.status, TaskStatus::Running); +} + +#[tokio::test] +async fn pop_next_batch_respects_aging_order() { + let store = test_store().await; + + let old_ms = chrono::Utc::now().timestamp_millis() - 600_000; + let now_ms = chrono::Utc::now().timestamp_millis(); + + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at) VALUES ('test', 'old', 'old', ?, 'pending', ?)", + ) + .bind(Priority::IDLE.value() as i32) + .bind(old_ms) + .execute(store.pool()) + .await + .unwrap(); + + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, 
created_at) VALUES ('test', 'new', 'new', ?, 'pending', ?)", + ) + .bind(Priority::NORMAL.value() as i32) + .bind(now_ms) + .execute(store.pool()) + .await + .unwrap(); + + let config = AgingConfig { + grace_period: Duration::from_secs(0), + aging_interval: Duration::from_secs(1), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + }; + let aging = AgingParams::from_config(&config); + // Batch pop selects both; verify both are claimed. + let batch = store.pop_next_batch(10, Some(&aging)).await.unwrap(); + assert_eq!(batch.len(), 2); + let labels: Vec<&str> = batch.iter().map(|t| t.label.as_str()).collect(); + assert!(labels.contains(&"old")); + assert!(labels.contains(&"new")); +} + +#[tokio::test] +async fn aging_disabled_preserves_original_order() { + let store = test_store().await; + + let old_ms = chrono::Utc::now().timestamp_millis() - 600_000; + let now_ms = chrono::Utc::now().timestamp_millis(); + + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at) VALUES ('test', 'old', 'old', ?, 'pending', ?)", + ) + .bind(Priority::IDLE.value() as i32) + .bind(old_ms) + .execute(store.pool()) + .await + .unwrap(); + + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at) VALUES ('test', 'new', 'new', ?, 'pending', ?)", + ) + .bind(Priority::NORMAL.value() as i32) + .bind(now_ms) + .execute(store.pool()) + .await + .unwrap(); + + // Without aging, NORMAL (128) peeks before IDLE (255). + let peeked = store.peek_next(None).await.unwrap().unwrap(); + assert_eq!(peeked.label, "new"); +} + +#[tokio::test] +async fn pause_duration_freezes_aging_clock() { + let store = test_store().await; + + let config = AgingConfig { + grace_period: Duration::from_secs(0), + aging_interval: Duration::from_secs(60), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + }; + + // Two tasks created 600s ago. 
One has 500s of pause_duration (effective age = 100s), + // the other has 0s (effective age = 600s). The unpaused one should age more. + let old_ms = chrono::Utc::now().timestamp_millis() - 600_000; + + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at, pause_duration_ms) + VALUES ('test', 'paused', 'paused', ?, 'pending', ?, 500000)", + ) + .bind(Priority::IDLE.value() as i32) + .bind(old_ms) + .execute(store.pool()) + .await + .unwrap(); + + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at, pause_duration_ms) + VALUES ('test', 'unpaused', 'unpaused', ?, 'pending', ?, 0)", + ) + .bind(Priority::IDLE.value() as i32) + .bind(old_ms) + .execute(store.pool()) + .await + .unwrap(); + + // The unpaused one aged more (10 promotions) vs paused (1 promotion), + // so it has lower effective priority value → higher priority → dispatched first. + let aging = AgingParams::from_config(&config); + let popped = store.pop_next(Some(&aging)).await.unwrap().unwrap(); + assert_eq!(popped.label, "unpaused"); +} + +#[tokio::test] +async fn pause_sets_paused_at_ms() { + let store = test_store().await; + store + .submit(&make_submission("t1", Priority::NORMAL)) + .await + .unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); + + // Initially no paused_at_ms. + assert!(task.paused_at_ms.is_none()); + + // Pause sets it. + store + .pause(task.id, PauseReasons::PREEMPTION) + .await + .unwrap(); + let t = store.task_by_id(task.id).await.unwrap().unwrap(); + assert!(t.paused_at_ms.is_some()); + + // Adding another pause reason doesn't overwrite it. 
+ let original = t.paused_at_ms.unwrap(); + store.pause(task.id, PauseReasons::GROUP).await.unwrap(); + let t2 = store.task_by_id(task.id).await.unwrap().unwrap(); + assert_eq!(t2.paused_at_ms.unwrap(), original); +} + +#[tokio::test] +async fn resume_accumulates_pause_duration() { + let store = test_store().await; + store + .submit(&make_submission("t1", Priority::NORMAL)) + .await + .unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); + + store + .pause(task.id, PauseReasons::PREEMPTION) + .await + .unwrap(); + tokio::time::sleep(Duration::from_millis(50)).await; + store.resume(task.id).await.unwrap(); + + let t = store.task_by_id(task.id).await.unwrap().unwrap(); + assert!( + t.pause_duration_ms >= 40, + "pause_duration_ms={}", + t.pause_duration_ms + ); + assert!(t.paused_at_ms.is_none()); +} + +#[tokio::test] +async fn bulk_pause_resume_group_accumulates() { + let store = test_store().await; + let sub = TaskSubmission::new("test").key("t1").group("g1"); + store.submit(&sub).await.unwrap(); + + store.pause_group_state("g1", None).await.unwrap(); + store.pause_tasks_in_group("g1").await.unwrap(); + + // Verify paused_at_ms is set. + let t = store.task_by_id(1).await.unwrap().unwrap(); + assert!(t.paused_at_ms.is_some()); + + tokio::time::sleep(Duration::from_millis(50)).await; + store.resume_paused_by_group("g1").await.unwrap(); + + let t = store.task_by_id(1).await.unwrap().unwrap(); + assert!( + t.pause_duration_ms >= 40, + "pause_duration_ms={}", + t.pause_duration_ms + ); + assert!(t.paused_at_ms.is_none()); +} + +#[tokio::test] +async fn crash_recovery_accumulates_stale_pause() { + let store = test_store().await; + store + .submit(&make_submission("t1", Priority::NORMAL)) + .await + .unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); + + store + .pause(task.id, PauseReasons::PREEMPTION) + .await + .unwrap(); + tokio::time::sleep(Duration::from_millis(50)).await; + + // Simulate crash recovery. 
+ store.recover_running().await.unwrap(); + + // paused_at_ms should be cleared and pause_duration_ms accumulated. + let t = store.task_by_id(task.id).await.unwrap().unwrap(); + assert!(t.paused_at_ms.is_none()); + assert!( + t.pause_duration_ms >= 40, + "pause_duration_ms={}", + t.pause_duration_ms + ); +} diff --git a/src/store/lifecycle/transitions.rs b/src/store/lifecycle/transitions.rs index 541bbc9..6d76fb8 100644 --- a/src/store/lifecycle/transitions.rs +++ b/src/store/lifecycle/transitions.rs @@ -14,6 +14,7 @@ //! The `waiting` transition lives in `hierarchy.rs`, and pause/resume/cancel/expire //! live in `cancel_expire.rs`. +use crate::scheduler::aging::AgingParams; use crate::store::row_mapping::row_to_task_record; use crate::store::{StoreError, TaskStore}; use crate::task::{BackoffStrategy, IoBudget, TaskRecord}; @@ -26,21 +27,56 @@ impl TaskStore { /// Peek at the highest-priority pending task without modifying it. /// Returns `None` if the queue is empty. Tasks with a future `run_after` /// timestamp are excluded (not yet eligible for dispatch). - pub async fn peek_next(&self) -> Result, StoreError> { + /// + /// When `aging` is `Some`, the ORDER BY uses the aging formula to compute + /// effective priority at dispatch time. When `None`, the original + /// index-ordered query is used (zero overhead). + pub async fn peek_next( + &self, + aging: Option<&AgingParams>, + ) -> Result, StoreError> { let now_ms = chrono::Utc::now().timestamp_millis(); - let row = sqlx::query( - "SELECT * FROM tasks - WHERE id = ( - SELECT id FROM tasks - WHERE status = 'pending' - AND (run_after IS NULL OR run_after <= ?) - ORDER BY priority ASC, id ASC - LIMIT 1 - )", - ) - .bind(now_ms) - .fetch_optional(&self.pool) - .await?; + let row = match aging { + None => { + sqlx::query( + "SELECT * FROM tasks + WHERE id = ( + SELECT id FROM tasks + WHERE status = 'pending' + AND (run_after IS NULL OR run_after <= ?) 
+ ORDER BY priority ASC, id ASC + LIMIT 1 + )", + ) + .bind(now_ms) + .fetch_optional(&self.pool) + .await? + } + Some(ap) => { + sqlx::query( + "SELECT * FROM tasks + WHERE id = ( + SELECT id FROM tasks + WHERE status = 'pending' + AND (run_after IS NULL OR run_after <= ?) + ORDER BY + MAX( + priority - MAX(0, (? - created_at - pause_duration_ms - ?) / ?), + ? + ) ASC, + id ASC + LIMIT 1 + )", + ) + .bind(now_ms) + .bind(ap.now_ms) + .bind(ap.grace_period_ms) + .bind(ap.aging_interval_ms) + .bind(ap.max_effective_priority) + .fetch_optional(&self.pool) + .await? + } + }; Ok(row.as_ref().map(row_to_task_record)) } @@ -115,41 +151,88 @@ impl TaskStore { /// them as running. Returns an empty vec if no work is available. This /// avoids the N sequential `pop_next()` round-trips when filling /// concurrency slots. - pub async fn pop_next_batch(&self, limit: usize) -> Result, StoreError> { + /// + /// When `aging` is `Some`, the ORDER BY uses the aging formula. + pub async fn pop_next_batch( + &self, + limit: usize, + aging: Option<&AgingParams>, + ) -> Result, StoreError> { if limit == 0 { return Ok(Vec::new()); } if limit == 1 { - return Ok(self.pop_next().await?.into_iter().collect()); + return Ok(self.pop_next(aging).await?.into_iter().collect()); } let now_ms = chrono::Utc::now().timestamp_millis(); - let rows = sqlx::query( - "UPDATE tasks SET - status = 'running', - started_at = ?, - expires_at = CASE - WHEN ttl_from = 'first_attempt' AND ttl_seconds IS NOT NULL AND expires_at IS NULL - THEN ? + (ttl_seconds * 1000) - ELSE expires_at - END - WHERE id IN ( - SELECT id FROM tasks - WHERE status = 'pending' - AND (run_after IS NULL OR run_after <= ?) - AND (expires_at IS NULL OR expires_at > ?) - ORDER BY priority ASC, id ASC - LIMIT ? 
- ) - RETURNING *", - ) - .bind(now_ms) - .bind(now_ms) - .bind(now_ms) - .bind(now_ms) - .bind(limit as i64) - .fetch_all(&self.pool) - .await?; + let rows = match aging { + None => { + sqlx::query( + "UPDATE tasks SET + status = 'running', + started_at = ?, + expires_at = CASE + WHEN ttl_from = 'first_attempt' AND ttl_seconds IS NOT NULL AND expires_at IS NULL + THEN ? + (ttl_seconds * 1000) + ELSE expires_at + END + WHERE id IN ( + SELECT id FROM tasks + WHERE status = 'pending' + AND (run_after IS NULL OR run_after <= ?) + AND (expires_at IS NULL OR expires_at > ?) + ORDER BY priority ASC, id ASC + LIMIT ? + ) + RETURNING *", + ) + .bind(now_ms) + .bind(now_ms) + .bind(now_ms) + .bind(now_ms) + .bind(limit as i64) + .fetch_all(&self.pool) + .await? + } + Some(ap) => { + sqlx::query( + "UPDATE tasks SET + status = 'running', + started_at = ?, + expires_at = CASE + WHEN ttl_from = 'first_attempt' AND ttl_seconds IS NOT NULL AND expires_at IS NULL + THEN ? + (ttl_seconds * 1000) + ELSE expires_at + END + WHERE id IN ( + SELECT id FROM tasks + WHERE status = 'pending' + AND (run_after IS NULL OR run_after <= ?) + AND (expires_at IS NULL OR expires_at > ?) + ORDER BY + MAX( + priority - MAX(0, (? - created_at - pause_duration_ms - ?) / ?), + ? + ) ASC, + id ASC + LIMIT ? + ) + RETURNING *", + ) + .bind(now_ms) + .bind(now_ms) + .bind(now_ms) + .bind(now_ms) + .bind(ap.now_ms) + .bind(ap.grace_period_ms) + .bind(ap.aging_interval_ms) + .bind(ap.max_effective_priority) + .bind(limit as i64) + .fetch_all(&self.pool) + .await? + } + }; let records: Vec = rows.iter().map(row_to_task_record).collect(); if !records.is_empty() { @@ -163,39 +246,84 @@ impl TaskStore { /// Returns `None` if the queue is empty. Tasks with a future `run_after` /// timestamp are excluded. /// + /// When `aging` is `Some`, the ORDER BY uses the aging formula. + /// /// For tasks with `ttl_from = 'first_attempt'`, sets `expires_at` on /// the first pop. 
/// /// Tags are **not** populated — callers needing tags should call /// [`populate_tags`](Self::populate_tags) explicitly or use /// [`task_by_id`](Self::task_by_id). - pub async fn pop_next(&self) -> Result, StoreError> { + pub async fn pop_next( + &self, + aging: Option<&AgingParams>, + ) -> Result, StoreError> { let now_ms = chrono::Utc::now().timestamp_millis(); - let row = sqlx::query( - "UPDATE tasks SET - status = 'running', - started_at = ?, - expires_at = CASE - WHEN ttl_from = 'first_attempt' AND ttl_seconds IS NOT NULL AND expires_at IS NULL - THEN ? + (ttl_seconds * 1000) - ELSE expires_at - END - WHERE id = ( - SELECT id FROM tasks - WHERE status = 'pending' - AND (run_after IS NULL OR run_after <= ?) - AND (expires_at IS NULL OR expires_at > ?) - ORDER BY priority ASC, id ASC - LIMIT 1 - ) - RETURNING *", - ) - .bind(now_ms) - .bind(now_ms) - .bind(now_ms) - .bind(now_ms) - .fetch_optional(&self.pool) - .await?; + let row = match aging { + None => { + sqlx::query( + "UPDATE tasks SET + status = 'running', + started_at = ?, + expires_at = CASE + WHEN ttl_from = 'first_attempt' AND ttl_seconds IS NOT NULL AND expires_at IS NULL + THEN ? + (ttl_seconds * 1000) + ELSE expires_at + END + WHERE id = ( + SELECT id FROM tasks + WHERE status = 'pending' + AND (run_after IS NULL OR run_after <= ?) + AND (expires_at IS NULL OR expires_at > ?) + ORDER BY priority ASC, id ASC + LIMIT 1 + ) + RETURNING *", + ) + .bind(now_ms) + .bind(now_ms) + .bind(now_ms) + .bind(now_ms) + .fetch_optional(&self.pool) + .await? + } + Some(ap) => { + sqlx::query( + "UPDATE tasks SET + status = 'running', + started_at = ?, + expires_at = CASE + WHEN ttl_from = 'first_attempt' AND ttl_seconds IS NOT NULL AND expires_at IS NULL + THEN ? + (ttl_seconds * 1000) + ELSE expires_at + END + WHERE id = ( + SELECT id FROM tasks + WHERE status = 'pending' + AND (run_after IS NULL OR run_after <= ?) + AND (expires_at IS NULL OR expires_at > ?) + ORDER BY + MAX( + priority - MAX(0, (? 
- created_at - pause_duration_ms - ?) / ?), + ? + ) ASC, + id ASC + LIMIT 1 + ) + RETURNING *", + ) + .bind(now_ms) + .bind(now_ms) + .bind(now_ms) + .bind(now_ms) + .bind(ap.now_ms) + .bind(ap.grace_period_ms) + .bind(ap.aging_interval_ms) + .bind(ap.max_effective_priority) + .fetch_optional(&self.pool) + .await? + } + }; let record = row.map(|r| row_to_task_record(&r)); if record.is_some() { @@ -452,13 +580,16 @@ impl TaskStore { all_unblocked = unblocked.into_iter().map(|(id,)| id).collect(); // Downgrade any newly-unblocked tasks whose group is paused. + let pause_now_ms = chrono::Utc::now().timestamp_millis(); for task_id in &all_unblocked { sqlx::query( "UPDATE tasks SET status = 'paused', - pause_reasons = pause_reasons | 8 + pause_reasons = pause_reasons | 8, + paused_at_ms = CASE WHEN paused_at_ms IS NULL THEN ? ELSE paused_at_ms END WHERE id = ? AND status = 'pending' AND group_key IN (SELECT group_key FROM paused_groups)", ) + .bind(pause_now_ms) .bind(task_id) .execute(&mut *conn) .await?; @@ -647,11 +778,14 @@ impl TaskStore { .await?; if is_paused.is_some() { + let pause_ts = chrono::Utc::now().timestamp_millis(); sqlx::query( "UPDATE tasks SET status = 'paused', - pause_reasons = pause_reasons | 8 + pause_reasons = pause_reasons | 8, + paused_at_ms = CASE WHEN paused_at_ms IS NULL THEN ? ELSE paused_at_ms END WHERE id = ?", ) + .bind(pause_ts) .bind(next_id) .execute(&mut **conn) .await?; diff --git a/src/store/mod.rs b/src/store/mod.rs index 31b4bdb..54e94b3 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -162,7 +162,7 @@ impl Default for StoreConfig { /// assert!(outcome.is_inserted()); /// /// // Pop the highest-priority task and mark it running. -/// let task = store.pop_next().await?.unwrap(); +/// let task = store.pop_next(None).await?.unwrap(); /// assert_eq!(task.status, TaskStatus::Running); /// /// // Complete it — moves to history. 
@@ -304,6 +304,26 @@ impl TaskStore { tracing::info!(count, "recovered interrupted tasks back to pending"); } + // Accumulate stale pause duration for tasks that were paused when + // the scheduler crashed. Over-promotes slightly (aging clock ran + // during the crash window), which is acceptable for anti-starvation. + let now_ms = chrono::Utc::now().timestamp_millis(); + let pause_recovered = sqlx::query( + "UPDATE tasks SET + pause_duration_ms = pause_duration_ms + (? - paused_at_ms), + paused_at_ms = NULL + WHERE status = 'paused' AND paused_at_ms IS NOT NULL", + ) + .bind(now_ms) + .execute(&self.pool) + .await?; + if pause_recovered.rows_affected() > 0 { + tracing::info!( + count = pause_recovered.rows_affected(), + "accumulated stale pause duration on recovery" + ); + } + // Clean up stale dependency edges pointing to tasks that no longer // exist (e.g. crashed mid-cancellation). Then unblock any tasks with // zero remaining edges. diff --git a/src/store/query/tests.rs b/src/store/query/tests.rs index 9481330..c3b8c49 100644 --- a/src/store/query/tests.rs +++ b/src/store/query/tests.rs @@ -33,7 +33,7 @@ async fn history_by_id_lookup() { let store = test_store().await; let sub = make_submission("hist-id", Priority::NORMAL); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .complete(task.id, &IoBudget::disk(100, 50)) @@ -58,7 +58,7 @@ async fn history_stats_computation() { for i in 0..3 { let sub = make_submission(&format!("stat-{i}"), Priority::NORMAL); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .complete(task.id, &IoBudget::disk(1000, 500)) .await @@ -101,7 +101,7 @@ async fn task_lookup_active() { let result = store.task_lookup(&key).await.unwrap(); assert!(matches!(result, TaskLookup::Active(ref r) if r.status == TaskStatus::Pending)); - 
store.pop_next().await.unwrap(); + store.pop_next(None).await.unwrap(); let result = store.task_lookup(&key).await.unwrap(); assert!(matches!(result, TaskLookup::Active(ref r) if r.status == TaskStatus::Running)); } @@ -112,7 +112,7 @@ async fn task_lookup_history() { let sub = make_submission("lookup-hist", Priority::NORMAL); let key = sub.effective_key(); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store.complete(task.id, &IoBudget::default()).await.unwrap(); let result = store.task_lookup(&key).await.unwrap(); @@ -134,7 +134,7 @@ async fn prune_by_count() { for i in 0..5 { let sub = make_submission(&format!("prune-{i}"), Priority::NORMAL); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store.complete(task.id, &IoBudget::default()).await.unwrap(); } diff --git a/src/store/row_mapping.rs b/src/store/row_mapping.rs index bbcfdad..12530ee 100644 --- a/src/store/row_mapping.rs +++ b/src/store/row_mapping.rs @@ -77,6 +77,10 @@ pub(crate) fn row_to_task_record(row: &sqlx::sqlite::SqliteRow) -> TaskRecord { max_retries: row.get("max_retries"), memo: row.get("memo"), pause_reasons: PauseReasons::from_bits(row.try_get::("pause_reasons").unwrap_or(0)), + pause_duration_ms: row.try_get::("pause_duration_ms").unwrap_or(0), + paused_at_ms: row + .try_get::, _>("paused_at_ms") + .unwrap_or(None), } } diff --git a/src/store/submit/mod.rs b/src/store/submit/mod.rs index 032899c..cd58d29 100644 --- a/src/store/submit/mod.rs +++ b/src/store/submit/mod.rs @@ -180,11 +180,14 @@ pub(crate) async fn submit_one( }; if is_group_paused { + let pause_now_ms = chrono::Utc::now().timestamp_millis(); sqlx::query( "UPDATE tasks SET status = 'paused', - pause_reasons = pause_reasons | 8 + pause_reasons = pause_reasons | 8, + paused_at_ms = CASE WHEN paused_at_ms IS NULL THEN ? 
ELSE paused_at_ms END WHERE id = ? AND status = 'pending'", ) + .bind(pause_now_ms) .bind(task_id) .execute(&mut **conn) .await?; diff --git a/src/store/submit/tests.rs b/src/store/submit/tests.rs index 6298143..6060204 100644 --- a/src/store/submit/tests.rs +++ b/src/store/submit/tests.rs @@ -25,7 +25,7 @@ async fn submit_and_pop() { let outcome = store.submit(&sub).await.unwrap(); assert!(outcome.is_inserted()); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(task.key, expected_key); assert_eq!(task.status, TaskStatus::Running); assert!(task.started_at.is_some()); @@ -73,7 +73,7 @@ async fn dedup_requeues_when_running() { let sub = make_submission("running-task", Priority::NORMAL); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); let sub_high = make_submission("running-task", Priority::HIGH); let outcome = store.submit(&sub_high).await.unwrap(); @@ -92,7 +92,7 @@ async fn dedup_requeues_when_running() { assert!(!requeued.requeue); assert_eq!(requeued.requeue_priority, None); - let popped = store.pop_next().await.unwrap().unwrap(); + let popped = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(popped.id, task.id); } @@ -102,7 +102,7 @@ async fn dedup_requeue_already_requeued_same_priority() { let sub = make_submission("rq-dup", Priority::NORMAL); store.submit(&sub).await.unwrap(); - store.pop_next().await.unwrap(); + store.pop_next(None).await.unwrap(); let sub_high = make_submission("rq-dup", Priority::HIGH); let outcome = store.submit(&sub_high).await.unwrap(); @@ -118,7 +118,7 @@ async fn dedup_requeue_upgrades_priority() { let sub = make_submission("rq-upgrade", Priority::BACKGROUND); store.submit(&sub).await.unwrap(); - store.pop_next().await.unwrap(); + store.pop_next(None).await.unwrap(); let sub_normal = make_submission("rq-upgrade", Priority::NORMAL); let outcome = 
store.submit(&sub_normal).await.unwrap(); @@ -139,7 +139,7 @@ async fn permanent_failure_drops_requeue() { let sub = make_submission("fail-rq", Priority::NORMAL); store.submit(&sub).await.unwrap(); - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); let sub_high = make_submission("fail-rq", Priority::HIGH); store.submit(&sub_high).await.unwrap(); diff --git a/src/task/mod.rs b/src/task/mod.rs index a03ee76..6f3c83b 100644 --- a/src/task/mod.rs +++ b/src/task/mod.rs @@ -282,6 +282,12 @@ pub struct TaskRecord { pub memo: Option>, /// Bitmask of active pause reasons. 0 when the task is not paused. pub pause_reasons: PauseReasons, + /// Accumulated milliseconds spent in `paused` state. Excluded from + /// the aging formula to freeze the aging clock while paused. + pub pause_duration_ms: i64, + /// Epoch-ms timestamp of the most recent pause transition. `None` + /// when the task is not paused. + pub paused_at_ms: Option, } impl TaskRecord { @@ -302,8 +308,39 @@ impl TaskRecord { } } + /// Compute effective priority with aging applied. + /// + /// Returns `self.priority` when `config` is `None` (aging disabled) + /// or the task hasn't aged past the grace period. + pub fn effective_priority( + &self, + config: Option<&crate::scheduler::aging::AgingConfig>, + ) -> crate::priority::Priority { + let Some(config) = config else { + return self.priority; + }; + crate::scheduler::aging::effective_priority( + self.priority, + self.created_at.timestamp_millis(), + self.pause_duration_ms, + config, + ) + } + /// Build a [`TaskEventHeader`](crate::scheduler::event::TaskEventHeader) from this record. + /// + /// When `aging_config` is provided, the header's `effective_priority` reflects + /// the aging computation. Otherwise, `effective_priority == base_priority`. 
pub fn event_header(&self) -> crate::scheduler::event::TaskEventHeader { + self.event_header_with_aging(None) + } + + /// Build a [`TaskEventHeader`] with aging-aware effective priority. + pub fn event_header_with_aging( + &self, + aging_config: Option<&crate::scheduler::aging::AgingConfig>, + ) -> crate::scheduler::event::TaskEventHeader { + let effective = self.effective_priority(aging_config); crate::scheduler::event::TaskEventHeader { task_id: self.id, module: self @@ -315,6 +352,8 @@ impl TaskRecord { key: self.key.clone(), label: self.label.clone(), tags: self.tags.clone(), + base_priority: self.priority, + effective_priority: effective, } } } diff --git a/src/task/tests.rs b/src/task/tests.rs index be5cb1a..13fdf54 100644 --- a/src/task/tests.rs +++ b/src/task/tests.rs @@ -256,6 +256,8 @@ fn event_header_includes_tags() { max_retries: None, memo: None, pause_reasons: super::PauseReasons::NONE, + pause_duration_ms: 0, + paused_at_ms: None, }; record.tags.insert("env".into(), "prod".into()); record.tags.insert("owner".into(), "alice".into()); diff --git a/tests/integration.rs b/tests/integration.rs index 83f4b77..c8d791f 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -12,6 +12,8 @@ //! - `cross_module`: steps 8–11 (TaskContext module access, cross-module child //! spawning, Scheduler::modules(), event module identity) +#[path = "integration/aging.rs"] +mod aging; #[path = "integration/common.rs"] mod common; #[path = "integration/cross_module.rs"] diff --git a/tests/integration/aging.rs b/tests/integration/aging.rs new file mode 100644 index 0000000..4a2f104 --- /dev/null +++ b/tests/integration/aging.rs @@ -0,0 +1,177 @@ +//! Integration tests for priority aging (Phase 1 of fair scheduling). 
+ +use std::time::Duration; + +use taskmill::{ + AgingConfig, Domain, Priority, Scheduler, SchedulerEvent, TaskStore, TaskSubmission, +}; +use tokio_util::sync::CancellationToken; + +use super::common::*; + +// ── Aged task dispatches before younger ────────────────────────────── + +#[tokio::test] +async fn aged_task_dispatches_before_younger() { + let store = TaskStore::open_memory().await.unwrap(); + + // Insert an IDLE task with a very old created_at via raw SQL. + let old_ms = chrono::Utc::now().timestamp_millis() - 600_000; // 10 min ago + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at) VALUES ('test::test', 'old-idle', 'old-idle', ?, 'pending', ?)", + ) + .bind(Priority::IDLE.value() as i32) + .bind(old_ms) + .execute(store.pool()) + .await + .unwrap(); + + // Submit a fresh NORMAL task. + store + .submit( + &TaskSubmission::new("test::test") + .key("fresh-normal") + .priority(Priority::NORMAL), + ) + .await + .unwrap(); + + let sched = Scheduler::builder() + .store(store) + .domain(Domain::::new().task::(NoopExecutor)) + .max_concurrency(1) + .priority_aging(AgingConfig { + grace_period: Duration::from_secs(0), + aging_interval: Duration::from_secs(1), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + }) + .build() + .await + .unwrap(); + + let mut rx = sched.subscribe(); + let token = CancellationToken::new(); + let t = token.clone(); + tokio::spawn(async move { sched.run(t).await }); + + // The aged IDLE task should dispatch first (effective priority = HIGH(64) < NORMAL(128)). 
+ let event = tokio::time::timeout(Duration::from_secs(5), rx.recv()) + .await + .unwrap() + .unwrap(); + + if let SchedulerEvent::Dispatched(header) = event { + assert_eq!(header.label, "old-idle"); + assert_eq!(header.base_priority, Priority::IDLE); + assert!( + header.effective_priority.value() < Priority::NORMAL.value(), + "effective_priority {} should be higher than NORMAL {}", + header.effective_priority.value(), + Priority::NORMAL.value(), + ); + } else { + panic!("expected Dispatched event, got {:?}", event); + } + + token.cancel(); +} + +// ── Builder configures aging ───────────────────────────────────────── + +#[tokio::test] +async fn builder_configures_aging() { + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain(Domain::::new().task::(NoopExecutor)) + .priority_aging(AgingConfig::default()) + .build() + .await + .unwrap(); + + let snap = sched.snapshot().await.unwrap(); + assert!(snap.aging_config.is_some()); +} + +// ── Snapshot shows aging config ────────────────────────────────────── + +#[tokio::test] +async fn snapshot_shows_aging_config() { + let config = AgingConfig { + grace_period: Duration::from_secs(120), + aging_interval: Duration::from_secs(30), + max_effective_priority: Priority::REALTIME, + urgent_threshold: None, + }; + + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain(Domain::::new().task::(NoopExecutor)) + .priority_aging(config.clone()) + .build() + .await + .unwrap(); + + let snap = sched.snapshot().await.unwrap(); + let ac = snap.aging_config.unwrap(); + assert_eq!(ac.grace_period, Duration::from_secs(120)); + assert_eq!(ac.aging_interval, Duration::from_secs(30)); + assert_eq!(ac.max_effective_priority, Priority::REALTIME); +} + +// ── Dispatched event has effective priority ────────────────────────── + +#[tokio::test] +async fn dispatched_event_has_effective_priority() { + let store = TaskStore::open_memory().await.unwrap(); + + // Insert an 
old task. + let old_ms = chrono::Utc::now().timestamp_millis() - 600_000; + sqlx::query( + "INSERT INTO tasks (task_type, key, label, priority, status, created_at) VALUES ('test::test', 'aged', 'aged', ?, 'pending', ?)", + ) + .bind(Priority::IDLE.value() as i32) + .bind(old_ms) + .execute(store.pool()) + .await + .unwrap(); + + let sched = Scheduler::builder() + .store(store) + .domain(Domain::::new().task::(NoopExecutor)) + .max_concurrency(1) + .priority_aging(AgingConfig { + grace_period: Duration::from_secs(0), + aging_interval: Duration::from_secs(1), + max_effective_priority: Priority::HIGH, + urgent_threshold: None, + }) + .build() + .await + .unwrap(); + + let mut rx = sched.subscribe(); + let token = CancellationToken::new(); + let t = token.clone(); + tokio::spawn(async move { sched.run(t).await }); + + let event = tokio::time::timeout(Duration::from_secs(5), rx.recv()) + .await + .unwrap() + .unwrap(); + + if let SchedulerEvent::Dispatched(header) = event { + assert_eq!(header.base_priority, Priority::IDLE); + // effective_priority should be lower numeric value (higher priority) than base. + assert!( + header.effective_priority.value() < header.base_priority.value(), + "expected aging to promote: effective={} base={}", + header.effective_priority.value(), + header.base_priority.value(), + ); + } else { + panic!("expected Dispatched event, got {:?}", event); + } + + token.cancel(); +} diff --git a/tests/integration/dependencies.rs b/tests/integration/dependencies.rs index aa4e6e1..e2df2eb 100644 --- a/tests/integration/dependencies.rs +++ b/tests/integration/dependencies.rs @@ -53,10 +53,10 @@ async fn dep_basic_blocked_then_unblocked() { let b = store.task_by_id(id_b).await.unwrap().unwrap(); assert_eq!(b.status, taskmill::TaskStatus::Blocked); - assert!(store.peek_next().await.unwrap().is_some()); // A is pending + assert!(store.peek_next(None).await.unwrap().is_some()); // A is pending // Complete A. 
- let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(a.id, id_a); store .complete(a.id, &taskmill::IoBudget::default()) @@ -96,7 +96,7 @@ async fn dep_fail_cancels_dependent() { .unwrap(); // Fail A permanently. - let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); store .fail( a.id, @@ -156,7 +156,7 @@ async fn dep_fan_in() { assert_eq!(c.status, taskmill::TaskStatus::Blocked); // Complete A. - let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); store .complete(a.id, &taskmill::IoBudget::default()) .await @@ -168,7 +168,7 @@ async fn dep_fan_in() { assert_eq!(c.status, taskmill::TaskStatus::Blocked); // Complete B. - let b = store.pop_next().await.unwrap().unwrap(); + let b = store.pop_next(None).await.unwrap().unwrap(); store .complete(b.id, &taskmill::IoBudget::default()) .await @@ -214,7 +214,7 @@ async fn dep_fan_out() { .unwrap(); // Complete A. - let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); store .complete(a.id, &taskmill::IoBudget::default()) .await @@ -308,7 +308,7 @@ async fn dep_already_completed() { .unwrap(); // Complete A. - let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); store .complete(a.id, &taskmill::IoBudget::default()) .await @@ -343,7 +343,7 @@ async fn dep_already_failed() { .id() .unwrap(); - let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); store .fail( a.id, @@ -449,7 +449,7 @@ async fn dep_ignore_policy_unblocks() { assert_eq!(b.status, taskmill::TaskStatus::Blocked); // Fail A permanently. 
- let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); store .fail( a.id, @@ -575,7 +575,7 @@ async fn dep_diamond_chain() { ); // Complete A → B and C unblock, D still blocked. - let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); store .complete(a.id, &taskmill::IoBudget::default()) .await @@ -589,7 +589,7 @@ async fn dep_diamond_chain() { ); // Complete B → D still blocked (needs C). - let b = store.pop_next().await.unwrap().unwrap(); + let b = store.pop_next(None).await.unwrap().unwrap(); store .complete(b.id, &taskmill::IoBudget::default()) .await @@ -598,7 +598,7 @@ async fn dep_diamond_chain() { assert!(unblocked.is_empty()); // Complete C → D unblocks. - let c = store.pop_next().await.unwrap().unwrap(); + let c = store.pop_next(None).await.unwrap().unwrap(); store .complete(c.id, &taskmill::IoBudget::default()) .await @@ -743,7 +743,7 @@ async fn dep_blocked_tasks_survive_across_store_reopen() { assert_eq!(deps, vec![id_a]); // Complete A and resolve — B should unblock. 
- let a = store.pop_next().await.unwrap().unwrap(); + let a = store.pop_next(None).await.unwrap().unwrap(); store .complete(a.id, &taskmill::IoBudget::default()) .await diff --git a/tests/integration/scheduler_core.rs b/tests/integration/scheduler_core.rs index ee5c7e9..f3008bb 100644 --- a/tests/integration/scheduler_core.rs +++ b/tests/integration/scheduler_core.rs @@ -657,7 +657,7 @@ async fn running_tasks_reset_to_pending_on_restart() { let store = TaskStore::open(&db_path).await.unwrap(); let sub = TaskSubmission::new("test").key("crash-recovery"); store.submit(&sub).await.unwrap(); - store.pop_next().await.unwrap(); // now "running" + store.pop_next(None).await.unwrap(); // now "running" let running = store.running_count().await.unwrap(); assert_eq!(running, 1, "task should be running"); @@ -774,9 +774,9 @@ async fn delayed_task_not_dispatched_before_run_after() { store.submit(&sub).await.unwrap(); // peek_next should return None because run_after is in the future. - assert!(store.peek_next().await.unwrap().is_none()); + assert!(store.peek_next(None).await.unwrap().is_none()); // pop_next should also return None. - assert!(store.pop_next().await.unwrap().is_none()); + assert!(store.pop_next(None).await.unwrap().is_none()); // But the task is still pending. assert_eq!(store.pending_count().await.unwrap(), 1); @@ -793,7 +793,7 @@ async fn delayed_task_dispatched_after_run_after() { store.submit(&sub).await.unwrap(); // Should be immediately dispatchable since run_after is in the past. - let task = store.peek_next().await.unwrap(); + let task = store.peek_next(None).await.unwrap(); assert!(task.is_some()); assert!(task.unwrap().run_after.is_some()); } @@ -810,7 +810,7 @@ async fn recurring_task_creates_next_instance_on_completion() { let dedup_key = sub.effective_key(); // Pop and complete. 
- let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert_eq!(task.recurring_interval_secs, Some(60)); assert_eq!(task.recurring_execution_count, 0); @@ -845,7 +845,7 @@ async fn recurring_task_respects_max_executions() { let dedup_key = sub.effective_key(); // First execution. - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .complete(task.id, &taskmill::IoBudget::default()) .await @@ -858,7 +858,7 @@ async fn recurring_task_respects_max_executions() { tokio::time::sleep(Duration::from_secs(2)).await; // Second execution. - let task2 = store.pop_next().await.unwrap().unwrap(); + let task2 = store.pop_next(None).await.unwrap().unwrap(); store .complete(task2.id, &taskmill::IoBudget::default()) .await @@ -881,7 +881,7 @@ async fn recurring_pile_up_prevention() { let dedup_key = sub.effective_key(); // Pop, complete → next instance created. - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); store .complete(task.id, &taskmill::IoBudget::default()) .await @@ -913,7 +913,7 @@ async fn pause_and_resume_recurring_schedule() { store.pause_recurring(id).await.unwrap(); // Pop and complete — should NOT create next instance. - let task = store.pop_next().await.unwrap().unwrap(); + let task = store.pop_next(None).await.unwrap().unwrap(); assert!(task.recurring_paused); store .complete(task.id, &taskmill::IoBudget::default()) From f80badad0f642bd9d1ee7b12f7f2829d162ec4a9 Mon Sep 17 00:00:00 2001 From: DJ Majumdar Date: Tue, 24 Mar 2026 01:12:40 -0700 Subject: [PATCH 2/3] feat!: implement weighted fair scheduling for group-aware dispatch (#37, phase 2) Add three-pass fair dispatch loop that allocates slots proportional to group weights, fills remaining capacity greedily, and dispatches urgently-aged tasks as a safety valve. 
Groups without explicit weights use the default weight; ungrouped tasks compete as a virtual group. New builder API: group_weight(), default_group_weight(), group_minimum_slots(). Runtime API: set_group_weight(), remove_group_weight(), reset_group_weights(), set_group_minimum_slots(). GroupWeightChanged event emitted on runtime changes. SchedulerSnapshot includes group_allocations. Store queries added: peek_next_in_group(), peek_next_ungrouped(), running/pending_counts_per_group(), peek_next_urgent(). Composes with phase 1 aging, rate limits, group pause, and concurrency caps. Fast dispatch disabled when weights configured. --- src/domain.rs | 22 ++ src/lib.rs | 6 +- src/module.rs | 22 ++ src/scheduler/builder.rs | 48 +++ src/scheduler/control.rs | 45 ++- src/scheduler/event.rs | 12 +- src/scheduler/fair.rs | 495 ++++++++++++++++++++++++++ src/scheduler/gate.rs | 35 +- src/scheduler/mod.rs | 5 + src/scheduler/queries.rs | 45 +++ src/scheduler/run_loop.rs | 343 +++++++++++++++++- src/store/query/scheduling.rs | 194 +++++++++++ tests/integration.rs | 2 + tests/integration/fair_scheduling.rs | 496 +++++++++++++++++++++++++++ 14 files changed, 1749 insertions(+), 21 deletions(-) create mode 100644 src/scheduler/fair.rs create mode 100644 tests/integration/fair_scheduling.rs diff --git a/src/domain.rs b/src/domain.rs index e4ebc6a..53b8d78 100644 --- a/src/domain.rs +++ b/src/domain.rs @@ -736,6 +736,28 @@ impl DomainHandle { self.inner.remove_group_rate_limit(group); } + // ── Group Weights (Fair Scheduling) ────────────────────────────── + + /// Set or update the scheduling weight for a group at runtime. + pub fn set_group_weight(&self, group: impl Into, weight: u32) { + self.inner.set_group_weight(group, weight); + } + + /// Remove a group weight override, falling back to the default weight. + pub fn remove_group_weight(&self, group: &str) { + self.inner.remove_group_weight(group); + } + + /// Reset all group weights to the default. 
+ pub fn reset_group_weights(&self) { + self.inner.reset_group_weights(); + } + + /// Set the minimum guaranteed slots for a group. + pub fn set_group_minimum_slots(&self, group: impl Into, slots: usize) { + self.inner.set_group_minimum_slots(group, slots); + } + // ── Events ────────────────────────────────────────────────────── /// Subscribe to all events for this domain. diff --git a/src/lib.rs b/src/lib.rs index 2da1c19..d0e264f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -808,9 +808,9 @@ pub use resource::network_pressure::NetworkPressure; pub use resource::sampler::SamplerConfig; pub use resource::{ResourceReader, ResourceSampler, ResourceSnapshot}; pub use scheduler::{ - AgingConfig, EstimatedProgress, GroupLimits, PausedGroupInfo, ProgressReporter, RateLimit, - RateLimitInfo, Scheduler, SchedulerBuilder, SchedulerConfig, SchedulerEvent, SchedulerSnapshot, - ShutdownMode, TaskEventHeader, TaskProgress, + AgingConfig, EstimatedProgress, GroupAllocationInfo, GroupLimits, PausedGroupInfo, + ProgressReporter, RateLimit, RateLimitInfo, Scheduler, SchedulerBuilder, SchedulerConfig, + SchedulerEvent, SchedulerSnapshot, ShutdownMode, TaskEventHeader, TaskProgress, }; pub use store::{RetentionPolicy, StoreConfig, StoreError, TaskStore}; pub use task::{ diff --git a/src/module.rs b/src/module.rs index e6f825e..efb792d 100644 --- a/src/module.rs +++ b/src/module.rs @@ -745,6 +745,28 @@ impl ModuleHandle { self.scheduler.remove_group_rate_limit(group); } + // ── Group Weights (Fair Scheduling) ────────────────────────────── + + /// Set or update the scheduling weight for a group at runtime. + pub fn set_group_weight(&self, group: impl Into, weight: u32) { + self.scheduler.set_group_weight(group, weight); + } + + /// Remove a group weight override, falling back to the default weight. + pub fn remove_group_weight(&self, group: &str) { + self.scheduler.remove_group_weight(group); + } + + /// Reset all group weights to the default. 
+    pub fn reset_group_weights(&self) {
+        self.scheduler.reset_group_weights();
+    }
+
+    /// Set the minimum guaranteed slots for a group (forwards to the scheduler).
+    pub fn set_group_minimum_slots(&self, group: impl Into<String>, slots: usize) {
+        self.scheduler.set_group_minimum_slots(group, slots);
+    }
+
     // ── Scoped queries ────────────────────────────────────────────
 
     /// All active tasks in this module (any status).
diff --git a/src/scheduler/builder.rs b/src/scheduler/builder.rs
index da3b145..3f39663 100644
--- a/src/scheduler/builder.rs
+++ b/src/scheduler/builder.rs
@@ -57,6 +57,9 @@ pub struct SchedulerBuilder {
     group_concurrency_overrides: Vec<(String, usize)>,
     type_rate_limits: Vec<(String, RateLimit)>,
     group_rate_limits: Vec<(String, RateLimit)>,
+    group_weights: Vec<(String, u32)>,
+    default_group_weight: u32,
+    group_min_slots: Vec<(String, usize)>,
 }
 
 impl SchedulerBuilder {
@@ -79,6 +82,9 @@ impl SchedulerBuilder {
             group_concurrency_overrides: Vec::new(),
             type_rate_limits: Vec::new(),
             group_rate_limits: Vec::new(),
+            group_weights: Vec::new(),
+            default_group_weight: 1,
+            group_min_slots: Vec::new(),
         }
     }
 
@@ -284,6 +290,26 @@ impl SchedulerBuilder {
         self
     }
 
+    /// Set a scheduling weight for a specific group.
+    ///
+    /// Weights are relative — `(A:3, B:1)` gives A 75% and B 25%.
+    pub fn group_weight(mut self, group: impl Into<String>, weight: u32) -> Self {
+        self.group_weights.push((group.into(), weight));
+        self
+    }
+
+    /// Default weight for groups without a specific override. Default: 1.
+    pub fn default_group_weight(mut self, weight: u32) -> Self {
+        self.default_group_weight = weight;
+        self
+    }
+
+    /// Minimum guaranteed slots for a group, regardless of weight.
+    pub fn group_minimum_slots(mut self, group: impl Into<String>, slots: usize) -> Self {
+        self.group_min_slots.push((group.into(), slots));
+        self
+    }
+
     /// Register shared application state accessible from every executor via
     /// [`TaskContext::state`](crate::TaskContext::state).
     ///
@@ -509,6 +535,27 @@ impl SchedulerBuilder {
             scheduler.inner.group_rate_limits.set(scope, limit);
         }
 
+        // Apply group weights. Only explicit per-group weights or minimum slots switch fair dispatch on.
+        let has_group_weights = !self.group_weights.is_empty() || !self.group_min_slots.is_empty();
+        if self.default_group_weight != 1 {
+            scheduler
+                .inner
+                .group_weights
+                .set_default(self.default_group_weight);
+        }
+        for (group, weight) in &self.group_weights {
+            scheduler
+                .inner
+                .group_weights
+                .set_weight(group.clone(), *weight);
+        }
+        for (group, slots) in &self.group_min_slots {
+            scheduler
+                .inner
+                .group_weights
+                .set_min_slots(group.clone(), *slots);
+        }
+
         // Compute fast-dispatch eligibility before consuming builder fields.
         let has_groups =
             self.default_group_concurrency > 0 || !self.group_concurrency_overrides.is_empty();
@@ -563,6 +610,7 @@ impl SchedulerBuilder {
             && !has_module_caps
             && !has_paused_groups
             && !has_rate_limits
+            && !has_group_weights
         {
             scheduler
                 .inner
diff --git a/src/scheduler/control.rs b/src/scheduler/control.rs
index a25b914..9684631 100644
--- a/src/scheduler/control.rs
+++ b/src/scheduler/control.rs
@@ -242,6 +242,46 @@ impl Scheduler {
         self.inner.paused_groups.read().unwrap().contains(group_key)
     }
 
+    // ── Group Weights (Fair Scheduling) ─────────────────────────────
+
+    /// Set or update a group's scheduling weight at runtime; disables fast dispatch and emits `GroupWeightChanged`.
+    pub fn set_group_weight(&self, group: impl Into<String>, weight: u32) {
+        let group = group.into();
+        let previous = self.inner.group_weights.weight_for(&group);
+        self.inner.group_weights.set_weight(group.clone(), weight);
+        self.inner
+            .fast_dispatch
+            .store(false, AtomicOrdering::Relaxed);
+        emit_event(
+            &self.inner.event_tx,
+            SchedulerEvent::GroupWeightChanged {
+                group,
+                previous_weight: previous,
+                new_weight: weight,
+            },
+        );
+    }
+
+    /// Remove a group weight override, falling back to the default weight.
+    pub fn remove_group_weight(&self, group: &str) {
+        self.inner.group_weights.remove_weight(group);
+        self.maybe_restore_fast_dispatch();
+    }
+
+    /// Reset all group weights to the default (also clears minimum-slot guarantees, see `GroupWeights::reset_all`).
+    pub fn reset_group_weights(&self) {
+        self.inner.group_weights.reset_all();
+        self.maybe_restore_fast_dispatch();
+    }
+
+    /// Set the minimum guaranteed slots for a group; disables fast dispatch.
+    pub fn set_group_minimum_slots(&self, group: impl Into<String>, slots: usize) {
+        self.inner.group_weights.set_min_slots(group.into(), slots);
+        self.inner
+            .fast_dispatch
+            .store(false, AtomicOrdering::Relaxed);
+    }
+
     // ── Rate Limiting ──────────────────────────────────────────────
 
     /// Set or update the rate limit for a task type at runtime.
@@ -279,7 +319,8 @@ impl Scheduler {
     ///
     /// Must mirror the conditions in `SchedulerBuilder::build()`:
     /// no paused groups, no group limits (default or overrides), no resource
-    /// monitoring, no pressure sources, no module concurrency caps, no rate limits.
+    /// monitoring, no pressure sources, no module concurrency caps, no rate limits,
+    /// no group weights.
     fn maybe_restore_fast_dispatch(&self) {
         let has_groups = self.inner.group_limits.default_limit() > 0
             || self.inner.group_limits.has_overrides()
@@ -287,6 +328,7 @@ impl Scheduler {
         let has_module_caps = !self.inner.module_caps.read().unwrap().is_empty();
         let has_rate_limits =
             !self.inner.type_rate_limits.is_empty() || !self.inner.group_rate_limits.is_empty();
+        let has_group_weights = self.inner.group_weights.is_configured();
 
         if !has_groups
             && !self
@@ -299,6 +341,7 @@ impl Scheduler {
                 .has_pressure_sources
                 .load(AtomicOrdering::Relaxed)
             && !has_rate_limits
+            && !has_group_weights
         {
             self.inner
                 .fast_dispatch
diff --git a/src/scheduler/event.rs b/src/scheduler/event.rs
index 02ab93e..ce9dd36 100644
--- a/src/scheduler/event.rs
+++ b/src/scheduler/event.rs
@@ -56,6 +56,9 @@ pub struct SchedulerSnapshot {
     pub rate_limits: Vec<RateLimitInfo>,
     /// Priority aging configuration (if enabled).
pub aging_config: Option, + /// Per-group slot allocations (current cycle). Empty when fair scheduling + /// is not configured. + pub group_allocations: Vec, } /// Information about a paused group for snapshot/dashboard display. @@ -198,6 +201,12 @@ pub enum SchedulerEvent { }, /// A task group was resumed. GroupResumed { group: String, resumed_count: usize }, + /// A group's scheduling weight was changed at runtime. + GroupWeightChanged { + group: String, + previous_weight: u32, + new_weight: u32, + }, } impl SchedulerEvent { @@ -221,7 +230,8 @@ impl SchedulerEvent { | Self::Paused | Self::Resumed | Self::GroupPaused { .. } - | Self::GroupResumed { .. } => None, + | Self::GroupResumed { .. } + | Self::GroupWeightChanged { .. } => None, } } } diff --git a/src/scheduler/fair.rs b/src/scheduler/fair.rs new file mode 100644 index 0000000..7eb7f53 --- /dev/null +++ b/src/scheduler/fair.rs @@ -0,0 +1,495 @@ +//! Weighted fair scheduling — per-group slot allocation. +//! +//! When group weights are configured, the scheduler uses a three-pass +//! dispatch loop: +//! +//! 1. **Fair pass** — each group (including ungrouped tasks as a virtual +//! group) receives slots proportional to its weight. +//! 2. **Greedy pass** — unfilled slots are filled by global priority order. +//! 3. **Urgent pass** — tasks aged past `urgent_threshold` bypass weights +//! (but still respect `max_concurrency`). +//! +//! This module provides [`GroupWeights`] (thread-safe weight storage) and +//! [`compute_allocation`] (the per-cycle slot allocator). + +use std::collections::HashMap; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::RwLock; + +use serde::{Deserialize, Serialize}; + +use super::gate::GroupLimits; + +// ── GroupWeights ──────────────────────────────────────────────────── + +/// Per-group scheduling weights for weighted fair dispatch. +/// +/// Weights are relative — `(A:3, B:1)` gives A 75% and B 25% of capacity. 
+/// A group without an explicit weight gets `default_weight`. +/// +/// Thread safety: `RwLock` — reads (every dispatch cycle) vastly outnumber +/// writes (runtime reconfiguration). +pub struct GroupWeights { + default_weight: AtomicU32, + weights: RwLock>, + min_slots: RwLock>, +} + +impl Default for GroupWeights { + fn default() -> Self { + Self::new() + } +} + +impl GroupWeights { + pub fn new() -> Self { + Self { + default_weight: AtomicU32::new(1), + weights: RwLock::new(HashMap::new()), + min_slots: RwLock::new(HashMap::new()), + } + } + + /// The default weight used for groups without an explicit override + /// and for the ungrouped virtual group in the allocation. + pub fn default_weight(&self) -> u32 { + self.default_weight.load(Ordering::Relaxed) + } + + /// Effective weight for a group (override or default). + pub fn weight_for(&self, group: &str) -> u32 { + self.weights + .read() + .unwrap() + .get(group) + .copied() + .unwrap_or_else(|| self.default_weight()) + } + + /// Minimum guaranteed slots for a group (`None` = no minimum). + pub fn min_slots_for(&self, group: &str) -> Option { + self.min_slots.read().unwrap().get(group).copied() + } + + /// Set weight for a specific group. + pub fn set_weight(&self, group: String, weight: u32) { + self.weights.write().unwrap().insert(group, weight); + } + + /// Remove per-group weight override. + pub fn remove_weight(&self, group: &str) { + self.weights.write().unwrap().remove(group); + } + + /// Set minimum guaranteed slots for a group. + pub fn set_min_slots(&self, group: String, slots: usize) { + self.min_slots.write().unwrap().insert(group, slots); + } + + /// Set the default weight for groups without overrides. + pub fn set_default(&self, weight: u32) { + self.default_weight.store(weight, Ordering::Relaxed); + } + + /// Reset all weights to default. 
+ pub fn reset_all(&self) { + self.weights.write().unwrap().clear(); + self.min_slots.write().unwrap().clear(); + self.default_weight.store(1, Ordering::Relaxed); + } + + /// Returns `true` if any weights or min_slots are configured. + pub fn is_configured(&self) -> bool { + !self.weights.read().unwrap().is_empty() || !self.min_slots.read().unwrap().is_empty() + } +} + +// ── Slot Allocation ──────────────────────────────────────────────── + +/// Per-group slot allocation computed each dispatch cycle. +pub(crate) struct SlotAllocation { + /// group_key → total slots (running + available-to-dispatch). + /// `None` key = ungrouped tasks (treated as a virtual group with + /// default weight in the allocation algorithm). + pub groups: Vec<(Option, usize)>, +} + +/// Demand for a single group: how many tasks are running and pending. +pub(crate) struct GroupDemand { + pub running: usize, + pub pending: usize, +} + +/// Compute per-group slot allocations. +/// +/// `groups` uses `Option` keys — `None` = ungrouped tasks, +/// treated as a virtual group with default weight. This ensures +/// ungrouped tasks compete fairly rather than only receiving leftovers. +/// +/// Algorithm: +/// 1. Guarantee `min_slots` per group (capped at demand). +/// 2. Distribute remaining capacity proportional to weights. +/// 3. Apply group concurrency caps. +/// 4. Work-conserving: excess from capped/drained groups redistributed. +pub(crate) fn compute_allocation( + total_capacity: usize, + groups: &[(Option, GroupDemand)], + weights: &GroupWeights, + caps: &GroupLimits, + paused_groups: &std::collections::HashSet, +) -> SlotAllocation { + // Filter out paused groups (release their allocation). + // The ungrouped virtual group (key = None) is never paused. 
+ let active: Vec<_> = groups + .iter() + .filter(|(g, _)| match g { + Some(key) => !paused_groups.contains(key), + None => true, + }) + .collect(); + + let mut alloc: HashMap, usize> = HashMap::new(); + let mut remaining = total_capacity; + + // Step 1: Guarantee minimums (ungrouped group has no minimum). + for (group, demand) in &active { + let min = match group { + Some(key) => weights.min_slots_for(key).unwrap_or(0), + None => 0, + }; + let need = demand.running + demand.pending; + let grant = min.min(need).min(remaining); + alloc.insert((*group).clone(), grant); + remaining -= grant; + } + + // Step 2: Distribute remaining by weight. + // The ungrouped virtual group gets the default weight. + let total_weight: u32 = active + .iter() + .map(|(g, _)| match g { + Some(key) => weights.weight_for(key), + None => weights.default_weight(), + }) + .sum(); + + if total_weight > 0 && remaining > 0 { + // Compute raw weight-proportional shares, then iteratively + // redistribute from over-allocated groups (bounded by demand + // or cap) to under-allocated ones. + struct GroupInfo { + key: Option, + share: f64, + need: usize, + cap: usize, + } + + let mut infos: Vec = active + .iter() + .map(|(g, demand)| { + let w = match g { + Some(key) => weights.weight_for(key) as f64, + None => weights.default_weight() as f64, + }; + let already = *alloc.get(g).unwrap_or(&0); + let need = (demand.running + demand.pending).saturating_sub(already); + let cap = match g { + Some(key) => caps.limit_for(key).unwrap_or(usize::MAX), + None => usize::MAX, + }; + let cap_headroom = cap.saturating_sub(already); + let share = remaining as f64 * w / total_weight as f64; + GroupInfo { + key: (*g).clone(), + share: share.min(need as f64).min(cap_headroom as f64).max(0.0), + need: need.min(cap_headroom), + cap: cap_headroom, + } + }) + .collect(); + + // Iteratively redistribute: if bounded shares sum to less than + // remaining, redistribute the surplus to groups with headroom. 
+ for _ in 0..10 { + let floored_sum: usize = infos.iter().map(|i| i.share as usize).sum(); + let surplus = remaining.saturating_sub(floored_sum); + if surplus == 0 { + break; + } + let mut redistributed = false; + for info in infos.iter_mut() { + let floored = info.share as usize; + let headroom = info.need.saturating_sub(floored); + if headroom > 0 && info.share < info.need as f64 { + let extra = (surplus as f64).min(headroom as f64); + info.share = (info.share + extra).min(info.need as f64); + redistributed = true; + break; + } + } + if !redistributed { + break; + } + } + + // Largest-remainder rounding. + let floored_sum: usize = infos.iter().map(|i| i.share as usize).sum(); + let mut leftover = remaining.saturating_sub(floored_sum); + + // Sort by fractional part descending for largest-remainder. + // Tie-break by group name (None < Some) for deterministic allocation. + infos.sort_by(|a, b| { + let fa = a.share - a.share.floor(); + let fb = b.share - b.share.floor(); + fb.partial_cmp(&fa) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| a.key.cmp(&b.key)) + }); + for info in &infos { + let floored = info.share as usize; + let can_take = info + .need + .saturating_sub(floored) + .min(info.cap.saturating_sub(floored)); + let extra = if leftover > 0 && can_take > 0 { + leftover -= 1; + 1 + } else { + 0 + }; + *alloc.entry(info.key.clone()).or_default() += floored + extra; + } + } + + SlotAllocation { + groups: alloc.into_iter().collect(), + } +} + +// ── Snapshot Info ────────────────────────────────────────────────── + +/// Per-group allocation info for snapshot/dashboard display. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GroupAllocationInfo { + pub group: String, + pub weight: u32, + pub allocated_slots: usize, + pub running: usize, + pub pending: usize, + pub min_slots: Option, + pub cap: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_demand( + groups: Vec<(Option, usize, usize)>, + ) -> Vec<(Option, GroupDemand)> { + groups + .into_iter() + .map(|(g, running, pending)| (g, GroupDemand { running, pending })) + .collect() + } + + #[test] + fn allocation_proportional_to_weights() { + let weights = GroupWeights::new(); + weights.set_weight("A".into(), 3); + weights.set_weight("B".into(), 1); + let caps = GroupLimits::new(); + let paused = std::collections::HashSet::new(); + + let demand = make_demand(vec![(Some("A".into()), 0, 100), (Some("B".into()), 0, 100)]); + + let alloc = compute_allocation(8, &demand, &weights, &caps, &paused); + let map: HashMap<_, _> = alloc.groups.into_iter().collect(); + assert_eq!(map[&Some("A".into())], 6); // 3/4 * 8 = 6 + assert_eq!(map[&Some("B".into())], 2); // 1/4 * 8 = 2 + } + + #[test] + fn allocation_respects_min_slots() { + let weights = GroupWeights::new(); + weights.set_weight("A".into(), 10); + weights.set_weight("B".into(), 1); + weights.set_min_slots("B".into(), 3); + let caps = GroupLimits::new(); + let paused = std::collections::HashSet::new(); + + let demand = make_demand(vec![(Some("A".into()), 0, 100), (Some("B".into()), 0, 100)]); + + let alloc = compute_allocation(10, &demand, &weights, &caps, &paused); + let map: HashMap<_, _> = alloc.groups.into_iter().collect(); + // B gets at least 3 from min_slots + assert!(map[&Some("B".into())] >= 3); + // Total should equal capacity + let total: usize = map.values().sum(); + assert_eq!(total, 10); + } + + #[test] + fn allocation_respects_caps() { + let weights = GroupWeights::new(); + weights.set_weight("A".into(), 1); + weights.set_weight("B".into(), 1); + let caps = GroupLimits::new(); + 
caps.set_limit("A".into(), 2); + let paused = std::collections::HashSet::new(); + + let demand = make_demand(vec![(Some("A".into()), 0, 100), (Some("B".into()), 0, 100)]); + + let alloc = compute_allocation(10, &demand, &weights, &caps, &paused); + let map: HashMap<_, _> = alloc.groups.into_iter().collect(); + assert_eq!(map[&Some("A".into())], 2); // capped at 2 + assert_eq!(map[&Some("B".into())], 8); // gets the excess + } + + #[test] + fn allocation_work_conserving() { + let weights = GroupWeights::new(); + weights.set_weight("A".into(), 1); + weights.set_weight("B".into(), 1); + let caps = GroupLimits::new(); + let paused = std::collections::HashSet::new(); + + // A has only 2 pending, B has plenty + let demand = make_demand(vec![(Some("A".into()), 0, 2), (Some("B".into()), 0, 100)]); + + let alloc = compute_allocation(10, &demand, &weights, &caps, &paused); + let map: HashMap<_, _> = alloc.groups.into_iter().collect(); + assert_eq!(map[&Some("A".into())], 2); // only 2 needed + assert_eq!(map[&Some("B".into())], 8); // gets the rest + } + + #[test] + fn paused_groups_excluded() { + let weights = GroupWeights::new(); + weights.set_weight("A".into(), 1); + weights.set_weight("B".into(), 1); + let caps = GroupLimits::new(); + let mut paused = std::collections::HashSet::new(); + paused.insert("A".into()); + + let demand = make_demand(vec![(Some("A".into()), 0, 100), (Some("B".into()), 0, 100)]); + + let alloc = compute_allocation(10, &demand, &weights, &caps, &paused); + let map: HashMap<_, _> = alloc.groups.into_iter().collect(); + assert!(!map.contains_key(&Some("A".into()))); + assert_eq!(map[&Some("B".into())], 10); + } + + #[test] + fn equal_weights_equal_allocation() { + let weights = GroupWeights::new(); + weights.set_weight("A".into(), 1); + weights.set_weight("B".into(), 1); + let caps = GroupLimits::new(); + let paused = std::collections::HashSet::new(); + + let demand = make_demand(vec![(Some("A".into()), 0, 100), (Some("B".into()), 0, 100)]); + + let 
alloc = compute_allocation(10, &demand, &weights, &caps, &paused); + let map: HashMap<_, _> = alloc.groups.into_iter().collect(); + assert_eq!(map[&Some("A".into())], 5); + assert_eq!(map[&Some("B".into())], 5); + } + + #[test] + fn single_group_gets_all() { + let weights = GroupWeights::new(); + weights.set_weight("A".into(), 1); + let caps = GroupLimits::new(); + let paused = std::collections::HashSet::new(); + + let demand = make_demand(vec![(Some("A".into()), 0, 100)]); + + let alloc = compute_allocation(10, &demand, &weights, &caps, &paused); + let map: HashMap<_, _> = alloc.groups.into_iter().collect(); + assert_eq!(map[&Some("A".into())], 10); + } + + #[test] + fn zero_pending_gets_zero() { + let weights = GroupWeights::new(); + weights.set_weight("A".into(), 1); + weights.set_weight("B".into(), 1); + let caps = GroupLimits::new(); + let paused = std::collections::HashSet::new(); + + let demand = make_demand(vec![(Some("A".into()), 0, 0), (Some("B".into()), 0, 100)]); + + let alloc = compute_allocation(10, &demand, &weights, &caps, &paused); + let map: HashMap<_, _> = alloc.groups.into_iter().collect(); + assert_eq!(map.get(&Some("A".into())).copied().unwrap_or(0), 0); + assert_eq!(map[&Some("B".into())], 10); + } + + #[test] + fn ungrouped_gets_default_weight() { + let weights = GroupWeights::new(); // default weight = 1 + weights.set_weight("A".into(), 1); + let caps = GroupLimits::new(); + let paused = std::collections::HashSet::new(); + + let demand = make_demand(vec![ + (Some("A".into()), 0, 100), + (None, 0, 100), // ungrouped + ]); + + let alloc = compute_allocation(10, &demand, &weights, &caps, &paused); + let map: HashMap<_, _> = alloc.groups.into_iter().collect(); + assert_eq!(map[&Some("A".into())], 5); + assert_eq!(map[&None], 5); + } + + #[test] + fn sum_allocations_equals_capacity() { + let weights = GroupWeights::new(); + weights.set_weight("A".into(), 3); + weights.set_weight("B".into(), 2); + weights.set_weight("C".into(), 1); + let caps = 
GroupLimits::new(); + let paused = std::collections::HashSet::new(); + + let demand = make_demand(vec![ + (Some("A".into()), 0, 100), + (Some("B".into()), 0, 100), + (Some("C".into()), 0, 100), + ]); + + let alloc = compute_allocation(10, &demand, &weights, &caps, &paused); + let total: usize = alloc.groups.iter().map(|(_, s)| s).sum(); + assert_eq!(total, 10); + } + + #[test] + fn largest_remainder_deterministic() { + // With 3 groups of equal weight and 10 slots, we get 3.33 each. + // Largest-remainder should give 4, 3, 3 deterministically. + let weights = GroupWeights::new(); + weights.set_weight("A".into(), 1); + weights.set_weight("B".into(), 1); + weights.set_weight("C".into(), 1); + let caps = GroupLimits::new(); + let paused = std::collections::HashSet::new(); + + let demand = make_demand(vec![ + (Some("A".into()), 0, 100), + (Some("B".into()), 0, 100), + (Some("C".into()), 0, 100), + ]); + + // Run twice — must produce identical results. + let alloc1 = compute_allocation(10, &demand, &weights, &caps, &paused); + let alloc2 = compute_allocation(10, &demand, &weights, &caps, &paused); + let mut map1: Vec<_> = alloc1.groups.into_iter().collect(); + let mut map2: Vec<_> = alloc2.groups.into_iter().collect(); + map1.sort_by(|a, b| a.0.cmp(&b.0)); + map2.sort_by(|a, b| a.0.cmp(&b.0)); + assert_eq!(map1, map2); + let total: usize = map1.iter().map(|(_, s)| s).sum(); + assert_eq!(total, 10); + } +} diff --git a/src/scheduler/gate.rs b/src/scheduler/gate.rs index 0ba82e9..55b02b5 100644 --- a/src/scheduler/gate.rs +++ b/src/scheduler/gate.rs @@ -57,6 +57,10 @@ pub struct GateContext<'a> { pub type_rate_limits: &'a RateLimits, /// Per-group rate limits. pub group_rate_limits: &'a RateLimits, + /// When true, the gate skips the group concurrency check. Used by + /// `dispatch_fair()` pass 1 where group slot budgets are already + /// enforced by the allocation algorithm. 
+    pub skip_group_concurrency: bool,
 }
 
 // ── Dispatch Gate ──────────────────────────────────────────────────
@@ -185,20 +189,23 @@ impl DispatchGate for DefaultDispatchGate {
             }
         }
 
-        // Group concurrency check.
-        if let Some(group_key) = &task.group_key {
-            if let Some(limits) = ctx.group_limits {
-                if let Some(limit) = limits.limit_for(group_key) {
-                    let running = ctx.store.running_count_for_group(group_key).await?;
-                    if running >= limit as i64 {
-                        tracing::trace!(
-                            task_type = task.task_type,
-                            group = group_key,
-                            running,
-                            limit,
-                            "task deferred — group concurrency saturated — requeuing"
-                        );
-                        return Ok(Admission::Deny);
+        // Group concurrency check (skipped in fair dispatch pass 1
+        // where the allocation already enforces group slot budgets).
+        if !ctx.skip_group_concurrency {
+            if let Some(group_key) = &task.group_key {
+                if let Some(limits) = ctx.group_limits {
+                    if let Some(limit) = limits.limit_for(group_key) {
+                        let running = ctx.store.running_count_for_group(group_key).await?;
+                        if running >= limit as i64 {
+                            tracing::trace!(
+                                task_type = task.task_type,
+                                group = group_key,
+                                running,
+                                limit,
+                                "task deferred — group concurrency saturated — requeuing"
+                            );
+                            return Ok(Admission::Deny);
+                        }
                     }
                 }
             }
diff --git a/src/scheduler/mod.rs b/src/scheduler/mod.rs
index 2962876..da848a8 100644
--- a/src/scheduler/mod.rs
+++ b/src/scheduler/mod.rs
@@ -26,6 +26,7 @@ mod builder;
 mod control;
 pub(crate) mod dispatch;
 pub(crate) mod event;
+pub mod fair;
 pub(crate) mod gate;
 pub mod progress;
 mod queries;
@@ -83,6 +84,7 @@ pub use event::{
     PausedGroupInfo, SchedulerConfig, SchedulerEvent, SchedulerSnapshot, ShutdownMode,
     TaskEventHeader,
 };
+pub use fair::GroupAllocationInfo;
 pub use gate::GroupLimits;
 pub use progress::{EstimatedProgress, ProgressReporter, TaskProgress};
 pub use rate_limit::{RateLimit, RateLimitInfo, RateLimits};
@@ -199,6 +201,8 @@ pub(crate) struct SchedulerInner {
     pub(crate) failure_rx: std::sync::Arc>>,
     /// Priority aging configuration. `None` = aging disabled.
     pub(crate) aging_config: Option<Arc<AgingConfig>>,
+    /// Per-group scheduling weights for weighted fair dispatch.
+    pub(crate) group_weights: fair::GroupWeights,
 }
 
 /// IO-aware priority scheduler.
@@ -347,6 +351,7 @@ impl Scheduler {
                 failure_tx,
                 failure_rx: std::sync::Arc::new(Mutex::new(failure_rx)),
                 aging_config: config.aging_config.map(Arc::new),
+                group_weights: fair::GroupWeights::new(),
             }),
         }
     }
diff --git a/src/scheduler/queries.rs b/src/scheduler/queries.rs
index e029590..590f04b 100644
--- a/src/scheduler/queries.rs
+++ b/src/scheduler/queries.rs
@@ -3,6 +3,7 @@
 use crate::store::StoreError;
 
 use super::event::PausedGroupInfo;
+use super::fair::GroupAllocationInfo;
 use super::progress::TaskProgress;
 use super::{EstimatedProgress, Scheduler, SchedulerSnapshot};
 
@@ -172,6 +173,49 @@ impl Scheduler {
         let mut rate_limits = self.inner.type_rate_limits.snapshot_info("type");
         rate_limits.extend(self.inner.group_rate_limits.snapshot_info("group"));
 
+        // Group allocations (fair scheduling); left empty unless weights or minimum slots are configured.
+        let group_allocations = if self.inner.group_weights.is_configured() {
+            let running_groups = self.inner.store.running_counts_per_group().await?;
+            let pending_groups = self.inner.store.pending_counts_per_group().await?;
+            let mut alloc_info = Vec::new();
+            // Merge running and pending into a combined view: key → (running, pending).
+            let mut groups_seen: std::collections::HashMap<Option<String>, (usize, usize)> =
+                std::collections::HashMap::new();
+            for (g, c) in &running_groups {
+                groups_seen.entry(g.clone()).or_default().0 = *c;
+            }
+            for (g, c) in &pending_groups {
+                groups_seen.entry(g.clone()).or_default().1 = *c;
+            }
+            for (g, (r, p)) in &groups_seen {
+                let name = g.clone().unwrap_or_default();
+                let weight = match g {
+                    Some(key) => self.inner.group_weights.weight_for(key),
+                    None => self.inner.group_weights.default_weight(),
+                };
+                let min = match g {
+                    Some(key) => self.inner.group_weights.min_slots_for(key),
+                    None => None,
+                };
+                let cap = match g {
+                    Some(key) => self.inner.group_limits.limit_for(key),
+                    None => None,
+                };
+                alloc_info.push(GroupAllocationInfo {
+                    group: name,
+                    weight,
+                    allocated_slots: 0, // snapshot is a point-in-time; allocation is per-cycle
+                    running: *r,
+                    pending: *p,
+                    min_slots: min,
+                    cap,
+                });
+            }
+            alloc_info
+        } else {
+            Vec::new()
+        };
+
         Ok(SchedulerSnapshot {
             running,
             pending_count,
@@ -188,6 +232,7 @@ impl Scheduler {
             paused_groups,
             rate_limits,
             aging_config: self.inner.aging_config.as_ref().map(|arc| (**arc).clone()),
+            group_allocations,
         })
     }
 }
diff --git a/src/scheduler/run_loop.rs b/src/scheduler/run_loop.rs
index e41d7bf..0e3d1b9 100644
--- a/src/scheduler/run_loop.rs
+++ b/src/scheduler/run_loop.rs
@@ -1,5 +1,6 @@
 //! The main scheduler run loop, dispatch logic, and shutdown.
 
+use std::collections::HashMap; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::Arc; use std::time::Duration; @@ -7,6 +8,7 @@ use std::time::Duration; use tokio_util::sync::CancellationToken; use crate::scheduler::aging::AgingParams; +use crate::scheduler::fair::{self, GroupDemand}; use crate::store::StoreError; use crate::task::IoBudget; @@ -114,6 +116,7 @@ impl Scheduler { paused_groups: &paused_groups, type_rate_limits: &self.inner.type_rate_limits, group_rate_limits: &self.inner.group_rate_limits, + skip_group_concurrency: false, }; // Admission check while the task is still pending — no running @@ -269,9 +272,12 @@ impl Scheduler { Ok(true) } - /// Dispatch pending tasks using batch pop on the fast path (no gate - /// checks), falling back to one-at-a-time dispatch on the slow path. + /// Dispatch pending tasks using fair scheduling (if configured), batch + /// pop on the fast path, or one-at-a-time on the slow path. async fn dispatch_pending(&self) -> Result<(), StoreError> { + if self.inner.group_weights.is_configured() { + return self.dispatch_fair().await; + } if self.inner.fast_dispatch.load(AtomicOrdering::Relaxed) { let aging = self .inner @@ -311,6 +317,314 @@ impl Scheduler { Ok(()) } + /// Three-pass fair dispatch loop. + /// + /// Pass 1 (fair): Each group gets slots proportional to its weight. + /// Pass 2 (greedy): Unfilled slots filled by global priority order. + /// Pass 3 (urgent): Tasks aged past urgent_threshold bypass weights. + async fn dispatch_fair(&self) -> Result<(), StoreError> { + let active_count = self.inner.active.count(); + let max = self.inner.max_concurrency.load(AtomicOrdering::Relaxed); + if active_count >= max { + return Ok(()); + } + + let now_ms = chrono::Utc::now().timestamp_millis(); + let aging = self + .inner + .aging_config + .as_ref() + .map(|c| AgingParams::from_config(c)); + + // Gather demand. 
+ let running = self.inner.store.running_counts_per_group().await?; + let pending = self.inner.store.pending_counts_per_group().await?; + let paused = self.inner.paused_groups.read().unwrap().clone(); + + let running_map: HashMap, usize> = running.iter().cloned().collect(); + + let demand = merge_demand(&running, &pending); + let allocation = fair::compute_allocation( + max, + &demand, + &self.inner.group_weights, + &self.inner.group_limits, + &paused, + ); + + let mut dispatched = 0; + + // ── Pass 1: Fair per-group dispatch ──────────────────────── + for (group, total_slots) in &allocation.groups { + let group_running = running_map.get(group).copied().unwrap_or(0); + let available = total_slots.saturating_sub(group_running); + for _ in 0..available { + if self.inner.active.count() + dispatched >= max { + break; + } + let candidate = match group { + Some(g) => { + self.inner + .store + .peek_next_in_group(g, aging.as_ref()) + .await? + } + None => self.inner.store.peek_next_ungrouped(aging.as_ref()).await?, + }; + let Some(mut candidate) = candidate else { + break; + }; + + // Expiry check. + if let Some(expires_at) = candidate.expires_at { + if expires_at.timestamp_millis() <= now_ms { + self.expire_task_inline(candidate).await?; + continue; + } + } + + self.inner + .store + .populate_tags(std::slice::from_mut(&mut candidate)) + .await?; + + // Gate check — skip group concurrency (allocation handles it). + let reader_guard = self.inner.resource_reader.lock().await; + let gate_ctx = GateContext { + store: &self.inner.store, + resource_reader: reader_guard.as_ref(), + group_limits: Some(&self.inner.group_limits), + module_caps: &self.inner.module_caps, + module_running: &self.inner.module_running, + paused_groups: &paused, + type_rate_limits: &self.inner.type_rate_limits, + group_rate_limits: &self.inner.group_rate_limits, + skip_group_concurrency: true, + }; + + match self.inner.gate.admit(&candidate, &gate_ctx).await? 
{ + Admission::Admit => { + drop(reader_guard); + if self.inner.store.claim_task(candidate.id).await? { + let mut task = candidate; + task.status = crate::task::TaskStatus::Running; + task.started_at = Some(chrono::Utc::now()); + if task.ttl_from == crate::task::TtlFrom::FirstAttempt + && task.expires_at.is_none() + { + if let Some(ttl) = task.ttl_seconds { + task.expires_at = + Some(chrono::Utc::now() + chrono::Duration::seconds(ttl)); + } + } + self.spawn_dispatched_task(task).await?; + dispatched += 1; + } + } + Admission::RateLimited(next) => { + drop(reader_guard); + let wait = next.duration_since(tokio::time::Instant::now()); + let run_after = chrono::Utc::now() + + chrono::Duration::from_std(wait) + .unwrap_or(chrono::Duration::milliseconds(1)); + self.inner + .store + .set_run_after(candidate.id, run_after) + .await?; + } + Admission::Deny => { + drop(reader_guard); + break; // stop this group + } + } + } + } + + // ── Pass 2: Greedy fill (work-conserving) ───────────────── + let remaining = max.saturating_sub(self.inner.active.count() + dispatched); + if remaining > 0 { + for _ in 0..remaining { + let Some(mut candidate) = self.inner.store.peek_next(aging.as_ref()).await? else { + break; + }; + + if let Some(expires_at) = candidate.expires_at { + if expires_at.timestamp_millis() <= now_ms { + self.expire_task_inline(candidate).await?; + continue; + } + } + + self.inner + .store + .populate_tags(std::slice::from_mut(&mut candidate)) + .await?; + + // Full gate check (group concurrency enforced here). 
+ let reader_guard = self.inner.resource_reader.lock().await; + let gate_ctx = GateContext { + store: &self.inner.store, + resource_reader: reader_guard.as_ref(), + group_limits: Some(&self.inner.group_limits), + module_caps: &self.inner.module_caps, + module_running: &self.inner.module_running, + paused_groups: &paused, + type_rate_limits: &self.inner.type_rate_limits, + group_rate_limits: &self.inner.group_rate_limits, + skip_group_concurrency: false, + }; + + match self.inner.gate.admit(&candidate, &gate_ctx).await? { + Admission::Admit => { + drop(reader_guard); + if self.inner.store.claim_task(candidate.id).await? { + let mut task = candidate; + task.status = crate::task::TaskStatus::Running; + task.started_at = Some(chrono::Utc::now()); + if task.ttl_from == crate::task::TtlFrom::FirstAttempt + && task.expires_at.is_none() + { + if let Some(ttl) = task.ttl_seconds { + task.expires_at = + Some(chrono::Utc::now() + chrono::Duration::seconds(ttl)); + } + } + self.spawn_dispatched_task(task).await?; + } + } + Admission::RateLimited(next) => { + drop(reader_guard); + let wait = next.duration_since(tokio::time::Instant::now()); + let run_after = chrono::Utc::now() + + chrono::Duration::from_std(wait) + .unwrap_or(chrono::Duration::milliseconds(1)); + self.inner + .store + .set_run_after(candidate.id, run_after) + .await?; + } + Admission::Deny => { + drop(reader_guard); + break; + } + } + } + } + + // ── Pass 3: Urgent threshold override ───────────────────── + if let Some(config) = &self.inner.aging_config { + if let Some(urgent) = config.urgent_threshold { + let remaining = max.saturating_sub(self.inner.active.count()); + if remaining > 0 { + self.dispatch_urgent(urgent, remaining, aging.as_ref()) + .await?; + } + } + } + + Ok(()) + } + + /// Dispatch tasks whose effective priority has aged past the urgent + /// threshold, regardless of group allocation. Respects max_concurrency. 
+ async fn dispatch_urgent( + &self, + threshold: crate::priority::Priority, + limit: usize, + aging: Option<&AgingParams>, + ) -> Result<(), StoreError> { + let paused = self.inner.paused_groups.read().unwrap().clone(); + for _ in 0..limit { + let Some(mut candidate) = self.inner.store.peek_next_urgent(threshold, aging).await? + else { + break; + }; + + let now_ms = chrono::Utc::now().timestamp_millis(); + if let Some(expires_at) = candidate.expires_at { + if expires_at.timestamp_millis() <= now_ms { + self.expire_task_inline(candidate).await?; + continue; + } + } + + self.inner + .store + .populate_tags(std::slice::from_mut(&mut candidate)) + .await?; + + // Full gate check (urgent bypasses weights, not concurrency/rate-limits). + let reader_guard = self.inner.resource_reader.lock().await; + let gate_ctx = GateContext { + store: &self.inner.store, + resource_reader: reader_guard.as_ref(), + group_limits: Some(&self.inner.group_limits), + module_caps: &self.inner.module_caps, + module_running: &self.inner.module_running, + paused_groups: &paused, + type_rate_limits: &self.inner.type_rate_limits, + group_rate_limits: &self.inner.group_rate_limits, + skip_group_concurrency: false, + }; + + match self.inner.gate.admit(&candidate, &gate_ctx).await? { + Admission::Admit => { + drop(reader_guard); + if self.inner.store.claim_task(candidate.id).await? 
{ + let mut task = candidate; + task.status = crate::task::TaskStatus::Running; + task.started_at = Some(chrono::Utc::now()); + if task.ttl_from == crate::task::TtlFrom::FirstAttempt + && task.expires_at.is_none() + { + if let Some(ttl) = task.ttl_seconds { + task.expires_at = + Some(chrono::Utc::now() + chrono::Duration::seconds(ttl)); + } + } + self.spawn_dispatched_task(task).await?; + } + } + Admission::RateLimited(next) => { + drop(reader_guard); + let wait = next.duration_since(tokio::time::Instant::now()); + let run_after = chrono::Utc::now() + + chrono::Duration::from_std(wait) + .unwrap_or(chrono::Duration::milliseconds(1)); + self.inner + .store + .set_run_after(candidate.id, run_after) + .await?; + } + Admission::Deny => { + drop(reader_guard); + break; + } + } + } + Ok(()) + } + + /// Expire a single task inline (shared by dispatch_fair and dispatch_urgent to avoid code duplication). + async fn expire_task_inline( + &self, + candidate: crate::task::TaskRecord, + ) -> Result<(), StoreError> { + if let Ok(Some(task)) = self.inner.store.expire_single(candidate.id).await { + let age = (chrono::Utc::now() - task.created_at) + .to_std() + .unwrap_or_default(); + emit_event( + &self.inner.event_tx, + SchedulerEvent::TaskExpired { + header: task.event_header(), + age, + }, + ); + } + Ok(()) + } + /// Run the scheduler loop until the cancellation token is triggered. /// /// This is the main entry point. The loop wakes on three conditions: @@ -604,3 +918,28 @@ impl Scheduler { self.inner.store.close().await; } } + +/// Merge running and pending counts into a unified demand list. 
+fn merge_demand( + running: &[(Option, usize)], + pending: &[(Option, usize)], +) -> Vec<(Option, GroupDemand)> { + let mut map: HashMap, GroupDemand> = HashMap::new(); + for (g, count) in running { + map.entry(g.clone()) + .or_insert(GroupDemand { + running: 0, + pending: 0, + }) + .running = *count; + } + for (g, count) in pending { + map.entry(g.clone()) + .or_insert(GroupDemand { + running: 0, + pending: 0, + }) + .pending = *count; + } + map.into_iter().collect() +} diff --git a/src/store/query/scheduling.rs b/src/store/query/scheduling.rs index 5c4315b..6982aeb 100644 --- a/src/store/query/scheduling.rs +++ b/src/store/query/scheduling.rs @@ -1,7 +1,9 @@ //! Scheduling queries and recurring task control. +use crate::scheduler::aging::AgingParams; use crate::store::row_mapping::row_to_task_record; use crate::store::{StoreError, TaskStore}; +use crate::task::TaskRecord; impl TaskStore { /// Returns the earliest `run_after` timestamp among pending tasks, if any. @@ -69,6 +71,198 @@ impl TaskStore { Ok(()) } + // ── Per-group peek queries (fair scheduling) ────────────────── + + /// Peek the highest effective-priority pending task in a specific group. + pub async fn peek_next_in_group( + &self, + group_key: &str, + aging: Option<&AgingParams>, + ) -> Result, StoreError> { + let now_ms = chrono::Utc::now().timestamp_millis(); + let row = match aging { + None => { + sqlx::query( + "SELECT * FROM tasks + WHERE id = ( + SELECT id FROM tasks + WHERE status = 'pending' + AND group_key = ? + AND (run_after IS NULL OR run_after <= ?) + ORDER BY priority ASC, id ASC + LIMIT 1 + )", + ) + .bind(group_key) + .bind(now_ms) + .fetch_optional(&self.pool) + .await? + } + Some(ap) => { + sqlx::query( + "SELECT * FROM tasks + WHERE id = ( + SELECT id FROM tasks + WHERE status = 'pending' + AND group_key = ? + AND (run_after IS NULL OR run_after <= ?) + ORDER BY + MAX( + priority - MAX(0, (? - created_at - pause_duration_ms - ?) / ?), + ? 
+ ) ASC, + id ASC + LIMIT 1 + )", + ) + .bind(group_key) + .bind(now_ms) + .bind(ap.now_ms) + .bind(ap.grace_period_ms) + .bind(ap.aging_interval_ms) + .bind(ap.max_effective_priority) + .fetch_optional(&self.pool) + .await? + } + }; + + Ok(row.as_ref().map(row_to_task_record)) + } + + /// Peek the highest effective-priority pending task with no group. + pub async fn peek_next_ungrouped( + &self, + aging: Option<&AgingParams>, + ) -> Result, StoreError> { + let now_ms = chrono::Utc::now().timestamp_millis(); + let row = match aging { + None => { + sqlx::query( + "SELECT * FROM tasks + WHERE id = ( + SELECT id FROM tasks + WHERE status = 'pending' + AND group_key IS NULL + AND (run_after IS NULL OR run_after <= ?) + ORDER BY priority ASC, id ASC + LIMIT 1 + )", + ) + .bind(now_ms) + .fetch_optional(&self.pool) + .await? + } + Some(ap) => { + sqlx::query( + "SELECT * FROM tasks + WHERE id = ( + SELECT id FROM tasks + WHERE status = 'pending' + AND group_key IS NULL + AND (run_after IS NULL OR run_after <= ?) + ORDER BY + MAX( + priority - MAX(0, (? - created_at - pause_duration_ms - ?) / ?), + ? + ) ASC, + id ASC + LIMIT 1 + )", + ) + .bind(now_ms) + .bind(ap.now_ms) + .bind(ap.grace_period_ms) + .bind(ap.aging_interval_ms) + .bind(ap.max_effective_priority) + .fetch_optional(&self.pool) + .await? + } + }; + + Ok(row.as_ref().map(row_to_task_record)) + } + + /// Running task counts per group (including ungrouped as None). + pub async fn running_counts_per_group( + &self, + ) -> Result, usize)>, StoreError> { + let rows: Vec<(Option, i64)> = sqlx::query_as( + "SELECT group_key, COUNT(*) FROM tasks + WHERE status = 'running' + GROUP BY group_key", + ) + .fetch_all(&self.pool) + .await?; + + Ok(rows.into_iter().map(|(g, c)| (g, c as usize)).collect()) + } + + /// Pending task counts per group (including ungrouped as None). + /// Only counts tasks eligible for dispatch (not deferred by run_after). 
+ pub async fn pending_counts_per_group( + &self, + ) -> Result, usize)>, StoreError> { + let now_ms = chrono::Utc::now().timestamp_millis(); + let rows: Vec<(Option, i64)> = sqlx::query_as( + "SELECT group_key, COUNT(*) FROM tasks + WHERE status = 'pending' + AND (run_after IS NULL OR run_after <= ?) + GROUP BY group_key", + ) + .bind(now_ms) + .fetch_all(&self.pool) + .await?; + + Ok(rows.into_iter().map(|(g, c)| (g, c as usize)).collect()) + } + + /// Peek the next pending task whose effective priority (with aging) + /// is at or above the urgent threshold. + pub async fn peek_next_urgent( + &self, + threshold: crate::priority::Priority, + aging: Option<&AgingParams>, + ) -> Result, StoreError> { + // Urgent is meaningless without aging — if aging is None, no tasks + // can have an effective priority different from their stored one. + let Some(ap) = aging else { + return Ok(None); + }; + let now_ms = chrono::Utc::now().timestamp_millis(); + let row = sqlx::query( + "SELECT * FROM tasks + WHERE id = ( + SELECT id FROM tasks + WHERE status = 'pending' + AND (run_after IS NULL OR run_after <= ?) + AND MAX( + priority - MAX(0, (? - created_at - pause_duration_ms - ?) / ?), + ? + ) <= ? + ORDER BY + MAX(priority - MAX(0, (? - created_at - pause_duration_ms - ?) / ?), ?) ASC, + id ASC + LIMIT 1 + )", + ) + .bind(now_ms) + // WHERE clause aging params + .bind(ap.now_ms) + .bind(ap.grace_period_ms) + .bind(ap.aging_interval_ms) + .bind(ap.max_effective_priority) + .bind(threshold.value() as i64) + // ORDER BY aging params + .bind(ap.now_ms) + .bind(ap.grace_period_ms) + .bind(ap.aging_interval_ms) + .bind(ap.max_effective_priority) + .fetch_optional(&self.pool) + .await?; + + Ok(row.as_ref().map(row_to_task_record)) + } + // ── Recurring control ────────────────────────────────────────── /// Pause a recurring schedule. 
The current instance (if running) is diff --git a/tests/integration.rs b/tests/integration.rs index c8d791f..ee9898e 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -20,6 +20,8 @@ mod common; mod cross_module; #[path = "integration/dependencies.rs"] mod dependencies; +#[path = "integration/fair_scheduling.rs"] +mod fair_scheduling; #[path = "integration/group_pause.rs"] mod group_pause; #[path = "integration/memo.rs"] diff --git a/tests/integration/fair_scheduling.rs b/tests/integration/fair_scheduling.rs new file mode 100644 index 0000000..279e5ec --- /dev/null +++ b/tests/integration/fair_scheduling.rs @@ -0,0 +1,496 @@ +//! Integration tests: Weighted fair scheduling (plan 037, phase 2). + +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +use taskmill::{Domain, Scheduler, SchedulerEvent, TaskStore, TaskSubmission}; +use tokio_util::sync::CancellationToken; + +use super::common::*; + +// ═══════════════════════════════════════════════════════════════════ +// Weighted groups: proportional dispatch +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test] +async fn weighted_groups_proportional_dispatch() { + let count_a = Arc::new(AtomicUsize::new(0)); + let count_b = Arc::new(AtomicUsize::new(0)); + + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain( + Domain::::new() + .task::(CountingExecutor { + count: count_a.clone(), + }) + .task::(CountingExecutor { + count: count_b.clone(), + }), + ) + .max_concurrency(8) + .group_weight("heavy", 3) + .group_weight("light", 1) + .build() + .await + .unwrap(); + + // Submit 20 tasks in each group. 
+ for i in 0..20 { + sched + .submit( + &TaskSubmission::new("test::test") + .key(format!("heavy-{i}")) + .group("heavy"), + ) + .await + .unwrap(); + sched + .submit( + &TaskSubmission::new("test::slow") + .key(format!("light-{i}")) + .group("light"), + ) + .await + .unwrap(); + } + + let token = CancellationToken::new(); + let sched2 = sched.clone(); + let token2 = token.clone(); + let handle = tokio::spawn(async move { sched2.run(token2).await }); + + // Wait for all tasks to complete. + tokio::time::sleep(Duration::from_secs(2)).await; + token.cancel(); + handle.await.unwrap(); + + let a = count_a.load(Ordering::SeqCst); + let b = count_b.load(Ordering::SeqCst); + assert_eq!(a, 20, "all heavy tasks should complete"); + assert_eq!(b, 20, "all light tasks should complete"); +} + +// ═══════════════════════════════════════════════════════════════════ +// Min slots guaranteed under pressure +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test] +async fn min_slots_guaranteed_under_pressure() { + let count_light = Arc::new(AtomicUsize::new(0)); + + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain( + Domain::::new() + .task::(DelayExecutor(Duration::from_millis(50))) + .task::(CountingExecutor { + count: count_light.clone(), + }), + ) + .max_concurrency(6) + .group_weight("heavy", 10) + .group_weight("light", 1) + .group_minimum_slots("light", 2) + .build() + .await + .unwrap(); + + // Submit 20 heavy tasks (will fill most slots). + for i in 0..20 { + sched + .submit( + &TaskSubmission::new("test::test") + .key(format!("heavy-{i}")) + .group("heavy"), + ) + .await + .unwrap(); + } + + // Submit a few light tasks. 
+ for i in 0..5 { + sched + .submit( + &TaskSubmission::new("test::slow") + .key(format!("light-{i}")) + .group("light"), + ) + .await + .unwrap(); + } + + let token = CancellationToken::new(); + let sched2 = sched.clone(); + let token2 = token.clone(); + let handle = tokio::spawn(async move { sched2.run(token2).await }); + + tokio::time::sleep(Duration::from_secs(2)).await; + token.cancel(); + handle.await.unwrap(); + + // Light tasks should have completed — min_slots guaranteed at least 2. + let light = count_light.load(Ordering::SeqCst); + assert!( + light >= 2, + "at least 2 light tasks should complete; got {light}" + ); +} + +// ═══════════════════════════════════════════════════════════════════ +// Work-conserving redistribution +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test] +async fn work_conserving_redistribution() { + let count = Arc::new(AtomicUsize::new(0)); + + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain( + Domain::::new() + .task::(CountingExecutor { + count: count.clone(), + }) + .task::(NoopExecutor), + ) + .max_concurrency(10) + .group_weight("a", 1) + .group_weight("b", 1) + .build() + .await + .unwrap(); + + // Only submit tasks to group "a" — none in "b". + for i in 0..10 { + sched + .submit( + &TaskSubmission::new("test::test") + .key(format!("a-{i}")) + .group("a"), + ) + .await + .unwrap(); + } + + let token = CancellationToken::new(); + let sched2 = sched.clone(); + let token2 = token.clone(); + let handle = tokio::spawn(async move { sched2.run(token2).await }); + + tokio::time::sleep(Duration::from_secs(2)).await; + token.cancel(); + handle.await.unwrap(); + + // All 10 should have dispatched — idle group's slots should overflow. 
+ let dispatched = count.load(Ordering::SeqCst); + assert_eq!( + dispatched, 10, + "work-conserving: all tasks dispatched; got {dispatched}" + ); +} + +// ═══════════════════════════════════════════════════════════════════ +// Runtime weight change +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test] +async fn runtime_weight_change() { + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain(Domain::::new().task::(NoopExecutor)) + .max_concurrency(4) + .group_weight("x", 1) + .build() + .await + .unwrap(); + + let mut rx = sched.subscribe(); + + sched.set_group_weight("x", 5); + + // Should emit GroupWeightChanged event. + let deadline = tokio::time::Instant::now() + Duration::from_millis(200); + let evt = wait_for_event( + &mut rx, + deadline, + |e| matches!(e, SchedulerEvent::GroupWeightChanged { group, .. } if group == "x"), + ) + .await; + + assert!(evt.is_some(), "should emit GroupWeightChanged event"); + if let Some(SchedulerEvent::GroupWeightChanged { + previous_weight, + new_weight, + .. + }) = evt + { + assert_eq!(previous_weight, 1); + assert_eq!(new_weight, 5); + } +} + +// ═══════════════════════════════════════════════════════════════════ +// Reset group weights +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test] +async fn reset_group_weights_restores_default() { + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain(Domain::::new().task::(NoopExecutor)) + .max_concurrency(4) + .group_weight("x", 5) + .build() + .await + .unwrap(); + + sched.reset_group_weights(); + + // After reset, fair scheduling should be effectively off (no weights configured). 
+ let snapshot = sched.snapshot().await.unwrap(); + assert!( + snapshot.group_allocations.is_empty(), + "after reset, no allocations should be reported" + ); +} + +// ═══════════════════════════════════════════════════════════════════ +// Paused group releases allocation +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test] +async fn paused_group_releases_allocation() { + let count_b = Arc::new(AtomicUsize::new(0)); + + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain( + Domain::::new() + .task::(DelayExecutor(Duration::from_millis(50))) + .task::(CountingExecutor { + count: count_b.clone(), + }), + ) + .max_concurrency(6) + .group_weight("a", 1) + .group_weight("b", 1) + .build() + .await + .unwrap(); + + // Submit tasks in both groups. + for i in 0..10 { + sched + .submit( + &TaskSubmission::new("test::test") + .key(format!("a-{i}")) + .group("a"), + ) + .await + .unwrap(); + sched + .submit( + &TaskSubmission::new("test::slow") + .key(format!("b-{i}")) + .group("b"), + ) + .await + .unwrap(); + } + + // Pause group a — b should get all slots. 
+ sched.pause_group("a").await.unwrap(); + + let token = CancellationToken::new(); + let sched2 = sched.clone(); + let token2 = token.clone(); + let handle = tokio::spawn(async move { sched2.run(token2).await }); + + tokio::time::sleep(Duration::from_secs(2)).await; + token.cancel(); + handle.await.unwrap(); + + let b_completed = count_b.load(Ordering::SeqCst); + assert_eq!( + b_completed, 10, + "all b tasks should complete when a is paused" + ); +} + +// ═══════════════════════════════════════════════════════════════════ +// Ungrouped tasks get fair share +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test] +async fn ungrouped_tasks_get_fair_share() { + let count_grouped = Arc::new(AtomicUsize::new(0)); + let count_ungrouped = Arc::new(AtomicUsize::new(0)); + + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain( + Domain::::new() + .task::(CountingExecutor { + count: count_grouped.clone(), + }) + .task::(CountingExecutor { + count: count_ungrouped.clone(), + }), + ) + .max_concurrency(8) + .group_weight("grouped", 1) + // default_weight = 1, so ungrouped virtual group also gets 1 + .build() + .await + .unwrap(); + + // Submit grouped tasks. + for i in 0..10 { + sched + .submit( + &TaskSubmission::new("test::test") + .key(format!("g-{i}")) + .group("grouped"), + ) + .await + .unwrap(); + } + + // Submit ungrouped tasks. 
+ for i in 0..10 { + sched + .submit(&TaskSubmission::new("test::slow").key(format!("u-{i}"))) + .await + .unwrap(); + } + + let token = CancellationToken::new(); + let sched2 = sched.clone(); + let token2 = token.clone(); + let handle = tokio::spawn(async move { sched2.run(token2).await }); + + tokio::time::sleep(Duration::from_secs(2)).await; + token.cancel(); + handle.await.unwrap(); + + let g = count_grouped.load(Ordering::SeqCst); + let u = count_ungrouped.load(Ordering::SeqCst); + assert_eq!(g, 10, "all grouped tasks should complete"); + assert_eq!(u, 10, "all ungrouped tasks should complete"); +} + +// ═══════════════════════════════════════════════════════════════════ +// Snapshot shows allocations +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test] +async fn snapshot_shows_allocations() { + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain( + Domain::::new().task::(DelayExecutor(Duration::from_secs(10))), + ) + .max_concurrency(4) + .group_weight("alpha", 3) + .group_weight("beta", 1) + .build() + .await + .unwrap(); + + // Submit tasks so groups have pending demand. + for i in 0..5 { + sched + .submit( + &TaskSubmission::new("test::test") + .key(format!("a-{i}")) + .group("alpha"), + ) + .await + .unwrap(); + sched + .submit( + &TaskSubmission::new("test::test") + .key(format!("b-{i}")) + .group("beta"), + ) + .await + .unwrap(); + } + + let snapshot = sched.snapshot().await.unwrap(); + assert!( + !snapshot.group_allocations.is_empty(), + "snapshot should show group allocations" + ); + + // Both alpha and beta should appear. 
+ let names: Vec<_> = snapshot + .group_allocations + .iter() + .map(|a| &a.group) + .collect(); + assert!( + names.contains(&&"alpha".to_string()), + "alpha should be in allocations" + ); + assert!( + names.contains(&&"beta".to_string()), + "beta should be in allocations" + ); +} + +// ═══════════════════════════════════════════════════════════════════ +// Builder configures weights +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test] +async fn builder_configures_weights() { + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .domain(Domain::::new().task::(NoopExecutor)) + .max_concurrency(10) + .group_weight("x", 3) + .group_weight("y", 1) + .default_group_weight(2) + .group_minimum_slots("y", 2) + .build() + .await + .unwrap(); + + // Verify fair dispatch is being used by submitting and running. + for i in 0..4 { + sched + .submit( + &TaskSubmission::new("test::test") + .key(format!("x-{i}")) + .group("x"), + ) + .await + .unwrap(); + sched + .submit( + &TaskSubmission::new("test::test") + .key(format!("y-{i}")) + .group("y"), + ) + .await + .unwrap(); + } + + let token = CancellationToken::new(); + let sched2 = sched.clone(); + let token2 = token.clone(); + let handle = tokio::spawn(async move { sched2.run(token2).await }); + + tokio::time::sleep(Duration::from_secs(1)).await; + + // All tasks should have completed (fair scheduling didn't block anything). 
+ let pending = sched.store().pending_count().await.unwrap(); + + token.cancel(); + handle.await.unwrap(); + + assert_eq!(pending, 0, "all tasks should be dispatched"); +} From a905d09ade2a20d76d9c6a9826166dc3ed54cd25 Mon Sep 17 00:00:00 2001 From: DJ Majumdar Date: Tue, 24 Mar 2026 06:52:29 -0700 Subject: [PATCH 3/3] docs: document priority aging and weighted fair scheduling (#37) - Add priority aging and weighted fair scheduling sections to priorities-and-preemption.md - Add AgingConfig, group weight, and fair scheduling builder methods to configuration.md - Add aging.rs, fair.rs, rate_limit.rs to module map and update dispatch cycle in design.md - Add glossary entries: effective priority, priority aging, group weight, fair scheduling, urgent threshold - Document GroupWeightChanged event, updated TaskEventHeader fields, and snapshot fields in progress-and-events.md - Add pause_duration_ms and paused_at_ms columns to schema docs in persistence-and-recovery.md - Update module starvation guidance in multi-module-apps.md to recommend aging and group weights - Update snapshot field listing in query-apis.md - Add priority aging and fair scheduling to lib.rs crate-level docs and feature list - Fix broken TaskEventHeader rustdoc link in task/mod.rs --- docs/configuration.md | 60 +++++++++++++ docs/design.md | 17 +++- docs/glossary.md | 11 ++- docs/multi-module-apps.md | 32 ++++++- docs/persistence-and-recovery.md | 4 + docs/priorities-and-preemption.md | 139 +++++++++++++++++++++++++++++- docs/progress-and-events.md | 24 ++++-- docs/query-apis.md | 6 +- src/lib.rs | 18 ++++ src/task/mod.rs | 2 +- 10 files changed, 292 insertions(+), 21 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 0a03a9a..c8f7ed9 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -44,6 +44,7 @@ Controls scheduling behavior. Set via builder methods or pass directly to `Sched | `shutdown_mode` | `ShutdownMode` | `Hard` | `Hard` cancels immediately. 
`Graceful(Duration)` waits for running tasks. | Always use `Graceful` for desktop apps to avoid data loss. | | `default_ttl` | `Option<Duration>` | `None` | Global TTL applied to tasks without a per-task or per-type TTL. | Set to catch stale tasks (e.g., `Duration::from_secs(3600)` for 1 hour). | | `expiry_sweep_interval` | `Option<Duration>` | `Some(30s)` | How often the scheduler sweeps for expired tasks. `None` disables periodic sweeps (dispatch-time checks still apply). | Lower for latency-sensitive expiry; `None` if you only need dispatch-time checks. | +| `aging_config` | `Option<AgingConfig>` | `None` | Priority aging configuration. When enabled, pending tasks are gradually promoted in effective priority over time. | See [Priorities & Preemption — Priority aging](priorities-and-preemption.md#priority-aging). | ### Builder methods @@ -395,6 +396,61 @@ for rl in &snap.rate_limits { } ``` +## Priority aging + +Priority aging prevents starvation of low-priority work by gradually promoting tasks that have been waiting too long. See [Priorities & Preemption — Priority aging](priorities-and-preemption.md#priority-aging) for the full explanation. + +```rust +use taskmill::{AgingConfig, Priority, Scheduler}; +use std::time::Duration; + +Scheduler::builder() + .priority_aging(AgingConfig { + grace_period: Duration::from_secs(300), // 5 min before aging starts + aging_interval: Duration::from_secs(60), // promote 1 level per minute + max_effective_priority: Priority::HIGH, // can't age above HIGH + urgent_threshold: None, // disable urgent override + }) + // ... + .build() + .await?; +``` + +`AgingConfig::default()` uses the values shown above. Effective priority is computed at dispatch time — the stored priority column is never mutated. + +## Weighted fair scheduling + +When multiple task groups compete for dispatch slots, weighted fair scheduling allocates capacity proportionally. 
See [Priorities & Preemption — Weighted fair scheduling](priorities-and-preemption.md#weighted-fair-scheduling) for the full explanation. + +```rust +Scheduler::builder() + .group_weight("uploads", 3) + .group_weight("indexing", 1) + .group_minimum_slots("alerts", 2) + .default_group_weight(1) + // ... + .build() + .await?; +``` + +Adjust at runtime: + +```rust +scheduler.set_group_weight("uploads", 5); +scheduler.remove_group_weight("uploads"); +scheduler.reset_group_weights(); +scheduler.set_group_minimum_slots("alerts", 4); +``` + +Current allocations are visible in the scheduler snapshot: + +```rust +let snap = scheduler.snapshot().await?; +for alloc in &snap.group_allocations { + println!("{}: {} slots (weight {})", alloc.group, alloc.allocated_slots, alloc.weight); +} +``` + ## Tuning for specific workloads ### Desktop app with file processing @@ -484,6 +540,10 @@ Scheduler::builder() | `group_concurrency(group, n)` | Per-group concurrency limit override. | | `rate_limit(task_type, limit)` | Set a token-bucket rate limit for a task type. | | `group_rate_limit(group, limit)` | Set a token-bucket rate limit for a task group. | +| `priority_aging(config)` | Enable [priority aging](priorities-and-preemption.md#priority-aging) with the given `AgingConfig`. | +| `group_weight(group, weight)` | Set a relative scheduling weight for a task group. See [Weighted fair scheduling](priorities-and-preemption.md#weighted-fair-scheduling). | +| `default_group_weight(weight)` | Default weight for groups without a specific override. Default: 1. | +| `group_minimum_slots(group, slots)` | Minimum guaranteed dispatch slots for a group, regardless of weight. | | `app_state(state)` | Register global state visible to all domains. | | `app_state_arc(arc)` | Register global state from a pre-existing `Arc`. | | `build()` | Build and return the `Scheduler`. 
| diff --git a/docs/design.md b/docs/design.md index c6249ff..d8c117f 100644 --- a/docs/design.md +++ b/docs/design.md @@ -28,12 +28,15 @@ taskmill/src/ mod.rs — Scheduler, SchedulerBuilder, public API run_loop.rs — main event loop, dispatch cycle submit.rs — submit, submit_batch, cancellation - control.rs — pause/resume, concurrency limits + control.rs — pause/resume, concurrency limits, group weights queries.rs — snapshot, active tasks, progress gate.rs — DispatchGate, IO budget check dispatch.rs — ActiveTaskMap, spawn_task, preemption progress.rs — ProgressReporter, throughput extrapolation event.rs — SchedulerEvent, SchedulerSnapshot + aging.rs — AgingConfig, AgingParams, effective_priority() + fair.rs — GroupWeights, SlotAllocation, compute_allocation() + rate_limit.rs — RateLimit, token-bucket rate limiting resource/ mod.rs — ResourceSampler + ResourceReader traits sampler.rs — EWMA-smoothed background loop @@ -183,9 +186,15 @@ Each cycle, the loop: 1. Checks if the scheduler is globally paused. 2. Sweeps expired tasks (if the expiry sweep interval has elapsed). -3. Resumes paused tasks if no active preemptors remain. -4. While `active_count < max_concurrency`: peek the next candidate, check for TTL expiry, check the dispatch gate, pop-by-id if admitted, spawn the executor. -5. Sleep until the next signal. +3. Auto-resumes timed group pauses that have reached their deadline. +4. Resumes paused tasks if no active preemptors remain. +5. Dispatches pending tasks using one of three paths: + - **Fair dispatch** (when group weights are configured) — three-pass loop: fair per-group allocation, greedy fill, urgent threshold override. + - **Fast dispatch** (when no groups, no monitoring, no pressure, no rate limits, no weights) — batch `pop_next` in priority order with no gate checks. + - **Slow dispatch** (otherwise) — peek the next candidate, check for TTL expiry, check the dispatch gate, pop-by-id if admitted, spawn the executor. +6. Sleep until the next signal. 
+ +When [priority aging](priorities-and-preemption.md#priority-aging) is enabled, `AgingParams` are computed once per dispatch cycle and passed to peek/pop queries. The SQL ORDER BY uses a computed expression for effective priority instead of the stored priority column. ## Retry flow diff --git a/docs/glossary.md b/docs/glossary.md index 5165597..35a2c8d 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -14,15 +14,19 @@ Quick reference for terms used throughout the taskmill documentation. | **Dependency edge** | A record in the `task_deps` junction table representing that one task depends on another. An edge `(A, B)` means "task A cannot start until task B completes." Edges are removed when the dependency completes or fails, and are cleaned up on startup if they reference tasks that no longer exist. | | **DependencyFailurePolicy** | Controls what happens to a dependent task when one of its dependencies fails permanently. `Cancel` (default) moves the dependent to history as `DependencyFailed` and cascades to other dependents. `Fail` does the same without cascading. `Ignore` unblocks the dependent anyway. See [Configuration](configuration.md#dependency-failure-policy). | | **Deduplication (dedup)** | Preventing the same task from being queued twice. Taskmill generates a SHA-256 key from the task type and payload; a second submission with the same key is silently ignored. See [Persistence & Recovery](persistence-and-recovery.md#deduplication). | -| **Dispatch** | Moving a task from "waiting in line" (pending) to "actively running." The scheduler dispatches tasks in priority order, subject to concurrency limits and backpressure. | +| **Dispatch** | Moving a task from "waiting in line" (pending) to "actively running." The scheduler dispatches tasks in priority order (or effective priority order when aging is enabled), subject to concurrency limits, group weights, and backpressure. 
| | **EWMA** | Exponentially Weighted Moving Average — a smoothing technique that gives recent measurements more weight than old ones. Taskmill uses EWMA to smooth resource readings so the scheduler doesn't overreact to momentary spikes. See [IO & Backpressure](io-and-backpressure.md#ewma-smoothing). | | **TypedExecutor** | Your code that performs the actual work for a task type. Implements `TypedExecutor<T>` and receives a deserialized payload: `async fn execute(&self, payload: T, ctx: DomainTaskContext<'_, T::Domain>)`. Register with `Domain::task::<T>(executor)`. See [Quick Start](quick-start.md#implement-an-executor). | | **IO budget** | An estimate of how many bytes a task will read and write (disk and/or network), submitted alongside the task. The scheduler uses IO budgets to avoid overwhelming the disk. See [IO & Backpressure](io-and-backpressure.md#io-budgets-telling-the-scheduler-what-to-expect). | | **Pile-up prevention** | The mechanism that skips a recurring task instance when the previous instance hasn't been dispatched yet, preventing unbounded queue growth under sustained load. See [Quick Start](quick-start.md#recurring-tasks). | +| **Priority aging** | An anti-starvation mechanism that gradually promotes pending tasks in effective priority over time. Configured via `AgingConfig` on `SchedulerBuilder::priority_aging()`. The stored priority is never mutated — effective priority is computed at dispatch time. See [Priorities & Preemption](priorities-and-preemption.md#priority-aging). | | **Preemption** | Pausing lower-priority work so higher-priority work can run immediately. Preempted tasks resume automatically once the urgent work finishes. See [Priorities & Preemption](priorities-and-preemption.md#preemption). | | **Pressure source** | Anything that signals the system is busy — disk IO, network throughput, memory usage, API rate limits, battery level. Returns a value from 0.0 (idle) to 1.0 (saturated). 
See [IO & Backpressure](io-and-backpressure.md#pressure-sources). | | **Rate limit** | A token-bucket cap on how many tasks start per unit of time, independent of concurrency. Scoped by task type and/or group. Configured via `RateLimit::per_second(n)` / `per_minute(n)` with optional `.with_burst(b)`. See [Configuration — Rate limiting](configuration.md#rate-limiting). | -| **Task group** | A named set of tasks that share a concurrency limit and/or rate limit. For example, you might limit uploads to a specific S3 bucket to 3 concurrent and 100/sec. See [Priorities & Preemption](priorities-and-preemption.md#task-groups). | +| **Task group** | A named set of tasks that share a concurrency limit, rate limit, and/or scheduling weight. For example, you might limit uploads to a specific S3 bucket to 3 concurrent and 100/sec, or give it 3x the scheduling weight of other groups. See [Priorities & Preemption](priorities-and-preemption.md#task-groups). | +| **Group weight** | A relative scheduling weight assigned to a task group for [weighted fair scheduling](priorities-and-preemption.md#weighted-fair-scheduling). Weights are relative — `(A:3, B:1)` gives A 75% and B 25% of capacity. Groups without an explicit weight use `default_group_weight` (default: 1). Configurable at build time and runtime. | +| **Fair scheduling** | A dispatch strategy that allocates slots proportionally to group weights using a three-pass loop: fair per-group allocation, greedy fill, and urgent threshold override. Enabled when any group weights are configured. See [Priorities & Preemption](priorities-and-preemption.md#weighted-fair-scheduling). | +| **Urgent threshold** | A priority level in `AgingConfig` at which aged tasks may bypass group weight allocation during fair dispatch. Tasks that age past this threshold are dispatched in the urgent pass regardless of their group's weight allocation, but still respect `max_concurrency`. 
| | **task_deps** | The SQLite junction table that stores dependency edges between tasks. Each row `(task_id, depends_on_id)` means the task cannot start until the dependency completes. Edges survive restarts and are cleaned up automatically when dependencies resolve or on startup. See [Persistence & Recovery](persistence-and-recovery.md#dependency-recovery). | | **Throttle policy** | Rules that map system pressure to dispatch decisions. The default policy defers background tasks when pressure exceeds 50% and normal tasks when it exceeds 75%, but never blocks high-priority work. See [Priorities & Preemption](priorities-and-preemption.md#throttle-behavior). | | **TTL (time-to-live)** | A duration after which a task automatically expires if it hasn't started running. Configurable per-task, per-type, or as a global default. See [Configuration](configuration.md#task-ttl-time-to-live). | @@ -36,5 +40,6 @@ Quick reference for terms used throughout the taskmill documentation. | **TypedEventStream** | A per-task-type event subscription (`TypedEventStream`) created via `handle.task_events::()`. Filters the global scheduler event broadcast to only events matching `T::TASK_TYPE` within the domain. Terminal events include the `TaskHistoryRecord`. | | **Qualified task type** | The full database-stored task type including the domain prefix, e.g. `"media::thumbnail"`. Required when using store-level query APIs (`history_stats`, `task_lookup`, `avg_throughput`). `DomainHandle` methods apply the prefix automatically, so you typically only need the short form when submitting tasks. | | **Cross-domain dependency** | A dependency edge where the dependent task and its prerequisite belong to different domains. Functionally identical to same-domain dependencies — the domain boundary does not affect dependency resolution or failure propagation. See [Multi-Module Applications](multi-module-apps.md#cross-module-task-dependencies). 
| -| **Domain starvation** | A condition where one domain's tasks are never dispatched because higher-priority tasks from other domains continuously consume available concurrency slots. Priority ordering is global across all domains. Mitigated by assigning appropriate priorities and using group concurrency to reserve capacity. See [Multi-Module Applications](multi-module-apps.md#module-starvation-understanding-priority-competition). | +| **Domain starvation** | A condition where one domain's tasks are never dispatched because higher-priority tasks from other domains continuously consume available concurrency slots. Priority ordering is global across all domains. Mitigated by [priority aging](priorities-and-preemption.md#priority-aging), [group weights](priorities-and-preemption.md#weighted-fair-scheduling), assigning appropriate priorities, and using group concurrency to reserve capacity. See [Multi-Module Applications](multi-module-apps.md#module-starvation-understanding-priority-competition). | +| **Effective priority** | The dispatch-time priority of a task when [priority aging](priorities-and-preemption.md#priority-aging) is enabled. Computed as `base_priority - promotions` (clamped to `max_effective_priority`). Equals the stored (base) priority when aging is disabled or the task hasn't waited past the grace period. Visible in `TaskEventHeader` and snapshots. | | **Late-binding state** | Application state injected into the scheduler after `build()` via `scheduler.register_state()`. Useful for library crates that receive a pre-built `Scheduler` as a dependency. Must be called before `scheduler.run()` — calling it after tasks are dispatching has no ordering guarantees with in-flight executors. 
| diff --git a/docs/multi-module-apps.md b/docs/multi-module-apps.md index 84654c0..776ebdd 100644 --- a/docs/multi-module-apps.md +++ b/docs/multi-module-apps.md @@ -196,9 +196,35 @@ A task is dispatched when **all** of these pass: A domain with only `BACKGROUND`-priority tasks can be indefinitely deferred when other domains continuously submit `NORMAL` work. This is by design — priority ordering is global across all domains. -If you need guaranteed throughput for a domain: -- **Raise the priority** of its most important tasks to `NORMAL` or `HIGH`. -- **Use task groups** with a dedicated concurrency reservation. A group limit acts as a soft floor: tasks in the group bypass the global priority queue as long as the group has available slots. +Taskmill provides several tools to address this: + +- **[Priority aging](priorities-and-preemption.md#priority-aging)** — automatically promotes tasks that have been waiting too long, ensuring even `IDLE` tasks eventually get dispatched. +- **[Weighted fair scheduling](priorities-and-preemption.md#weighted-fair-scheduling)** — allocates dispatch slots proportionally to group weights, guaranteeing each group a fair share of capacity regardless of priority levels. +- **Raise the priority** of the domain's most important tasks to `NORMAL` or `HIGH`. +- **Use task groups** with a dedicated concurrency reservation or minimum slot guarantee. 
+ +### Using group weights for guaranteed throughput + +The most effective solution is weighted fair scheduling with minimum slot guarantees: + +```rust +pub struct Sync; +impl DomainKey for Sync { const NAME: &'static str = "sync"; } + +let scheduler = Scheduler::builder() + .domain( + Domain::<Sync>::new() + .task::<SyncTask>(SyncExecutor) + .default_group("sync-reserved") + .default_priority(Priority::BACKGROUND) + ) + .group_weight("sync-reserved", 1) // participate in fair allocation + .group_minimum_slots("sync-reserved", 2) // guaranteed at least 2 slots + .priority_aging(AgingConfig::default()) // prevent indefinite starvation + .max_concurrency(16) + .build() + .await?; +``` ### Using group concurrency as a soft floor diff --git a/docs/persistence-and-recovery.md b/docs/persistence-and-recovery.md index 9dedcc7..22ffaa5 100644 --- a/docs/persistence-and-recovery.md +++ b/docs/persistence-and-recovery.md @@ -13,6 +13,8 @@ When your app starts up, taskmill automatically recovers: - **Dedup keys stay occupied** — no duplicate submissions sneak in during recovery. - **Retry counts are preserved** — a task that had retried twice before the crash still has two retries used. +When [priority aging](priorities-and-preemption.md#priority-aging) is enabled, crash recovery also accumulates stale `pause_duration_ms` for tasks that were paused at crash time. This ensures the aging clock is approximately correct after recovery (slightly over-promoting, which is acceptable for anti-starvation). + The guarantee is **at-least-once execution**: a task might run partially, crash, and re-run from the beginning. Design your executors to be idempotent (or to check for partial work) so re-execution is safe.
## Scheduled task recovery @@ -195,6 +197,8 @@ You normally don't need to know the schema, but it's documented here for debuggi | `ttl_seconds` | INTEGER | TTL duration in seconds (NULL = no TTL) | | `ttl_from` | TEXT DEFAULT 'submission' | When TTL clock starts: `submission` or `first_attempt` | | `expires_at` | TEXT | ISO 8601 deadline (NULL = no expiry) | +| `pause_duration_ms` | INTEGER DEFAULT 0 | Accumulated milliseconds spent in `paused` state. Subtracted from the task's age in the [priority aging](priorities-and-preemption.md#priority-aging) formula so the aging clock stays frozen while the task is paused. | +| `paused_at_ms` | INTEGER | Epoch-ms timestamp of the most recent pause transition. `NULL` when the task is not paused. On resume, the elapsed pause time is added to `pause_duration_ms` and this column is reset to `NULL`. | **Indexes:** - `idx_tasks_pending(status, priority ASC, id ASC) WHERE status = 'pending'` — fast priority-ordered dispatch. diff --git a/docs/priorities-and-preemption.md b/docs/priorities-and-preemption.md index 9a2e50c..cb882ef 100644 --- a/docs/priorities-and-preemption.md +++ b/docs/priorities-and-preemption.md @@ -34,7 +34,9 @@ Most applications only need `HIGH`, `NORMAL`, and `BACKGROUND`. Reserve `REALTIM ## Queue ordering -Tasks are dispatched in strict priority order. Within the same priority tier, tasks are dispatched in insertion order (FIFO) — the task submitted first runs first. +Tasks are dispatched in priority order. Within the same priority tier, tasks are dispatched in insertion order (FIFO) — the task submitted first runs first. + +When [priority aging](#priority-aging) is enabled, the scheduler uses *effective priority* (base priority adjusted for wait time) instead of stored priority. When [group weights](#weighted-fair-scheduling) are configured, the scheduler uses a three-pass dispatch loop that allocates slots proportionally to group weights before falling back to global priority order.
## Preemption @@ -237,3 +239,138 @@ return Err(TaskError::permanent("invalid payload, giving up")); - **Priority preserved** — retried tasks keep their original priority; they aren't demoted. - **Dedup key preserved** — the key stays occupied during retries, preventing duplicate submissions while the task is still being worked on. - **Crash doesn't count** — if the process crashes while a task is running, the crash recovery doesn't increment `retry_count`. + +## Priority aging + +When high-priority tasks arrive continuously, low-priority work can be starved indefinitely. Priority aging prevents this by gradually promoting tasks that have been waiting too long. + +### How it works + +At dispatch time, the scheduler computes an *effective priority* for each pending task: + +```text +age = now - created_at - pause_duration +promotions = max(0, (age - grace_period) / aging_interval) +effective = max(base_priority - promotions, max_effective_priority) +``` + +Lower numeric value = higher priority. Aging *decreases* the numeric value (promotes the task). The stored priority is never mutated — effective priority is a pure dispatch-time computation. + +### Configuring aging + +```rust +use taskmill::{AgingConfig, Priority, Scheduler}; +use std::time::Duration; + +let scheduler = Scheduler::builder() + .priority_aging(AgingConfig { + grace_period: Duration::from_secs(300), // 5 min before aging starts + aging_interval: Duration::from_secs(60), // promote 1 level per minute + max_effective_priority: Priority::HIGH, // can't age above HIGH + urgent_threshold: None, // see fair scheduling + }) + // ... + .build() + .await?; +``` + +| Field | Default | Description | +|-------|---------|-------------| +| `grace_period` | 5 minutes | How long a task must wait before aging begins. | +| `aging_interval` | 60 seconds | Time between each one-step priority promotion. | +| `max_effective_priority` | `HIGH` (64) | Priority ceiling — tasks cannot age above this. 
Use `REALTIME` to allow aging to the absolute highest level. | +| `urgent_threshold` | `None` | When effective priority reaches this level, the task may bypass group weight allocation (see [weighted fair scheduling](#weighted-fair-scheduling)). Must be numerically `>=` `max_effective_priority` (that is, the same or a lower priority level, since lower numeric values mean higher priority). | + +### Aging interactions + +- **Paused tasks** — the aging clock is frozen while a task is paused. Accumulated pause time is excluded from the age calculation. +- **Retry** — when a task is requeued for retry, the aging clock continues from the original `created_at`. The task has been waiting even longer. +- **Recurring tasks** — each new recurring instance gets a fresh `created_at`, so the aging clock starts at zero. +- **Child tasks** — children inherit the *higher* of the parent's current effective priority and the child's own configured priority. This promotes children of aged parents without demoting children whose task type is inherently higher-priority. +- **Supersede / resubmit** — the new task gets a fresh `created_at`, so the aging clock resets. +- **Crash recovery** — if the scheduler crashes while tasks are paused, the stale pause time is accumulated on recovery so the aging clock stays approximately correct (it may run slightly fast for the crash window, which is acceptable for anti-starvation). + +### Observability + +Effective priority is visible in events and snapshots: + +- `TaskEventHeader` includes both `base_priority` and `effective_priority`. Compare them to detect aging: `effective_priority < base_priority` means the task has been promoted. +- `SchedulerSnapshot` includes `aging_config` when aging is enabled. + +### Opt-in, zero cost when off + +Without `priority_aging()`, dispatch queries use the original `ORDER BY priority ASC, id ASC` — fully index-ordered with zero overhead. + +## Weighted fair scheduling + +Group concurrency limits are *caps* (max N slots for group X), not *floors*.
Without weighted allocation, a group with a large pending queue can fill all available global slots (up to its cap), starving other groups with legitimate work. + +Weighted fair scheduling allocates dispatch slots proportionally to group weights, ensuring each group gets a fair share of capacity. + +### How it works + +When group weights are configured, the scheduler uses a three-pass dispatch loop: + +1. **Fair pass** — each group (including ungrouped tasks as a virtual group) receives slots proportional to its weight. Minimum slot guarantees (`min_slots`) are honored first. +2. **Greedy pass** — any slots left unfilled by the fair pass (under-demand groups, rate-limited tasks) are filled by global priority order. This makes the scheduler work-conserving. +3. **Urgent pass** — tasks aged past `urgent_threshold` (if configured) bypass group weights but still respect `max_concurrency`. This is a safety valve for severely starved tasks. + +### Configuring group weights + +```rust +use taskmill::Scheduler; + +let scheduler = Scheduler::builder() + .group_weight("api-calls", 3) // 3x the weight of default + .group_weight("background", 1) // 1x (default weight) + .default_group_weight(1) // weight for groups without an override + .group_minimum_slots("critical", 2) // always at least 2 slots for "critical" + .group_concurrency("api-calls", 8) // cap at 8 concurrent (still enforced) + .max_concurrency(16) + // ... + .build() + .await?; +``` + +Weights are relative — `(A:3, B:1)` gives A 75% and B 25% of capacity. Ungrouped tasks participate as a virtual group with the default weight, ensuring they compete fairly rather than only receiving leftovers. 
+ +### Runtime adjustment + +```rust +// Update weight at runtime +scheduler.set_group_weight("api-calls", 5); + +// Remove override, falling back to default weight +scheduler.remove_group_weight("api-calls"); + +// Reset all weights +scheduler.reset_group_weights(); + +// Set minimum guaranteed slots +scheduler.set_group_minimum_slots("critical", 3); +``` + +### Interactions with other features + +- **Group concurrency caps** — caps are applied during allocation and also enforced by the dispatch gate as a safety net. The allocation respects caps; excess is redistributed to other groups. +- **Group pause** — paused groups are excluded from the allocation. Their capacity is released to other groups. On resume, they re-enter the allocation on the next dispatch cycle. +- **Rate limits** — rate limit checks still run during fair dispatch. A rate-limited task leaves its group's slot unfilled, which the greedy pass fills from other groups. +- **Preemption** — preemption operates independently of the allocation. A preempting task may temporarily exceed its group's allocation; the allocation rebalances on the next cycle. +- **Priority aging** — aging and fair scheduling compose. An aged task in a low-weight group dispatches in priority order within its group's allocation. Tasks aged past `urgent_threshold` bypass group weights entirely. + +### Observability + +`SchedulerSnapshot` includes `group_allocations` — a `Vec` with per-group detail: + +```rust +let snap = scheduler.snapshot().await?; +for alloc in &snap.group_allocations { + println!( + "{}: weight={}, slots={}, running={}, pending={}, min={:?}, cap={:?}", + alloc.group, alloc.weight, alloc.allocated_slots, + alloc.running, alloc.pending, alloc.min_slots, alloc.cap, + ); +} +``` + +A `GroupWeightChanged` event is emitted when `set_group_weight()` is called. 
diff --git a/docs/progress-and-events.md b/docs/progress-and-events.md index fd35ea6..921142d 100644 --- a/docs/progress-and-events.md +++ b/docs/progress-and-events.md @@ -114,10 +114,13 @@ tokio::spawn(async move { | `TaskUnblocked { task_id }` | A blocked task's dependencies are all satisfied — it transitions to `pending` | | `DeadLettered { header, error, retry_count }` | Task exhausted all retries — can be re-submitted via `retry_dead_letter()` | | `DependencyFailed { task_id, failed_dependency }` | A blocked task was cancelled because a dependency failed permanently | +| `GroupPaused { group, pending_count, running_count }` | A task group was paused | +| `GroupResumed { group, resumed_count }` | A task group was resumed | +| `GroupWeightChanged { group, previous_weight, new_weight }` | A group's scheduling weight was changed at runtime via `set_group_weight()` | | `Paused` | Scheduler globally paused via `pause_all()` | | `Resumed` | Scheduler resumed via `resume_all()` | -Task-specific events share a `TaskEventHeader` with `task_id`, `task_type`, `key`, and `label`. Use `event.header()` to access it generically. +Task-specific events share a `TaskEventHeader` with `task_id`, `task_type`, `key`, `label`, `tags`, `base_priority`, and `effective_priority`. Use `event.header()` to access it generically. When [priority aging](priorities-and-preemption.md#priority-aging) is enabled, `effective_priority < base_priority` indicates the task has been promoted by aging. 
### Which events to listen for @@ -131,6 +134,7 @@ Task-specific events share a `TaskEventHeader` with `task_id`, `task_type`, `key | Stale task cleanup UI | `TaskExpired` | | Recurring schedule monitoring | `RecurringSkipped`, `RecurringCompleted` | | Dependency chain tracking | `TaskUnblocked`, `DependencyFailed` | +| Group pause & weight monitoring | `GroupWeightChanged`, `GroupPaused`, `GroupResumed` | ## Querying progress @@ -152,14 +156,18 @@ For UI dashboards, `scheduler.snapshot()` gathers all scheduler state in a singl ```rust let snap = scheduler.snapshot().await?; -// snap.running — Vec of currently executing tasks -// snap.pending_count — number of tasks waiting to dispatch -// snap.paused_count — number of preempted tasks -// snap.progress — Vec for every running task -// snap.pressure — aggregate backpressure (0.0–1.0) +// snap.running — Vec of currently executing tasks +// snap.pending_count — number of tasks waiting to dispatch +// snap.paused_count — number of preempted tasks +// snap.progress — Vec for every running task +// snap.pressure — aggregate backpressure (0.0–1.0) // snap.pressure_breakdown — per-source diagnostics: Vec<(String, f32)> -// snap.max_concurrency — current concurrency limit -// snap.is_paused — whether the scheduler is globally paused +// snap.max_concurrency — current concurrency limit +// snap.is_paused — whether the scheduler is globally paused +// snap.aging_config — Option<AgingConfig> (if priority aging is enabled) +// snap.group_allocations — Vec (per-group slot allocations) +// snap.paused_groups — Vec<PausedGroupInfo> (groups currently paused) +// snap.rate_limits — Vec (configured rate limits) ``` ## Tauri event bridging diff --git a/docs/query-apis.md b/docs/query-apis.md index a986c80..60a0552 100644 --- a/docs/query-apis.md +++ b/docs/query-apis.md @@ -132,11 +132,15 @@ The `history_by_type(task_type)` parameter requires the qualified name including | `next_run_after()` | `Option>` | Earliest `run_after` timestamp among pending delayed tasks.
Useful for knowing when the next scheduled task will fire. | | `recurring_schedules()` | `Vec` | All active recurring schedules with their interval, remaining occurrences, and paused state. | -The `SchedulerSnapshot` also includes recurring schedule information: +The `SchedulerSnapshot` also includes recurring schedule information, priority aging config, and group allocation data: ```rust let snap = scheduler.snapshot().await?; // snap.recurring_schedules — Vec for all active schedules +// snap.aging_config — Option (when priority aging is enabled) +// snap.group_allocations — Vec (per-group slot allocations when fair scheduling is configured) +// snap.paused_groups — Vec (currently paused groups) +// snap.rate_limits — Vec (configured rate limits with current utilization) ``` ### Managing recurring schedules diff --git a/src/lib.rs b/src/lib.rs index d0e264f..7f86409 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,10 @@ //! - Supports [graceful shutdown](ShutdownMode) with configurable drain timeout //! - Supports [token-bucket rate limiting](RateLimit) per task type and/or group to cap start rate //! independently of concurrency +//! - Supports [priority aging](AgingConfig) to prevent starvation of low-priority work — effective +//! priority is computed at dispatch time with zero write amplification +//! - Supports weighted fair scheduling with per-group slot allocation, minimum slot +//! guarantees, and urgent threshold override for severely starved tasks //! //! # Concepts //! @@ -121,6 +125,20 @@ //! [`SchedulerBuilder::default_group_concurrency`], or adjust at runtime via //! [`Scheduler::set_group_limit`] and [`Scheduler::set_default_group_concurrency`]. //! +//! ## Priority aging & fair scheduling +//! +//! [`AgingConfig`] enables anti-starvation priority aging: tasks waiting longer +//! than a grace period are gradually promoted in effective priority at dispatch +//! time. The stored priority is never mutated. Configure via +//! 
[`SchedulerBuilder::priority_aging`]. +//! +//! Weighted fair scheduling allocates dispatch slots proportionally to group +//! weights using a three-pass loop (fair, greedy, urgent). Configure at +//! build time with +//! [`SchedulerBuilder::group_weight`] and +//! [`SchedulerBuilder::group_minimum_slots`], or adjust at runtime via +//! [`Scheduler::set_group_weight`] and [`Scheduler::set_group_minimum_slots`]. +//! //! ## Child tasks & two-phase execution //! //! An executor can spawn child tasks via [`DomainTaskContext::spawn_child_with`]. When diff --git a/src/task/mod.rs b/src/task/mod.rs index 6f3c83b..20cb866 100644 --- a/src/task/mod.rs +++ b/src/task/mod.rs @@ -335,7 +335,7 @@ impl TaskRecord { self.event_header_with_aging(None) } - /// Build a [`TaskEventHeader`] with aging-aware effective priority. + /// Build a [`TaskEventHeader`](crate::scheduler::event::TaskEventHeader) with aging-aware effective priority. pub fn event_header_with_aging( &self, aging_config: Option<&crate::scheduler::aging::AgingConfig>,