From 1db7b25f0c0910d0031132c18f11e7120b988a25 Mon Sep 17 00:00:00 2001 From: Marcos Date: Wed, 6 May 2026 13:14:48 -0300 Subject: [PATCH] feat(scheduler): add HOST_LINK_BW constant + 3-way bandwidth model (closes #21) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `HOST_LINK_BW_BYTES_PER_SEC = 100_000_000` (100 MB/s) to the bandwidth model, capturing the rev-A GbE host link as the third tier of the bandwidth hierarchy: Local DDR (per card) ~2.0 GB/s LOCAL_DDR_BW Inter-card (per direction) ~500 MB/s INTERCARD_BW Host link (GbE) ~100 MB/s HOST_LINK_BW (NEW) Source-of-truth: Stays `docs/upstream-contributions/2026-05-06-liteeth-ecp5-sgmii.md` (Stays PR #34, merged 2026-05-06). Community measurements on Versa-ECP5 and ECPIX-5 land at 800-940 Mbps UDP iperf3, i.e. 80-94 % of GbE line rate. The 100 MB/s number is the realistic post-IP/UDP/Ethernet-header steady-state ceiling. The host link is 5x slower than inter-card and 20x slower than local DDR — it is the dominant cost when collective ops must reach the host (model load, gradient checkpoint to host RAM, dataset streaming, prompt-embedding upload). ## Scope Minimal — per the issue spec's "if pick_strategy already handles this" branch: - `pick_strategy` is the per-token TP/MP decision and most decode tokens stay on-card; host-link cost is small per-token and only matters at session boundaries. - No callers exist today for a session-level cost-budget API, so introducing `bytes_per_second_per_token_estimate` would be speculative generality (YAGNI). Defer until the runtime needs it. - This PR keeps the public surface to a constant + module-level doctest update + tests. ## Tests 3 new unit tests in `bandwidth.rs`: - `host_link_bw_constant_matches_recon_doc` — pins value to 100_000_000 (guards against silent "round up to 125 MB/s line rate" drift). - `host_link_bw_is_slowest_hop` — pins the three-tier ordering HOST_LINK < INTERCARD < LOCAL_DDR. 
- `host_link_bw_is_inside_observed_range` — pins 80-125 MB/s envelope (community recon range, with line-rate ceiling). Plus the existing `constants_are_positive` test extended to cover the new constant. Module-level doctest in `bandwidth.rs` updated to demonstrate all three constants. Crate-root doctest in `lib.rs` updated to assert the three-tier ordering. ## Cargo gates - `cargo build -p spanker-scheduler`: green - `cargo test -p spanker-scheduler`: 27 unit + 9 integration + 6 doctests, all green (delta: +3 unit tests vs PR #19 baseline) - `cargo clippy -p spanker-scheduler --all-targets -- -D warnings`: green - `cargo fmt -p spanker-scheduler -- --check`: clean Refs: - popsolutions/Spanker#21 (this issue) - popsolutions/Stays#34 (LiteEth ECP5 SGMII recon, source-of-truth) - popsolutions/Spanker#17 (PR that landed initial 2-tier model) - popsolutions/Spanker#19 (PR that landed pick_strategy) Authored by Agent 3 (Software Stack — Spanker). Signed-off-by: Marcos --- src/scheduler/src/bandwidth.rs | 138 ++++++++++++++++++++++++++++++++- src/scheduler/src/lib.rs | 20 +++-- 2 files changed, 152 insertions(+), 6 deletions(-) diff --git a/src/scheduler/src/bandwidth.rs b/src/scheduler/src/bandwidth.rs index 4fc233a..7e52098 100644 --- a/src/scheduler/src/bandwidth.rs +++ b/src/scheduler/src/bandwidth.rs @@ -15,6 +15,13 @@ //! - [`INTERCARD_BW_BYTES_PER_SEC`]: InnerJib7EA pinout §9 — 4 lanes //! × ~1.25 Gbps × 8b/10b = ~4 Gbps effective ≈ 500 MB/s per //! direction. +//! - [`HOST_LINK_BW_BYTES_PER_SEC`]: Stays +//! `docs/upstream-contributions/2026-05-06-liteeth-ecp5-sgmii.md` — +//! community bring-up reports on Versa-ECP5 and ECPIX-5 land at +//! 800–940 Mbps measured (UDP iperf), i.e. 80–94 % of GbE line +//! rate. Pinning the model to 100 MB/s after IP/UDP/Ethernet +//! header overhead gives the scheduler an honest cost estimate +//! for collective ops that round-trip through the host. //! //! ## Why this module exists //! @@ -25,6 +32,14 @@ //! 
"DDR3-1600 = 12.8 GB/s" claim) would over-shard tensor-parallel //! workloads that are actually inter-card-fetch bound on rev-A. //! +//! The third constant ([`HOST_LINK_BW_BYTES_PER_SEC`]) captures the +//! GbE host link — the slowest hop in rev-A's three-tier hierarchy. +//! It is not consumed by [`crate::pick_strategy`] today (TP/MP is the +//! per-token decision and most decode tokens stay on-card), but it +//! is the load-bearing number for any future cost-budget logic that +//! reasons about model-load, gradient-checkpoint-to-host, or +//! activation streaming round-trips through the host. +//! //! ## When to bump these constants //! //! - **`LOCAL_DDR_BW_BYTES_PER_SEC`:** when rev-B targets a faster @@ -34,11 +49,43 @@ //! - **`INTERCARD_BW_BYTES_PER_SEC`:** when ADR-014 (line coding) //! chooses 64b/66b instead of 8b/10b, this becomes ~600 MB/s. Bump //! in the same PR that lands ADR-014. +//! - **`HOST_LINK_BW_BYTES_PER_SEC`:** when rev-B moves from GbE to +//! 10 GbE or PCIe Gen2 host link, bump in lockstep with the +//! platform ADR (factor-of-10 step from 100 MB/s → 1 GB/s for +//! 10 GbE; ~500 MB/s for PCIe Gen2 x1). +//! +//! ## Bandwidth hierarchy this captures (rev-A) +//! +//! ```text +//! Local DDR (per card) ~16 Gbps ≈ 2.0 GB/s LOCAL_DDR_BW +//! Inter-card (per direction) ~4 Gbps ≈ 500 MB/s INTERCARD_BW +//! Host link (GbE) ~1 Gbps ≈ 100 MB/s HOST_LINK_BW +//! ``` +//! +//! The host link is **5× slower than inter-card** and **20× slower +//! than local DDR**. Collective ops that touch the host become +//! host-bound; without the constant the scheduler cannot model that. +//! +//! ## Usage +//! +//! ``` +//! use spanker_scheduler::{ +//! LOCAL_DDR_BW_BYTES_PER_SEC, +//! INTERCARD_BW_BYTES_PER_SEC, +//! HOST_LINK_BW_BYTES_PER_SEC, +//! }; +//! // Three-tier hierarchy: host link < inter-card < local DDR. +//! assert!(HOST_LINK_BW_BYTES_PER_SEC < INTERCARD_BW_BYTES_PER_SEC); +//! assert!(INTERCARD_BW_BYTES_PER_SEC < LOCAL_DDR_BW_BYTES_PER_SEC); +//! 
``` //! //! Cross-references: //! - MAST issue #32 — ADR-001 amendment to correct the 12.8 GB/s claim //! - Stays PR #26 (merged 2026-05-06) — LiteDRAM ECP5 recon +//! - Stays PR #34 (merged 2026-05-06) — LiteEth ECP5 SGMII recon //! - InnerJib7EA PR #11 — connector pinout §9 bandwidth math +//! - This crate's issue #21 — host link constant (cross-stream from +//! Stream 4 LiteEth Day-1 recon) /// Realistic local DDR3 bandwidth ceiling on rev-A /// (ECP5-85F + open toolchain + DDR3L SO-DIMM), in bytes per second. @@ -72,6 +119,34 @@ pub const LOCAL_DDR_BW_BYTES_PER_SEC: u64 = 2_000_000_000; /// cards is cheaper than recomputing it locally. pub const INTERCARD_BW_BYTES_PER_SEC: u64 = 500_000_000; +/// Realistic GbE host-link bandwidth ceiling on rev-A +/// (LiteEth + LiteICLink + 88E1512 SGMII), in bytes per second. +/// +/// **Value:** `100_000_000` (100 MB/s). +/// +/// 1 Gbps line rate on the wire — community measurements on +/// Versa-ECP5 and ECPIX-5 land at 800–940 Mbps with iperf3 UDP +/// (80–94 % of line rate). Pinning the model to 100 MB/s after +/// IP/UDP/Ethernet header overhead gives the scheduler an honest +/// cost estimate for collective ops that round-trip through the +/// host. Source-of-truth: Stays +/// `docs/upstream-contributions/2026-05-06-liteeth-ecp5-sgmii.md` +/// (Stays PR #34, merged 2026-05-06). +/// +/// **Not** consumed by [`crate::pick_strategy`] today: TP/MP is the +/// per-token decision and most decode tokens stay on-card, so the +/// host-link cost is small per-token but matters at session +/// boundaries (model load, gradient checkpoint to host RAM, +/// dataset streaming, prompt-embedding upload). Future runtime +/// cost-budget logic can compose this constant with the per-token +/// throughput estimate to reason about session-level wall-clock. +/// +/// On rev-A the host link is 5× slower than inter-card and 20× +/// slower than local DDR — it is the dominant cost when collective +/// ops must reach the host. 
Bump in lockstep with the platform ADR +/// when rev-B moves to 10 GbE or PCIe Gen2. +pub const HOST_LINK_BW_BYTES_PER_SEC: u64 = 100_000_000; + #[cfg(test)] mod tests { // These tests are deliberately constant-vs-constant comparisons: @@ -138,6 +213,23 @@ mod tests { assert_eq!(INTERCARD_BW_BYTES_PER_SEC, 500_000_000); } + /// Pin the host-link constant to the LiteEth ECP5 SGMII recon + /// number — 100 MB/s is the realistic post-IP/UDP throughput on + /// the open-toolchain GbE stack per Stays + /// `docs/upstream-contributions/2026-05-06-liteeth-ecp5-sgmii.md`. + /// Community measurements on Versa-ECP5 and ECPIX-5 land at + /// 800–940 Mbps UDP iperf3, so 100 MB/s (= 800 Mbps) is the + /// conservative low end of that range, not its peak. + /// + /// If anyone "rounds up" to 125 MB/s (the theoretical 1 Gbps + /// line-rate number) the scheduler will under-cost any collective + /// op that round-trips through the host. This test is the + /// load-bearing guard against that drift. + #[test] + fn host_link_bw_constant_matches_recon_doc() { + assert_eq!(HOST_LINK_BW_BYTES_PER_SEC, 100_000_000); + } + /// Pin the topology of the bandwidth model: local DDR is the /// higher-throughput resource per card by a comfortable margin. /// The `≥4×` factor is what the TP-vs-MP decision logic relies on @@ -159,12 +251,56 @@ mod tests { ); } - /// Sanity: both constants are non-zero. A zero would silently + /// Pin the rev-A three-tier bandwidth hierarchy: + /// host link < inter-card < local DDR. This is the load-bearing + /// invariant the LiteEth recon (Stays PR #34) flagged when + /// noting the GbE host link is the slowest hop in rev-A. + /// + /// If a future bump inverts any of these (rev-B 10 GbE host + /// link could plausibly approach inter-card, for example) the + /// scheduler's cost model assumptions need to be re-derived; + /// the test forces that conversation rather than letting it + /// slip silently. 
+ #[test] + fn host_link_bw_is_slowest_hop() { + assert!( + HOST_LINK_BW_BYTES_PER_SEC < INTERCARD_BW_BYTES_PER_SEC, + "HOST_LINK_BW ({HOST_LINK_BW_BYTES_PER_SEC}) must be < INTERCARD_BW \ + ({INTERCARD_BW_BYTES_PER_SEC}); rev-A GbE is the slowest hop" + ); + assert!( + INTERCARD_BW_BYTES_PER_SEC < LOCAL_DDR_BW_BYTES_PER_SEC, + "INTERCARD_BW ({INTERCARD_BW_BYTES_PER_SEC}) must be < LOCAL_DDR_BW \ + ({LOCAL_DDR_BW_BYTES_PER_SEC}); inter-card is slower than local DDR" + ); + } + + /// Pin the host-link constant inside the realistic-throughput + /// envelope reported in the LiteEth ECP5 SGMII recon. The + /// community-measured range is 800–940 Mbps UDP iperf3, so + /// the modelled value must be inside [80 MB/s, 125 MB/s] in + /// bytes per second: 80 MB/s lower bound (≈ 640 Mbps after + /// deeper application overhead — leaves room for honest + /// retunes); 125 MB/s upper bound (theoretical 1 Gbps line + /// rate — anything above is unphysical on rev-A GbE). + #[test] + fn host_link_bw_is_inside_observed_range() { + // 80 MB/s lower bound: well below the 800 Mbps UDP iperf + // floor reported in the recon, leaves room for honest + // headers-eating-overhead retunes. + assert!(HOST_LINK_BW_BYTES_PER_SEC >= 80_000_000); + // 125 MB/s upper bound: theoretical 1 Gbps line rate — + // anything above this is unphysical on rev-A GbE. + assert!(HOST_LINK_BW_BYTES_PER_SEC <= 125_000_000); + } + + /// Sanity: all three constants are non-zero. A zero would silently /// turn any throughput-divided cost estimate into a divide-by-zero /// or an infinity, producing nonsense scheduling decisions. #[test] fn constants_are_positive() { assert!(LOCAL_DDR_BW_BYTES_PER_SEC > 0); assert!(INTERCARD_BW_BYTES_PER_SEC > 0); + assert!(HOST_LINK_BW_BYTES_PER_SEC > 0); } } diff --git a/src/scheduler/src/lib.rs b/src/scheduler/src/lib.rs index 9bcb871..f68df07 100644 --- a/src/scheduler/src/lib.rs +++ b/src/scheduler/src/lib.rs @@ -30,15 +30,23 @@ //! [`bandwidth`] module. 
These supersede the stale ADR-001 //! "DDR3-1600 = 12.8 GB/s" number with realistic //! ECP5+open-toolchain ceilings (cross-stream MAST #32, this -//! crate's issue #14). +//! crate's issues #14 and #21). Three tiers are modelled: +//! local DDR > inter-card > host-link. //! //! ## Bandwidth-model usage //! //! ``` -//! use spanker_scheduler::{LOCAL_DDR_BW_BYTES_PER_SEC, INTERCARD_BW_BYTES_PER_SEC}; -//! // Local DDR is the higher-throughput resource per card; the -//! // TP-vs-MP decision logic relies on this comparison. +//! use spanker_scheduler::{ +//! LOCAL_DDR_BW_BYTES_PER_SEC, +//! INTERCARD_BW_BYTES_PER_SEC, +//! HOST_LINK_BW_BYTES_PER_SEC, +//! }; +//! // Three-tier hierarchy: local DDR > inter-card > host link. +//! // The TP-vs-MP per-token decision logic relies on the first +//! // inequality; session-level cost-budget logic relies on the +//! // second to recognise the host link as the slowest hop. //! assert!(LOCAL_DDR_BW_BYTES_PER_SEC > INTERCARD_BW_BYTES_PER_SEC); +//! assert!(INTERCARD_BW_BYTES_PER_SEC > HOST_LINK_BW_BYTES_PER_SEC); //! ``` #![warn(missing_docs)] @@ -50,7 +58,9 @@ pub mod decision; pub mod intercard; pub mod topology; -pub use bandwidth::{INTERCARD_BW_BYTES_PER_SEC, LOCAL_DDR_BW_BYTES_PER_SEC}; +pub use bandwidth::{ + HOST_LINK_BW_BYTES_PER_SEC, INTERCARD_BW_BYTES_PER_SEC, LOCAL_DDR_BW_BYTES_PER_SEC, +}; pub use collective::{AllGather, AllReduce, ModelParallel, ReduceOp, TensorParallel}; pub use decision::{pick_strategy, Strategy, TileShape}; pub use intercard::{Link, LinkState, INTERCARD_BUS_WIDTH, INTERCARD_LANES, INTERCARD_LANE_WIDTH};