From 1db7b25f0c0910d0031132c18f11e7120b988a25 Mon Sep 17 00:00:00 2001
From: Marcos <m@pop.coop>
Date: Wed, 6 May 2026 13:14:48 -0300
Subject: [PATCH] feat(scheduler): add HOST_LINK_BW constant + 3-way bandwidth
 model (closes #21)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `HOST_LINK_BW_BYTES_PER_SEC = 100_000_000` (100 MB/s) to the
bandwidth model, capturing the rev-A GbE host link as the third
tier of the bandwidth hierarchy:

  Local DDR  (per card)      ~2.0 GB/s   LOCAL_DDR_BW
  Inter-card (per direction) ~500 MB/s   INTERCARD_BW
  Host link  (GbE)           ~100 MB/s   HOST_LINK_BW (NEW)

Source-of-truth: Stays
`docs/upstream-contributions/2026-05-06-liteeth-ecp5-sgmii.md`
(Stays PR #34, merged 2026-05-06). Community measurements on
Versa-ECP5 and ECPIX-5 land at 800-940 Mbps UDP iperf3, i.e.
80-94 % of GbE line rate. The 100 MB/s number is the realistic
post-IP/UDP/Ethernet-header steady-state ceiling.

The host link is 5x slower than inter-card and 20x slower than
local DDR — it is the dominant cost when collective ops must
reach the host (model load, gradient checkpoint to host RAM,
dataset streaming, prompt-embedding upload).

## Scope

Minimal — per the issue spec's "if pick_strategy already handles
this" branch:

- `pick_strategy` is the per-token TP/MP decision and most decode
  tokens stay on-card; host-link cost is small per-token and only
  matters at session boundaries.
- No callers exist today for a session-level cost-budget API, so
  introducing `bytes_per_second_per_token_estimate` would be
  speculative generality (YAGNI). Defer until the runtime needs
  it.
- This PR keeps the public surface to a constant + module-level
  doctest update + tests.

## Tests

3 new unit tests in `bandwidth.rs`:

- `host_link_bw_constant_matches_recon_doc` — pins value to
  100_000_000 (guards against silent "round up to 125 MB/s line
  rate" drift).
- `host_link_bw_is_slowest_hop` — pins the three-tier ordering
  HOST_LINK < INTERCARD < LOCAL_DDR.
- `host_link_bw_is_inside_observed_range` — pins 80-125 MB/s
  envelope (observed 100-117.5 MB/s plus retune margin below,
  capped at the 125 MB/s line-rate ceiling).

Plus the existing `constants_are_positive` test extended to cover
the new constant.

Module-level doctest in `bandwidth.rs` updated to demonstrate all
three constants. Crate-root doctest in `lib.rs` updated to assert
the three-tier ordering.

## Cargo gates

- `cargo build -p spanker-scheduler`: green
- `cargo test -p spanker-scheduler`: 27 unit + 9 integration + 6
  doctests, all green (delta: +3 unit tests vs PR #19 baseline)
- `cargo clippy -p spanker-scheduler --all-targets -- -D warnings`:
  green
- `cargo fmt -p spanker-scheduler -- --check`: clean

Refs:
- popsolutions/Spanker#21 (this issue)
- popsolutions/Stays#34 (LiteEth ECP5 SGMII recon, source-of-truth)
- popsolutions/Spanker#17 (PR that landed initial 2-tier model)
- popsolutions/Spanker#19 (PR that landed pick_strategy)

Authored by Agent 3 (Software Stack — Spanker).

Signed-off-by: Marcos <m@pop.coop>
---
 src/scheduler/src/bandwidth.rs | 138 ++++++++++++++++++++++++++++++++-
 src/scheduler/src/lib.rs       |  20 +++--
 2 files changed, 152 insertions(+), 6 deletions(-)

diff --git a/src/scheduler/src/bandwidth.rs b/src/scheduler/src/bandwidth.rs
index 4fc233a..7e52098 100644
--- a/src/scheduler/src/bandwidth.rs
+++ b/src/scheduler/src/bandwidth.rs
@@ -15,6 +15,13 @@
 //! - [`INTERCARD_BW_BYTES_PER_SEC`]: InnerJib7EA pinout §9 — 4 lanes
 //!   × ~1.25 Gbps × 8b/10b = ~4 Gbps effective ≈ 500 MB/s per
 //!   direction.
+//! - [`HOST_LINK_BW_BYTES_PER_SEC`]: Stays
+//!   `docs/upstream-contributions/2026-05-06-liteeth-ecp5-sgmii.md` —
+//!   community bring-up reports on Versa-ECP5 and ECPIX-5 land at
+//!   800–940 Mbps measured (UDP iperf), i.e. 80–94 % of GbE line
+//!   rate. Pinning the model to 100 MB/s after IP/UDP/Ethernet
+//!   header overhead gives the scheduler an honest cost estimate
+//!   for collective ops that round-trip through the host.
 //!
 //! ## Why this module exists
 //!
@@ -25,6 +32,14 @@
 //! "DDR3-1600 = 12.8 GB/s" claim) would over-shard tensor-parallel
 //! workloads that are actually inter-card-fetch bound on rev-A.
 //!
+//! The third constant ([`HOST_LINK_BW_BYTES_PER_SEC`]) captures the
+//! GbE host link — the slowest hop in rev-A's three-tier hierarchy.
+//! It is not consumed by [`crate::pick_strategy`] today (TP/MP is the
+//! per-token decision and most decode tokens stay on-card), but it
+//! is the load-bearing number for any future cost-budget logic that
+//! reasons about model-load, gradient-checkpoint-to-host, or
+//! activation streaming round-trips through the host.
+//!
 //! ## When to bump these constants
 //!
 //! - **`LOCAL_DDR_BW_BYTES_PER_SEC`:** when rev-B targets a faster
@@ -34,11 +49,43 @@
 //! - **`INTERCARD_BW_BYTES_PER_SEC`:** when ADR-014 (line coding)
 //!   chooses 64b/66b instead of 8b/10b, this becomes ~600 MB/s. Bump
 //!   in the same PR that lands ADR-014.
+//! - **`HOST_LINK_BW_BYTES_PER_SEC`:** when rev-B moves from GbE to
+//!   10 GbE or PCIe Gen2 host link, bump in lockstep with the
+//!   platform ADR (factor-of-10 step from 100 MB/s → 1 GB/s for
+//!   10 GbE; ~500 MB/s for PCIe Gen2 x1).
+//!
+//! ## Bandwidth hierarchy this captures (rev-A)
+//!
+//! ```text
+//! Local DDR  (per card)      ~16 Gbps  ≈ 2.0 GB/s   LOCAL_DDR_BW
+//! Inter-card (per direction)  ~4 Gbps  ≈ 500 MB/s   INTERCARD_BW
+//! Host link  (GbE)          ~0.8 Gbps  ≈ 100 MB/s   HOST_LINK_BW
+//! ```
+//!
+//! The host link is **5× slower than inter-card** and **20× slower
+//! than local DDR**. Collective ops that touch the host become
+//! host-bound; without the constant the scheduler cannot model that.
+//!
+//! ## Usage
+//!
+//! ```
+//! use spanker_scheduler::{
+//!     LOCAL_DDR_BW_BYTES_PER_SEC,
+//!     INTERCARD_BW_BYTES_PER_SEC,
+//!     HOST_LINK_BW_BYTES_PER_SEC,
+//! };
+//! // Three-tier hierarchy: host link < inter-card < local DDR.
+//! assert!(HOST_LINK_BW_BYTES_PER_SEC < INTERCARD_BW_BYTES_PER_SEC);
+//! assert!(INTERCARD_BW_BYTES_PER_SEC < LOCAL_DDR_BW_BYTES_PER_SEC);
+//! ```
 //!
 //! Cross-references:
 //! - MAST issue #32 — ADR-001 amendment to correct the 12.8 GB/s claim
 //! - Stays PR #26 (merged 2026-05-06) — LiteDRAM ECP5 recon
+//! - Stays PR #34 (merged 2026-05-06) — LiteEth ECP5 SGMII recon
 //! - InnerJib7EA PR #11 — connector pinout §9 bandwidth math
+//! - This crate's issue #21 — host link constant (cross-stream from
+//!   Stream 4 LiteEth Day-1 recon)
 
 /// Realistic local DDR3 bandwidth ceiling on rev-A
 /// (ECP5-85F + open toolchain + DDR3L SO-DIMM), in bytes per second.
@@ -72,6 +119,34 @@ pub const LOCAL_DDR_BW_BYTES_PER_SEC: u64 = 2_000_000_000;
 /// cards is cheaper than recomputing it locally.
 pub const INTERCARD_BW_BYTES_PER_SEC: u64 = 500_000_000;
 
+/// Realistic GbE host-link bandwidth ceiling on rev-A
+/// (LiteEth + LiteICLink + 88E1512 SGMII), in bytes per second.
+///
+/// **Value:** `100_000_000` (100 MB/s).
+///
+/// 1 Gbps line rate on the wire — community measurements on
+/// Versa-ECP5 and ECPIX-5 land at 800–940 Mbps with iperf3 UDP
+/// (80–94 % of line rate). Pinning the model to 100 MB/s after
+/// IP/UDP/Ethernet header overhead gives the scheduler an honest
+/// cost estimate for collective ops that round-trip through the
+/// host. Source-of-truth: Stays
+/// `docs/upstream-contributions/2026-05-06-liteeth-ecp5-sgmii.md`
+/// (Stays PR #34, merged 2026-05-06).
+///
+/// **Not** consumed by [`crate::pick_strategy`] today: TP/MP is the
+/// per-token decision and most decode tokens stay on-card, so the
+/// host-link cost is small per-token but matters at session
+/// boundaries (model load, gradient checkpoint to host RAM,
+/// dataset streaming, prompt-embedding upload). Future runtime
+/// cost-budget logic can compose this constant with the per-token
+/// throughput estimate to reason about session-level wall-clock.
+///
+/// On rev-A the host link is 5× slower than inter-card and 20×
+/// slower than local DDR — it is the dominant cost when collective
+/// ops must reach the host. Bump in lockstep with the platform ADR
+/// when rev-B moves to 10 GbE or PCIe Gen2.
+pub const HOST_LINK_BW_BYTES_PER_SEC: u64 = 100_000_000;
+
 #[cfg(test)]
 mod tests {
     // These tests are deliberately constant-vs-constant comparisons:
@@ -138,6 +213,23 @@ mod tests {
         assert_eq!(INTERCARD_BW_BYTES_PER_SEC, 500_000_000);
     }
 
+    /// Pin the host-link constant to the LiteEth ECP5 SGMII recon
+    /// number — 100 MB/s is the realistic post-IP/UDP throughput on
+    /// the open-toolchain GbE stack per Stays
+    /// `docs/upstream-contributions/2026-05-06-liteeth-ecp5-sgmii.md`.
+    /// Community measurements on Versa-ECP5 and ECPIX-5 land at
+    /// 800–940 Mbps UDP iperf3, so 100 MB/s (= 800 Mbps) is the
+    /// modelled steady-state ceiling.
+    ///
+    /// If anyone "rounds up" to 125 MB/s (the theoretical 1 Gbps
+    /// line-rate number) the scheduler will under-cost any collective
+    /// op that round-trips through the host. This test is the
+    /// load-bearing guard against that drift.
+    #[test]
+    fn host_link_bw_constant_matches_recon_doc() {
+        assert_eq!(HOST_LINK_BW_BYTES_PER_SEC, 100_000_000);
+    }
+
     /// Pin the topology of the bandwidth model: local DDR is the
     /// higher-throughput resource per card by a comfortable margin.
     /// The `≥4×` factor is what the TP-vs-MP decision logic relies on
@@ -159,12 +251,56 @@ mod tests {
         );
     }
 
-    /// Sanity: both constants are non-zero. A zero would silently
+    /// Pin the rev-A three-tier bandwidth hierarchy:
+    /// host link < inter-card < local DDR. This is the load-bearing
+    /// invariant the LiteEth recon (Stays PR #34) flagged when
+    /// noting the GbE host link is the slowest hop in rev-A.
+    ///
+    /// If a future bump inverts any of these (rev-B 10 GbE host
+    /// link could plausibly approach inter-card, for example) the
+    /// scheduler's cost model assumptions need to be re-derived;
+    /// the test forces that conversation rather than letting it
+    /// slip silently.
+    #[test]
+    fn host_link_bw_is_slowest_hop() {
+        assert!(
+            HOST_LINK_BW_BYTES_PER_SEC < INTERCARD_BW_BYTES_PER_SEC,
+            "HOST_LINK_BW ({HOST_LINK_BW_BYTES_PER_SEC}) must be < INTERCARD_BW \
+             ({INTERCARD_BW_BYTES_PER_SEC}); rev-A GbE is the slowest hop"
+        );
+        assert!(
+            INTERCARD_BW_BYTES_PER_SEC < LOCAL_DDR_BW_BYTES_PER_SEC,
+            "INTERCARD_BW ({INTERCARD_BW_BYTES_PER_SEC}) must be < LOCAL_DDR_BW \
+             ({LOCAL_DDR_BW_BYTES_PER_SEC}); inter-card is slower than local DDR"
+        );
+    }
+
+    /// Pin the host-link constant inside a plausibility envelope
+    /// around the LiteEth ECP5 SGMII recon numbers. The
+    /// community-measured range is 800–940 Mbps UDP iperf3
+    /// (100–117.5 MB/s); the guard band is deliberately wider,
+    /// [80 MB/s, 125 MB/s]: 80 MB/s lower bound (= 640 Mbps,
+    /// margin for deeper application overhead and honest
+    /// retunes); 125 MB/s upper bound (theoretical 1 Gbps line
+    /// rate — anything above is unphysical on rev-A GbE).
+    #[test]
+    fn host_link_bw_is_inside_observed_range() {
+        // 80 MB/s lower bound (640 Mbps): below the 800 Mbps UDP
+        // iperf3 floor reported in the recon, leaving room for honest
+        // downward retunes if protocol overhead proves higher.
+        assert!(HOST_LINK_BW_BYTES_PER_SEC >= 80_000_000);
+        // 125 MB/s upper bound: theoretical 1 Gbps line rate —
+        // anything above this is unphysical on rev-A GbE.
+        assert!(HOST_LINK_BW_BYTES_PER_SEC <= 125_000_000);
+    }
+
+    /// Sanity: all three constants are non-zero. A zero would silently
     /// turn any throughput-divided cost estimate into a divide-by-zero
     /// or an infinity, producing nonsense scheduling decisions.
     #[test]
     fn constants_are_positive() {
         assert!(LOCAL_DDR_BW_BYTES_PER_SEC > 0);
         assert!(INTERCARD_BW_BYTES_PER_SEC > 0);
+        assert!(HOST_LINK_BW_BYTES_PER_SEC > 0);
     }
 }
diff --git a/src/scheduler/src/lib.rs b/src/scheduler/src/lib.rs
index 9bcb871..f68df07 100644
--- a/src/scheduler/src/lib.rs
+++ b/src/scheduler/src/lib.rs
@@ -30,15 +30,23 @@
 //!   [`bandwidth`] module. These supersede the stale ADR-001
 //!   "DDR3-1600 = 12.8 GB/s" number with realistic
 //!   ECP5+open-toolchain ceilings (cross-stream MAST #32, this
-//!   crate's issue #14).
+//!   crate's issues #14 and #21). Three tiers are modelled:
+//!   local DDR > inter-card > host-link.
 //!
 //! ## Bandwidth-model usage
 //!
 //! ```
-//! use spanker_scheduler::{LOCAL_DDR_BW_BYTES_PER_SEC, INTERCARD_BW_BYTES_PER_SEC};
-//! // Local DDR is the higher-throughput resource per card; the
-//! // TP-vs-MP decision logic relies on this comparison.
+//! use spanker_scheduler::{
+//!     LOCAL_DDR_BW_BYTES_PER_SEC,
+//!     INTERCARD_BW_BYTES_PER_SEC,
+//!     HOST_LINK_BW_BYTES_PER_SEC,
+//! };
+//! // Three-tier hierarchy: local DDR > inter-card > host link.
+//! // The TP-vs-MP per-token decision logic relies on the first
+//! // inequality; future session-level cost-budget logic will rely
+//! // on the second to recognise the host link as the slowest hop.
 //! assert!(LOCAL_DDR_BW_BYTES_PER_SEC > INTERCARD_BW_BYTES_PER_SEC);
+//! assert!(INTERCARD_BW_BYTES_PER_SEC > HOST_LINK_BW_BYTES_PER_SEC);
 //! ```
 
 #![warn(missing_docs)]
@@ -50,7 +58,9 @@ pub mod decision;
 pub mod intercard;
 pub mod topology;
 
-pub use bandwidth::{INTERCARD_BW_BYTES_PER_SEC, LOCAL_DDR_BW_BYTES_PER_SEC};
+pub use bandwidth::{
+    HOST_LINK_BW_BYTES_PER_SEC, INTERCARD_BW_BYTES_PER_SEC, LOCAL_DDR_BW_BYTES_PER_SEC,
+};
 pub use collective::{AllGather, AllReduce, ModelParallel, ReduceOp, TensorParallel};
 pub use decision::{pick_strategy, Strategy, TileShape};
 pub use intercard::{Link, LinkState, INTERCARD_BUS_WIDTH, INTERCARD_LANES, INTERCARD_LANE_WIDTH};