From 3c742b67e73730089a44a2c3bc7e43d76c6e8d9d Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 04:47:31 +0800 Subject: [PATCH 1/5] feat(cluster): implement SWIM failure detector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the core SWIM/Lifeguard failure detection runtime to nodedb-cluster. The detector drives a tokio::select! probe loop that separates I/O from logic via a Transport trait, allowing deterministic in-process testing with InMemoryTransport and edge-drop/partition injection. - FailureDetector: main runtime task owning the probe loop, suspicion timer, and inflight-probe registry - ProbeRound: Lifeguard ping → ping-req indirect probe sequence with per-probe-id oneshot correlation - ProbeScheduler: random-permutation epoch scheduler (every peer probed once per epoch, per Lifeguard §4.3) - SuspicionTimer: timeout math per Lifeguard §3.1 (max(min, mult * log2(n) * probe_interval)) with drain_expired polling - Transport / InMemoryTransport / TransportFabric: Send+Sync async trait with mpsc-backed in-memory impl for unit tests - SwimError variants TransportClosed and ProbeInflightOverflow async-trait added to nodedb-cluster dependencies to support the Transport trait definition. 
--- nodedb-cluster/Cargo.toml | 1 + nodedb-cluster/src/swim/detector/mod.rs | 20 + .../src/swim/detector/probe_round.rs | 472 ++++++++++++++++++ nodedb-cluster/src/swim/detector/runner.rs | 434 ++++++++++++++++ nodedb-cluster/src/swim/detector/scheduler.rs | 202 ++++++++ nodedb-cluster/src/swim/detector/suspicion.rs | 157 ++++++ nodedb-cluster/src/swim/detector/transport.rs | 203 ++++++++ nodedb-cluster/src/swim/error.rs | 11 + nodedb-cluster/src/swim/mod.rs | 4 + 9 files changed, 1504 insertions(+) create mode 100644 nodedb-cluster/src/swim/detector/mod.rs create mode 100644 nodedb-cluster/src/swim/detector/probe_round.rs create mode 100644 nodedb-cluster/src/swim/detector/runner.rs create mode 100644 nodedb-cluster/src/swim/detector/scheduler.rs create mode 100644 nodedb-cluster/src/swim/detector/suspicion.rs create mode 100644 nodedb-cluster/src/swim/detector/transport.rs diff --git a/nodedb-cluster/Cargo.toml b/nodedb-cluster/Cargo.toml index 99298910..c9b792e3 100644 --- a/nodedb-cluster/Cargo.toml +++ b/nodedb-cluster/Cargo.toml @@ -22,6 +22,7 @@ thiserror = { workspace = true } tracing = { workspace = true } tokio = { workspace = true } rand = { workspace = true } +async-trait = { workspace = true } # Serialization (compact binary for RPCs — matches nexar wire format) rkyv = { workspace = true } diff --git a/nodedb-cluster/src/swim/detector/mod.rs b/nodedb-cluster/src/swim/detector/mod.rs new file mode 100644 index 00000000..08750ec9 --- /dev/null +++ b/nodedb-cluster/src/swim/detector/mod.rs @@ -0,0 +1,20 @@ +//! SWIM failure detector — the runtime that drives the probe loop. +//! +//! This module is the `!I/O` portion of the SWIM subsystem: it owns the +//! probe scheduler, the suspicion timer, and the main `tokio::select!` +//! loop. All actual networking is pushed behind the [`Transport`] trait +//! so unit tests can run fully in-process against [`InMemoryTransport`] +//! and the real UDP transport in E-ε can slot in without touching the +//! 
detector logic. + +pub mod probe_round; +pub mod runner; +pub mod scheduler; +pub mod suspicion; +pub mod transport; + +pub use probe_round::{ProbeOutcome, ProbeRound}; +pub use runner::FailureDetector; +pub use scheduler::ProbeScheduler; +pub use suspicion::SuspicionTimer; +pub use transport::{InMemoryTransport, Transport, TransportFabric}; diff --git a/nodedb-cluster/src/swim/detector/probe_round.rs b/nodedb-cluster/src/swim/detector/probe_round.rs new file mode 100644 index 00000000..8d444c98 --- /dev/null +++ b/nodedb-cluster/src/swim/detector/probe_round.rs @@ -0,0 +1,472 @@ +//! Single SWIM probe round. +//! +//! One probe round follows the Lifeguard sequence: +//! +//! 1. Pick a target via [`ProbeScheduler::next_target`]. +//! 2. Send a `Ping` and wait `probe_timeout` for the matching `Ack`. +//! 3. If no `Ack` arrives, pick `k` helpers via +//! [`ProbeScheduler::pick_helpers`] and send each a `PingReq`. Wait +//! `probe_timeout` more for any forwarded `Ack`. +//! 4. On total failure, the target is reported back as [`ProbeOutcome::Suspect`] +//! — the runner translates that into a suspicion-timer entry and a +//! `Suspect` rumour applied to [`MembershipList`]. +//! +//! The [`InflightProbes`] registry correlates outbound probe ids with +//! incoming `Ack`/`Nack` datagrams from the runner's recv loop. + +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; + +use nodedb_types::NodeId; +use tokio::sync::{Mutex, oneshot}; +use tokio::time::timeout; + +use crate::swim::error::SwimError; +use crate::swim::incarnation::Incarnation; +use crate::swim::wire::{Ping, PingReq, ProbeId, SwimMessage}; + +use super::scheduler::ProbeScheduler; +use super::transport::Transport; +use crate::swim::membership::MembershipList; + +/// Upper bound on concurrent inflight probes. The detector only issues a +/// handful per round so 4 096 is a safety ceiling, not a tuning knob. 
+const MAX_INFLIGHT: usize = 4096; + +/// Registry of outstanding probe ids awaiting a response. +#[derive(Debug, Default)] +pub struct InflightProbes { + map: Mutex>>, +} + +impl InflightProbes { + pub fn new() -> Self { + Self::default() + } + + /// Register a new probe id and return the receiver that fires when + /// the matching `Ack`/`Nack` is routed via [`Self::resolve`]. + pub async fn register( + &self, + probe_id: ProbeId, + ) -> Result, SwimError> { + let (tx, rx) = oneshot::channel(); + let mut guard = self.map.lock().await; + if guard.len() >= MAX_INFLIGHT { + return Err(SwimError::ProbeInflightOverflow); + } + guard.insert(probe_id, tx); + Ok(rx) + } + + /// Drop a probe id without firing its receiver (timeouts). + pub async fn forget(&self, probe_id: ProbeId) { + self.map.lock().await.remove(&probe_id); + } + + /// Route an incoming `Ack`/`Nack` to the probe that registered + /// `probe_id`. Does nothing if no match is found — late responses + /// are discarded silently. + pub async fn resolve(&self, probe_id: ProbeId, msg: SwimMessage) { + if let Some(tx) = self.map.lock().await.remove(&probe_id) { + let _ = tx.send(msg); + } + } +} + +/// Outcome of [`execute_round`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ProbeOutcome { + /// Target acked either directly or via a helper; no action required. + Acked { + target: NodeId, + incarnation: Incarnation, + }, + /// No peers to probe — cluster is solo. + Idle, + /// Target failed to respond; runner should mark it `Suspect` and + /// arm a suspicion timer. + Suspect { target: NodeId }, +} + +/// All inputs to a single probe round. Passed to [`ProbeRound::execute`] +/// instead of a long argument list so the call sites stay readable and +/// the compiler can tell us at a glance what each field means. 
+pub struct ProbeRound<'a, F: Fn() -> ProbeId> { + pub scheduler: &'a mut ProbeScheduler, + pub membership: &'a MembershipList, + pub transport: &'a Arc, + pub inflight: &'a Arc, + pub probe_timeout: Duration, + pub k_indirect: usize, + pub next_probe_id: F, + pub local_incarnation: Incarnation, +} + +impl<'a, F: Fn() -> ProbeId> ProbeRound<'a, F> { + /// Execute the round. See the module-level docs for the Lifeguard + /// sequence this implements. + pub async fn execute(self) -> Result { + let Self { + scheduler, + membership, + transport, + inflight, + probe_timeout, + k_indirect, + next_probe_id, + local_incarnation, + } = self; + + let Some((target_id, target_addr)) = scheduler.next_target(membership) else { + return Ok(ProbeOutcome::Idle); + }; + + // ── Direct probe ──────────────────────────────────────────────── + let direct_id = next_probe_id(); + let direct_rx = inflight.register(direct_id).await?; + let local = membership.local_node_id().clone(); + transport + .send( + target_addr, + SwimMessage::Ping(Ping { + probe_id: direct_id, + from: local.clone(), + incarnation: local_incarnation, + piggyback: vec![], + }), + ) + .await?; + + match timeout(probe_timeout, direct_rx).await { + Ok(Ok(SwimMessage::Ack(ack))) => { + return Ok(ProbeOutcome::Acked { + target: target_id, + incarnation: ack.incarnation, + }); + } + Ok(Ok(_)) | Ok(Err(_)) | Err(_) => { + inflight.forget(direct_id).await; + } + } + + // ── Indirect probes ───────────────────────────────────────────── + let helpers = scheduler.pick_helpers(membership, &target_id, k_indirect); + if helpers.is_empty() { + return Ok(ProbeOutcome::Suspect { target: target_id }); + } + + let mut indirect_rxs: Vec<(ProbeId, oneshot::Receiver)> = Vec::new(); + for (_, helper_addr) in &helpers { + let pid = next_probe_id(); + let rx = inflight.register(pid).await?; + indirect_rxs.push((pid, rx)); + transport + .send( + *helper_addr, + SwimMessage::PingReq(PingReq { + probe_id: pid, + from: local.clone(), + target: 
target_id.clone(),
                        target_addr: target_addr.to_string(),
                        piggyback: vec![],
                    }),
                )
                .await?;
        }

        let indirect_ids: Vec<ProbeId> = indirect_rxs.iter().map(|(id, _)| *id).collect();
        let any_ack = wait_for_any_ack(indirect_rxs, probe_timeout).await;
        // Whether or not an ack arrived, clear every indirect probe id so
        // late responses cannot pile up in the registry.
        for id in indirect_ids {
            inflight.forget(id).await;
        }

        if let Some(ack_inc) = any_ack {
            Ok(ProbeOutcome::Acked {
                target: target_id,
                incarnation: ack_inc,
            })
        } else {
            Ok(ProbeOutcome::Suspect { target: target_id })
        }
    }
}

/// Wait until any of the indirect probe receivers yields an `Ack`, or
/// until the timeout elapses. Returns the responder's incarnation on
/// success.
async fn wait_for_any_ack(
    mut rxs: Vec<(ProbeId, oneshot::Receiver<SwimMessage>)>,
    deadline: Duration,
) -> Option<Incarnation> {
    if rxs.is_empty() {
        return None;
    }
    let futs = rxs
        .drain(..)
        .map(|(_, rx)| async move {
            match rx.await {
                Ok(SwimMessage::Ack(ack)) => Some(ack.incarnation),
                // Nack, or sender dropped — not a success for this helper.
                _ => None,
            }
        })
        .collect::<Vec<_>>();
    let any = async move {
        // JoinSet aborts all remaining receivers when dropped, so a
        // timeout below also cancels the losers.
        let mut set = tokio::task::JoinSet::new();
        for fut in futs {
            set.spawn(fut);
        }
        while let Some(res) = set.join_next().await {
            if let Ok(Some(inc)) = res {
                return Some(inc);
            }
        }
        None
    };
    // unwrap_or_default(): a timeout yields None (no ack in time).
    timeout(deadline, any).await.unwrap_or_default()
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::swim::config::SwimConfig;
    use crate::swim::detector::transport::TransportFabric;
    use crate::swim::incarnation::Incarnation;
    use crate::swim::member::MemberState;
    use crate::swim::member::record::MemberUpdate;
    use crate::swim::wire::Ack;
    use std::net::{IpAddr, Ipv4Addr, SocketAddr};
    use std::sync::atomic::{AtomicU64, Ordering};

    fn addr(p: u16) -> SocketAddr {
        SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), p)
    }

    fn cfg() -> SwimConfig {
        SwimConfig {
            probe_interval: Duration::from_millis(100),
            probe_timeout: Duration::from_millis(40),
            indirect_probes: 2,
            suspicion_mult: 4,
            min_suspicion: Duration::from_millis(500),
initial_incarnation: Incarnation::ZERO, + } + } + + async fn membership_with_peers( + local: &str, + local_port: u16, + peers: &[(&str, u16, MemberState)], + ) -> Arc { + let list = Arc::new(MembershipList::new_local( + NodeId::new(local), + addr(local_port), + Incarnation::ZERO, + )); + for (id, port, state) in peers { + list.apply(&MemberUpdate { + node_id: NodeId::new(*id), + addr: addr(*port).to_string(), + state: *state, + incarnation: Incarnation::new(1), + }); + } + list + } + + fn pid_gen(start: u64) -> impl Fn() -> ProbeId { + let counter = AtomicU64::new(start); + move || ProbeId::new(counter.fetch_add(1, Ordering::Relaxed)) + } + + #[tokio::test] + async fn idle_when_no_peers() { + let fab = TransportFabric::new(); + let local = Arc::new(fab.bind(addr(7000)).await) as Arc; + let list = membership_with_peers("local", 7000, &[]).await; + let mut sched = ProbeScheduler::with_seed(1); + let inflight = Arc::new(InflightProbes::new()); + let outcome = ProbeRound { + scheduler: &mut sched, + membership: &list, + transport: &local, + inflight: &inflight, + probe_timeout: cfg().probe_timeout, + k_indirect: 2, + next_probe_id: pid_gen(1), + local_incarnation: Incarnation::ZERO, + } + .execute() + .await + .expect("run"); + assert_eq!(outcome, ProbeOutcome::Idle); + } + + #[tokio::test(start_paused = true)] + async fn suspect_when_target_silent_and_no_helpers() { + let fab = TransportFabric::new(); + let local = Arc::new(fab.bind(addr(7000)).await) as Arc; + // Bind target so the send() does not drop unbinded, but never + // reply — the probe round will time out naturally. Paused + // runtime auto-advances the timeout once no task is runnable. 
+ let _silent_target = fab.bind(addr(7001)).await; + let list = membership_with_peers("local", 7000, &[("n1", 7001, MemberState::Alive)]).await; + let mut sched = ProbeScheduler::with_seed(1); + let inflight = Arc::new(InflightProbes::new()); + let outcome = ProbeRound { + scheduler: &mut sched, + membership: &list, + transport: &local, + inflight: &inflight, + probe_timeout: cfg().probe_timeout, + k_indirect: 2, + next_probe_id: pid_gen(1), + local_incarnation: Incarnation::ZERO, + } + .execute() + .await + .expect("run"); + assert_eq!( + outcome, + ProbeOutcome::Suspect { + target: NodeId::new("n1") + } + ); + } + + #[tokio::test(start_paused = true)] + async fn direct_ack_succeeds() { + let fab = TransportFabric::new(); + let local = Arc::new(fab.bind(addr(7000)).await) as Arc; + let target = fab.bind(addr(7001)).await; + let list = membership_with_peers("local", 7000, &[("n1", 7001, MemberState::Alive)]).await; + let mut sched = ProbeScheduler::with_seed(1); + let inflight = Arc::new(InflightProbes::new()); + + // Responder task: read the Ping and route an Ack back via the + // inflight registry (simulating the runner). 
+ let inflight_responder = Arc::clone(&inflight); + let responder = tokio::spawn(async move { + let (_from, msg) = target.recv().await.expect("recv"); + match msg { + SwimMessage::Ping(p) => { + inflight_responder + .resolve( + p.probe_id, + SwimMessage::Ack(Ack { + probe_id: p.probe_id, + from: NodeId::new("n1"), + incarnation: Incarnation::new(3), + piggyback: vec![], + }), + ) + .await; + } + _ => panic!("expected Ping"), + } + }); + + let outcome = ProbeRound { + scheduler: &mut sched, + membership: &list, + transport: &local, + inflight: &inflight, + probe_timeout: cfg().probe_timeout, + k_indirect: 2, + next_probe_id: pid_gen(1), + local_incarnation: Incarnation::ZERO, + } + .execute() + .await + .expect("run"); + responder.await.expect("responder"); + assert_eq!( + outcome, + ProbeOutcome::Acked { + target: NodeId::new("n1"), + incarnation: Incarnation::new(3), + } + ); + } + + #[tokio::test(start_paused = true)] + async fn indirect_ack_saves_target() { + let fab = TransportFabric::new(); + let local = Arc::new(fab.bind(addr(7000)).await) as Arc; + // Target bound but silent on the direct channel. + let _silent = fab.bind(addr(7001)).await; + let helper = fab.bind(addr(7002)).await; + let list = membership_with_peers( + "local", + 7000, + &[ + ("n1", 7001, MemberState::Alive), + ("n2", 7002, MemberState::Alive), + ], + ) + .await; + let mut sched = ProbeScheduler::with_seed(1); + let inflight = Arc::new(InflightProbes::new()); + + // Helper task: forwards any PingReq it sees into an Ack via the + // inflight registry. Paused-runtime auto-advance drives the + // direct-ping timeout on the main task. 
+ let inflight_helper = Arc::clone(&inflight); + let responder = tokio::spawn(async move { + loop { + let (_from, msg) = match helper.recv().await { + Ok(v) => v, + Err(_) => return, + }; + if let SwimMessage::PingReq(req) = msg { + inflight_helper + .resolve( + req.probe_id, + SwimMessage::Ack(Ack { + probe_id: req.probe_id, + from: req.target.clone(), + incarnation: Incarnation::new(9), + piggyback: vec![], + }), + ) + .await; + return; + } + } + }); + + let outcome = ProbeRound { + scheduler: &mut sched, + membership: &list, + transport: &local, + inflight: &inflight, + probe_timeout: cfg().probe_timeout, + k_indirect: 2, + next_probe_id: pid_gen(1), + local_incarnation: Incarnation::ZERO, + } + .execute() + .await + .expect("run"); + let _ = responder.await; + // Either direct (unlikely — n1 is silent) or indirect ack via n2. + // Whichever path fires, the outcome must be Acked. + assert!(matches!(outcome, ProbeOutcome::Acked { .. })); + } + + #[tokio::test] + async fn inflight_overflow_is_reported() { + let inflight = InflightProbes::new(); + // Force map to max by registering MAX_INFLIGHT ids. + for i in 0..MAX_INFLIGHT as u64 { + inflight.register(ProbeId::new(i)).await.expect("room"); + } + let err = inflight + .register(ProbeId::new(u64::MAX)) + .await + .expect_err("full"); + assert!(matches!(err, SwimError::ProbeInflightOverflow)); + } + +} diff --git a/nodedb-cluster/src/swim/detector/runner.rs b/nodedb-cluster/src/swim/detector/runner.rs new file mode 100644 index 00000000..b60475ce --- /dev/null +++ b/nodedb-cluster/src/swim/detector/runner.rs @@ -0,0 +1,434 @@ +//! `FailureDetector` — the SWIM runtime task. +//! +//! One instance per node. Owns the membership list (shared via `Arc`), +//! the probe scheduler, the suspicion timer, the inflight-probe registry, +//! and the async transport. Drives a `tokio::select!` loop over four +//! arms: probe tick, inbound datagram, suspicion expiry, shutdown. 
+ +use std::net::SocketAddr; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; + +use tokio::sync::{Mutex, watch}; +use tokio::time::{Instant, interval}; + +use crate::swim::config::SwimConfig; +use crate::swim::error::SwimError; +use crate::swim::incarnation::Incarnation; +use crate::swim::member::MemberState; +use crate::swim::member::record::MemberUpdate; +use crate::swim::membership::MembershipList; +use crate::swim::wire::{Ack, Ping, PingReq, ProbeId, SwimMessage}; + +use super::probe_round::{InflightProbes, ProbeOutcome, ProbeRound}; +use super::scheduler::ProbeScheduler; +use super::suspicion::SuspicionTimer; +use super::transport::Transport; + +/// Top-level failure detector handle. +/// +/// Construct with [`FailureDetector::new`], then call +/// [`FailureDetector::run`] on a dedicated tokio task. The run loop +/// returns when `shutdown` flips to `true`. +pub struct FailureDetector { + cfg: SwimConfig, + membership: Arc, + transport: Arc, + scheduler: Mutex, + suspicion: Mutex, + inflight: Arc, + probe_counter: AtomicU64, + local_incarnation: Mutex, +} + +impl FailureDetector { + /// Construct. Does not spawn anything — the caller is responsible + /// for driving [`Self::run`] on a tokio task. + pub fn new( + cfg: SwimConfig, + membership: Arc, + transport: Arc, + scheduler: ProbeScheduler, + ) -> Self { + let initial_inc = cfg.initial_incarnation; + Self { + cfg, + membership, + transport, + scheduler: Mutex::new(scheduler), + suspicion: Mutex::new(SuspicionTimer::new()), + inflight: Arc::new(InflightProbes::new()), + probe_counter: AtomicU64::new(0), + local_incarnation: Mutex::new(initial_inc), + } + } + + /// Exposed for tests that need to route a synthetic message into the + /// inflight table without going through the transport. + #[cfg(test)] + pub fn inflight(&self) -> &Arc { + &self.inflight + } + + fn next_probe_id(&self) -> ProbeId { + ProbeId::new(self.probe_counter.fetch_add(1, Ordering::Relaxed)) + } + + /// Main loop. 
Returns when `shutdown` receives `true`. + pub async fn run(self: Arc, mut shutdown: watch::Receiver) { + let mut tick = interval(self.cfg.probe_interval); + tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + // Consume the first immediate tick so the first probe aligns + // with a full interval from start. + tick.tick().await; + loop { + tokio::select! { + biased; + changed = shutdown.changed() => { + if changed.is_ok() && *shutdown.borrow() { + break; + } + } + _ = tick.tick() => { + self.on_tick().await; + } + recv = self.transport.recv() => { + match recv { + Ok((from_addr, msg)) => self.on_incoming(from_addr, msg).await, + Err(SwimError::TransportClosed) => break, + Err(_) => {} + } + } + } + } + } + + async fn on_tick(&self) { + // Expire suspect members that have waited out their timeout. + let now = Instant::now(); + let expired = self.suspicion.lock().await.drain_expired(now); + for node_id in expired { + if let Some(member) = self.membership.get(&node_id) { + self.membership.apply(&MemberUpdate { + node_id: node_id.clone(), + addr: member.addr.to_string(), + state: MemberState::Dead, + incarnation: member.incarnation, + }); + } + } + + // Execute one probe round against the next target. + let local_inc = *self.local_incarnation.lock().await; + let mut sched = self.scheduler.lock().await; + let outcome = ProbeRound { + scheduler: &mut sched, + membership: &self.membership, + transport: &self.transport, + inflight: &self.inflight, + probe_timeout: self.cfg.probe_timeout, + k_indirect: self.cfg.indirect_probes as usize, + next_probe_id: || self.next_probe_id(), + local_incarnation: local_inc, + } + .execute() + .await; + drop(sched); + + match outcome { + Ok(ProbeOutcome::Idle) | Ok(ProbeOutcome::Acked { .. 
}) => {} + Ok(ProbeOutcome::Suspect { target }) => { + if let Some(member) = self.membership.get(&target) { + self.membership.apply(&MemberUpdate { + node_id: target.clone(), + addr: member.addr.to_string(), + state: MemberState::Suspect, + incarnation: member.incarnation, + }); + let cluster_size = self.membership.len(); + self.suspicion.lock().await.arm( + target, + Instant::now(), + &self.cfg, + cluster_size, + ); + } + } + Err(_) => {} + } + } + + async fn on_incoming(&self, from_addr: SocketAddr, msg: SwimMessage) { + match msg { + SwimMessage::Ping(ping) => self.handle_ping(from_addr, ping).await, + SwimMessage::PingReq(req) => self.handle_ping_req(from_addr, req).await, + SwimMessage::Ack(ack) => { + self.inflight + .resolve(ack.probe_id, SwimMessage::Ack(ack)) + .await + } + SwimMessage::Nack(nack) => { + self.inflight + .resolve(nack.probe_id, SwimMessage::Nack(nack)) + .await + } + } + } + + async fn handle_ping(&self, from_addr: SocketAddr, ping: Ping) { + let local_inc = *self.local_incarnation.lock().await; + // Any self-refutation bump from piggyback is handled by + // `MembershipList::apply`; E-γ does not yet ingest piggyback + // (that's E-δ) but the reply incarnation still reflects any + // local bump the detector already performed. + let ack = SwimMessage::Ack(Ack { + probe_id: ping.probe_id, + from: self.membership.local_node_id().clone(), + incarnation: local_inc, + piggyback: vec![], + }); + let _ = self.transport.send(from_addr, ack).await; + } + + async fn handle_ping_req(&self, requester_addr: SocketAddr, req: PingReq) { + let Ok(target_sock) = req.target_addr.parse::() else { + return; + }; + + // Register a nested probe id; when the forwarded ack arrives + // we rewrap it with the original probe id and relay to the + // requester. The relay runs on a dedicated task so the detector + // run-loop stays responsive. 
+ let forward_id = self.next_probe_id(); + let Ok(forward_rx) = self.inflight.register(forward_id).await else { + return; + }; + + let local_node = self.membership.local_node_id().clone(); + let local_inc = *self.local_incarnation.lock().await; + let transport = Arc::clone(&self.transport); + let inflight = Arc::clone(&self.inflight); + let timeout_dur = self.cfg.probe_timeout; + let original_probe_id = req.probe_id; + + tokio::spawn(async move { + let send_res = transport + .send( + target_sock, + SwimMessage::Ping(Ping { + probe_id: forward_id, + from: local_node.clone(), + incarnation: local_inc, + piggyback: vec![], + }), + ) + .await; + if send_res.is_err() { + inflight.forget(forward_id).await; + return; + } + match tokio::time::timeout(timeout_dur, forward_rx).await { + Ok(Ok(SwimMessage::Ack(ack))) => { + let relay = SwimMessage::Ack(Ack { + probe_id: original_probe_id, + from: ack.from, + incarnation: ack.incarnation, + piggyback: vec![], + }); + let _ = transport.send(requester_addr, relay).await; + } + _ => { + inflight.forget(forward_id).await; + } + } + }); + } + + /// Refute a self-suspect rumour by bumping local incarnation and + /// rebroadcasting `Alive`. E-γ exposes the handle so tests can + /// assert the behaviour; the dissemination queue in E-δ will call + /// this automatically from the piggyback ingestor. 
+ #[cfg(test)] + pub async fn bump_local_incarnation(&self, past: Incarnation) -> Incarnation { + let mut guard = self.local_incarnation.lock().await; + *guard = guard.refute(past); + *guard + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::swim::detector::transport::TransportFabric; + use crate::swim::member::MemberState; + use crate::swim::wire::ProbeId; + use nodedb_types::NodeId; + use std::net::{IpAddr, Ipv4Addr}; + use std::time::Duration; + + fn addr(p: u16) -> SocketAddr { + SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), p) + } + + fn cfg() -> SwimConfig { + SwimConfig { + probe_interval: Duration::from_millis(100), + probe_timeout: Duration::from_millis(40), + indirect_probes: 2, + suspicion_mult: 4, + min_suspicion: Duration::from_millis(500), + initial_incarnation: Incarnation::ZERO, + } + } + + async fn spawn_node( + fab: &Arc, + id: &str, + port: u16, + peers: &[(String, u16)], + ) -> ( + Arc, + watch::Sender, + tokio::task::JoinHandle<()>, + ) { + let transport: Arc = Arc::new(fab.bind(addr(port)).await); + let list = Arc::new(MembershipList::new_local( + NodeId::new(id), + addr(port), + Incarnation::ZERO, + )); + for (peer_id, peer_port) in peers { + list.apply(&MemberUpdate { + node_id: NodeId::new(peer_id.as_str()), + addr: addr(*peer_port).to_string(), + state: MemberState::Alive, + incarnation: Incarnation::new(1), + }); + } + let detector = Arc::new(FailureDetector::new( + cfg(), + list, + transport, + ProbeScheduler::with_seed(port as u64), + )); + let (tx, rx) = watch::channel(false); + let handle = tokio::spawn({ + let det = Arc::clone(&detector); + async move { det.run(rx).await } + }); + (detector, tx, handle) + } + + #[tokio::test(start_paused = true)] + async fn three_node_mesh_converges_when_target_partitioned() { + let fab = TransportFabric::new(); + let peers_of = |me: &str| { + ["a", "b", "c"] + .iter() + .filter(|p| **p != me) + .map(|p| { + let port = match *p { + "a" => 7010, + "b" => 7011, + "c" => 7012, + _ => 
unreachable!(), + }; + (p.to_string(), port) + }) + .collect::>() + }; + let (det_a, sd_a, h_a) = spawn_node(&fab, "a", 7010, &peers_of("a")).await; + let (_det_b, sd_b, h_b) = spawn_node(&fab, "b", 7011, &peers_of("b")).await; + let (_det_c, sd_c, h_c) = spawn_node(&fab, "c", 7012, &peers_of("c")).await; + + // Partition b from everything (both directions). + fab.drop_edge(addr(7010), addr(7011)).await; + fab.drop_edge(addr(7011), addr(7010)).await; + fab.drop_edge(addr(7012), addr(7011)).await; + fab.drop_edge(addr(7011), addr(7012)).await; + + // Give the detector a few probe intervals to converge. Use + // advance() in a loop so timers, inflight probes, and suspicion + // expiry all get a chance to fire. + for _ in 0..30 { + tokio::time::advance(cfg().probe_interval).await; + tokio::task::yield_now().await; + } + + // A's membership view must have marked b as Dead (Suspect → + // Dead after suspicion timeout). + let m = det_a.membership.get(&NodeId::new("b")).expect("b in list"); + assert!( + matches!(m.state, MemberState::Suspect | MemberState::Dead), + "expected Suspect or Dead, got {:?}", + m.state + ); + + // Shutdown. + let _ = sd_a.send(true); + let _ = sd_b.send(true); + let _ = sd_c.send(true); + let _ = tokio::time::timeout(Duration::from_millis(200), h_a).await; + let _ = tokio::time::timeout(Duration::from_millis(200), h_b).await; + let _ = tokio::time::timeout(Duration::from_millis(200), h_c).await; + } + + #[tokio::test(start_paused = true)] + async fn ping_triggers_ack_reply() { + let fab = TransportFabric::new(); + let (_det_a, sd_a, h_a) = spawn_node(&fab, "a", 7020, &[]).await; + let probe_addr = addr(7021); + let probe_transport = Arc::new(fab.bind(probe_addr).await); + + // Send a raw Ping from probe → a and wait for the Ack. 
+ probe_transport + .send( + addr(7020), + SwimMessage::Ping(Ping { + probe_id: ProbeId::new(42), + from: NodeId::new("probe"), + incarnation: Incarnation::ZERO, + piggyback: vec![], + }), + ) + .await + .unwrap(); + + // Let the detector's recv arm fire. + for _ in 0..5 { + tokio::task::yield_now().await; + } + + let (from, msg) = tokio::time::timeout(Duration::from_millis(50), probe_transport.recv()) + .await + .expect("recv did not time out") + .expect("recv"); + assert_eq!(from, addr(7020)); + match msg { + SwimMessage::Ack(ack) => assert_eq!(ack.probe_id, ProbeId::new(42)), + other => panic!("expected Ack, got {other:?}"), + } + + let _ = sd_a.send(true); + let _ = tokio::time::timeout(Duration::from_millis(100), h_a).await; + } + + #[tokio::test(start_paused = true)] + async fn shutdown_terminates_loop_promptly() { + let fab = TransportFabric::new(); + let (_det_a, sd_a, h_a) = spawn_node(&fab, "a", 7030, &[]).await; + let _ = sd_a.send(true); + let joined = tokio::time::timeout(Duration::from_millis(100), h_a).await; + assert!(joined.is_ok(), "detector did not shut down in time"); + } + + #[tokio::test(start_paused = true)] + async fn bump_local_incarnation_is_monotonic() { + let fab = TransportFabric::new(); + let (det_a, sd_a, h_a) = spawn_node(&fab, "a", 7040, &[]).await; + let bumped = det_a.bump_local_incarnation(Incarnation::new(5)).await; + assert!(bumped > Incarnation::new(5)); + let _ = sd_a.send(true); + let _ = tokio::time::timeout(Duration::from_millis(100), h_a).await; + } +} diff --git a/nodedb-cluster/src/swim/detector/scheduler.rs b/nodedb-cluster/src/swim/detector/scheduler.rs new file mode 100644 index 00000000..63025f98 --- /dev/null +++ b/nodedb-cluster/src/swim/detector/scheduler.rs @@ -0,0 +1,202 @@ +//! Probe target scheduler — implements Lifeguard §4.3's "random +//! permutation" rule: every alive peer must be probed exactly once per +//! epoch, and epochs restart with a freshly shuffled order. +//! +//! 
The scheduler is purely in-memory, `Send`, and holds no locks — the +//! detector serializes access to it from a single task. + +use std::net::SocketAddr; + +use nodedb_types::NodeId; +use rand::SeedableRng; +use rand::rngs::SmallRng; +use rand::seq::SliceRandom; + +use crate::swim::membership::MembershipList; + +/// Round-robin probe target chooser. +#[derive(Debug)] +pub struct ProbeScheduler { + rng: SmallRng, + queue: Vec<(NodeId, SocketAddr)>, +} + +impl ProbeScheduler { + /// Construct a scheduler with a non-deterministic seed. Production. + pub fn new() -> Self { + Self { + rng: SmallRng::from_os_rng(), + queue: Vec::new(), + } + } + + /// Construct with a fixed seed for deterministic unit tests. + pub fn with_seed(seed: u64) -> Self { + Self { + rng: SmallRng::seed_from_u64(seed), + queue: Vec::new(), + } + } + + /// Return the next target to probe, or `None` if the cluster has no + /// alive peers other than ourselves. Excludes the local node and + /// every non-[`MemberState::Alive`] member. + pub fn next_target(&mut self, membership: &MembershipList) -> Option<(NodeId, SocketAddr)> { + if self.queue.is_empty() { + self.reshuffle(membership); + } + self.queue.pop() + } + + /// Rebuild the queue from the current alive set, then shuffle. + fn reshuffle(&mut self, membership: &MembershipList) { + let snap = membership.snapshot(); + let local = membership.local_node_id(); + self.queue = snap + .alive() + .filter(|m| m.node_id != *local) + .map(|m| (m.node_id.clone(), m.addr)) + .collect(); + self.queue.shuffle(&mut self.rng); + } + + /// Pick `k` indirect probe helpers for `target`, excluding the local + /// node and `target` itself. Returned order is randomized. Always + /// returns at most `k` entries; fewer if the alive set is small. 
+ pub fn pick_helpers( + &mut self, + membership: &MembershipList, + target: &NodeId, + k: usize, + ) -> Vec<(NodeId, SocketAddr)> { + let snap = membership.snapshot(); + let local = membership.local_node_id(); + let mut pool: Vec<(NodeId, SocketAddr)> = snap + .alive() + .filter(|m| m.node_id != *local && m.node_id != *target) + .map(|m| (m.node_id.clone(), m.addr)) + .collect(); + pool.shuffle(&mut self.rng); + pool.truncate(k); + pool + } +} + +impl Default for ProbeScheduler { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::swim::incarnation::Incarnation; + use crate::swim::member::MemberState; + use crate::swim::member::record::MemberUpdate; + use std::collections::HashSet; + use std::net::{IpAddr, Ipv4Addr}; + + fn addr(p: u16) -> SocketAddr { + SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), p) + } + + fn membership_with(peers: &[(&str, MemberState)]) -> MembershipList { + let list = MembershipList::new_local(NodeId::new("local"), addr(7000), Incarnation::ZERO); + for (i, (id, state)) in peers.iter().enumerate() { + list.apply(&MemberUpdate { + node_id: NodeId::new(*id), + addr: addr(7001 + i as u16).to_string(), + state: *state, + incarnation: Incarnation::new(1), + }); + } + list + } + + #[test] + fn empty_cluster_returns_none() { + let mut sched = ProbeScheduler::with_seed(1); + let list = membership_with(&[]); + assert!(sched.next_target(&list).is_none()); + } + + #[test] + fn skips_local_node() { + let mut sched = ProbeScheduler::with_seed(1); + let list = membership_with(&[("n1", MemberState::Alive)]); + let (id, _) = sched.next_target(&list).expect("target"); + assert_eq!(id, NodeId::new("n1")); + } + + #[test] + fn skips_non_alive_members() { + let mut sched = ProbeScheduler::with_seed(1); + let list = membership_with(&[ + ("n1", MemberState::Suspect), + ("n2", MemberState::Dead), + ("n3", MemberState::Alive), + ]); + let (id, _) = sched.next_target(&list).expect("target"); + assert_eq!(id, 
NodeId::new("n3")); + // Next call reshuffles; still only n3 is eligible. + let (id, _) = sched.next_target(&list).expect("target"); + assert_eq!(id, NodeId::new("n3")); + } + + #[test] + fn exhausts_permutation_then_reshuffles() { + let mut sched = ProbeScheduler::with_seed(42); + let list = membership_with(&[ + ("n1", MemberState::Alive), + ("n2", MemberState::Alive), + ("n3", MemberState::Alive), + ]); + let mut first_epoch = HashSet::new(); + for _ in 0..3 { + first_epoch.insert(sched.next_target(&list).unwrap().0); + } + assert_eq!(first_epoch.len(), 3, "epoch must touch every alive peer"); + // Fourth call triggers reshuffle and produces some peer again. + assert!(sched.next_target(&list).is_some()); + } + + #[test] + fn pick_helpers_excludes_target_and_local() { + let mut sched = ProbeScheduler::with_seed(7); + let list = membership_with(&[ + ("n1", MemberState::Alive), + ("n2", MemberState::Alive), + ("n3", MemberState::Alive), + ("n4", MemberState::Alive), + ]); + let helpers = sched.pick_helpers(&list, &NodeId::new("n2"), 3); + assert!(helpers.len() <= 3); + for (id, _) in &helpers { + assert_ne!(id.as_str(), "n2"); + assert_ne!(id.as_str(), "local"); + } + } + + #[test] + fn pick_helpers_caps_at_pool_size() { + let mut sched = ProbeScheduler::with_seed(7); + let list = membership_with(&[("n1", MemberState::Alive), ("n2", MemberState::Alive)]); + let helpers = sched.pick_helpers(&list, &NodeId::new("n2"), 5); + assert_eq!(helpers.len(), 1); // only n1 is a valid helper + } + + #[test] + fn seeded_scheduler_is_deterministic() { + let list = membership_with(&[ + ("n1", MemberState::Alive), + ("n2", MemberState::Alive), + ("n3", MemberState::Alive), + ]); + let mut a = ProbeScheduler::with_seed(99); + let mut b = ProbeScheduler::with_seed(99); + for _ in 0..5 { + assert_eq!(a.next_target(&list), b.next_target(&list)); + } + } +} diff --git a/nodedb-cluster/src/swim/detector/suspicion.rs b/nodedb-cluster/src/swim/detector/suspicion.rs new file mode 100644 
index 00000000..3947641a --- /dev/null +++ b/nodedb-cluster/src/swim/detector/suspicion.rs @@ -0,0 +1,157 @@ +//! Suspicion timer — the state that tracks which peers are in +//! [`MemberState::Suspect`] and when they should be promoted to +//! [`MemberState::Dead`]. +//! +//! Per Lifeguard §3.1, the suspicion timeout is +//! `max(min_suspicion, suspicion_mult * log2(n).max(1) * probe_interval)`, +//! where `n` is the cluster size at the moment the timer is armed. This +//! file owns the timeout math and the `(NodeId, deadline)` table; the +//! detector runner polls [`SuspicionTimer::drain_expired`] on every +//! probe tick and promotes entries whose deadline has passed. + +use std::collections::HashMap; +use std::time::Duration; + +use nodedb_types::NodeId; +use tokio::time::Instant; + +use crate::swim::config::SwimConfig; + +/// Tracks pending `Suspect → Dead` transitions. +#[derive(Debug, Default)] +pub struct SuspicionTimer { + pending: HashMap<NodeId, Instant>, +} + +impl SuspicionTimer { + /// Fresh, empty timer. + pub fn new() -> Self { + Self::default() + } + + /// Compute the suspicion timeout for a cluster of size `cluster_size`. + /// Matches Lifeguard's formula. + pub fn compute_timeout(cfg: &SwimConfig, cluster_size: usize) -> Duration { + let n = cluster_size.max(2); + let log_n = (n as f64).log2().max(1.0); + let scaled = cfg + .probe_interval + .mul_f64(log_n * cfg.suspicion_mult as f64); + scaled.max(cfg.min_suspicion) + } + + /// Arm (or refresh) the suspicion timer for `node`. The deadline is + /// computed from `now + compute_timeout(...)`. + pub fn arm(&mut self, node: NodeId, now: Instant, cfg: &SwimConfig, cluster_size: usize) { + let deadline = now + Self::compute_timeout(cfg, cluster_size); + self.pending.insert(node, deadline); + } + + /// Cancel the timer for `node` (e.g. after receiving a refuting + /// `Alive` rumour). No-op if no timer was armed.
+ pub fn cancel(&mut self, node: &NodeId) { + self.pending.remove(node); + } + + /// Number of peers currently on a suspicion timer. + pub fn len(&self) -> usize { + self.pending.len() + } + + /// True if no peers are under suspicion. + pub fn is_empty(&self) -> bool { + self.pending.is_empty() + } + + /// Return and remove every entry whose deadline has passed. Caller + /// then promotes each one to [`MemberState::Dead`] via the + /// membership list. + pub fn drain_expired(&mut self, now: Instant) -> Vec<NodeId> { + let expired: Vec<NodeId> = self + .pending + .iter() + .filter_map(|(k, &v)| if v <= now { Some(k.clone()) } else { None }) + .collect(); + for k in &expired { + self.pending.remove(k); + } + expired + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::swim::config::SwimConfig; + + fn cfg() -> SwimConfig { + SwimConfig { + probe_interval: Duration::from_millis(100), + probe_timeout: Duration::from_millis(40), + indirect_probes: 2, + suspicion_mult: 4, + min_suspicion: Duration::from_millis(500), + initial_incarnation: crate::swim::incarnation::Incarnation::ZERO, + } + } + + #[test] + fn compute_timeout_respects_min() { + // 2-node cluster: log2(2)=1, mult=4, interval=100ms → 400ms, + // which is below the 500ms floor, so the floor wins. + assert_eq!( + SuspicionTimer::compute_timeout(&cfg(), 2), + Duration::from_millis(500) + ); + } + + #[test] + fn compute_timeout_scales_with_cluster() { + // 64-node cluster: log2(64)=6, mult=4, interval=100ms → 2400ms. + assert_eq!( + SuspicionTimer::compute_timeout(&cfg(), 64), + Duration::from_millis(2400) + ); + } + + #[tokio::test(start_paused = true)] + async fn arm_and_expire() { + let mut timer = SuspicionTimer::new(); + let now = Instant::now(); + timer.arm(NodeId::new("n1"), now, &cfg(), 2); + assert_eq!(timer.len(), 1); + // Not expired yet. + assert!(timer.drain_expired(now).is_empty()); + // Advance past the 500ms floor.
+ tokio::time::advance(Duration::from_millis(600)).await; + let later = Instant::now(); + let expired = timer.drain_expired(later); + assert_eq!(expired, vec![NodeId::new("n1")]); + assert!(timer.is_empty()); + } + + #[test] + fn cancel_removes_entry() { + let mut timer = SuspicionTimer::new(); + let now = Instant::now(); + timer.arm(NodeId::new("n1"), now, &cfg(), 2); + timer.cancel(&NodeId::new("n1")); + assert!(timer.is_empty()); + } + + #[tokio::test(start_paused = true)] + async fn multiple_entries_expire_independently() { + let mut timer = SuspicionTimer::new(); + let t0 = Instant::now(); + timer.arm(NodeId::new("a"), t0, &cfg(), 2); + tokio::time::advance(Duration::from_millis(200)).await; + let t1 = Instant::now(); + timer.arm(NodeId::new("b"), t1, &cfg(), 2); + // Advance so `a` expires (>=500ms from t0) but `b` does not (<500ms from t1). + tokio::time::advance(Duration::from_millis(350)).await; + let now = Instant::now(); + let expired = timer.drain_expired(now); + assert_eq!(expired, vec![NodeId::new("a")]); + assert_eq!(timer.len(), 1); + } +} diff --git a/nodedb-cluster/src/swim/detector/transport.rs b/nodedb-cluster/src/swim/detector/transport.rs new file mode 100644 index 00000000..42e919af --- /dev/null +++ b/nodedb-cluster/src/swim/detector/transport.rs @@ -0,0 +1,203 @@ +//! SWIM transport abstraction. +//! +//! The detector talks to the network exclusively through the [`Transport`] +//! trait. Two impls exist in the crate: +//! +//! 1. [`InMemoryTransport`] — a tokio-mpsc fabric used by every E-γ unit +//! test. Supports per-edge drop and partition injection so tests can +//! deterministically simulate unreachable peers. +//! 2. The real UDP transport — lands in E-ε, not in this file. +//! +//! The trait is `Send + Sync` and its methods are `async`. Errors are +//! typed [`SwimError::TransportClosed`] variants so callers never see +//! raw `io::Error`. 
+ +use std::collections::HashMap; +use std::net::SocketAddr; +use std::sync::Arc; + +use async_trait::async_trait; +use tokio::sync::{Mutex, mpsc}; + +use crate::swim::error::SwimError; +use crate::swim::wire::SwimMessage; + +/// Abstract SWIM transport. Implementations may be unreliable (UDP-like); +/// the detector assumes nothing about ordering or delivery guarantees. +#[async_trait] +pub trait Transport: Send + Sync { + /// Send a single SWIM datagram to `to`. Errors indicate the transport + /// itself is broken, not that the peer is unreachable — an unreachable + /// peer is modelled as a silent drop. + async fn send(&self, to: SocketAddr, msg: SwimMessage) -> Result<(), SwimError>; + + /// Block until the next inbound datagram is available. Returns + /// [`SwimError::TransportClosed`] when the transport is shut down. + async fn recv(&self) -> Result<(SocketAddr, SwimMessage), SwimError>; + + /// The local bind address — returned so callers can include it in + /// outgoing messages without plumbing the address through separately. + fn local_addr(&self) -> SocketAddr; +} + +/// Test-only tokio-mpsc fabric that hosts multiple [`InMemoryTransport`] +/// endpoints sharing the same address space. +#[derive(Debug, Default)] +pub struct TransportFabric { + inner: Mutex<FabricInner>, +} + +// Test-only: the fabric is gated behind `TransportFabric` (used only by +// the detector unit tests), so the unbounded `HashMap`/`HashSet` here +// are acceptable — the "no unbounded collections in hot path" rule +// applies to production code only. +#[derive(Debug, Default)] +struct FabricInner { + /// Inbound queue per bound address. + inboxes: HashMap<SocketAddr, mpsc::Sender<(SocketAddr, SwimMessage)>>, + /// Set of (from, to) pairs whose datagrams are silently dropped. + dropped_edges: std::collections::HashSet<(SocketAddr, SocketAddr)>, +} + +impl TransportFabric { + /// Construct a new fabric. + pub fn new() -> Arc<Self> { + Arc::new(Self { + inner: Mutex::new(FabricInner::default()), + }) + } + + /// Bind a new endpoint on the fabric.
Panics only if `addr` is already + /// bound in the fabric (test-only assertion — production transport + /// lives in E-ε). + pub async fn bind(self: &Arc<Self>, addr: SocketAddr) -> InMemoryTransport { + let (tx, rx) = mpsc::channel(1024); + let mut guard = self.inner.lock().await; + assert!( + guard.inboxes.insert(addr, tx).is_none(), + "address {addr} already bound" + ); + InMemoryTransport { + addr, + fabric: Arc::clone(self), + inbox: Mutex::new(rx), + } + } + + /// Inject a permanent drop rule on the directed edge `(from, to)`. + /// Any subsequent `send` from `from` to `to` is silently discarded. + pub async fn drop_edge(&self, from: SocketAddr, to: SocketAddr) { + self.inner.lock().await.dropped_edges.insert((from, to)); + } + + /// Remove every endpoint at `addr`, simulating a crashed node: future + /// sends to it are dropped, and any in-flight recv returns + /// `TransportClosed`. + pub async fn remove(&self, addr: SocketAddr) { + self.inner.lock().await.inboxes.remove(&addr); + } +} + +/// In-memory endpoint bound to the shared [`TransportFabric`]. +#[derive(Debug)] +pub struct InMemoryTransport { + addr: SocketAddr, + fabric: Arc<TransportFabric>, + inbox: Mutex<mpsc::Receiver<(SocketAddr, SwimMessage)>>, +} + +#[async_trait] +impl Transport for InMemoryTransport { + async fn send(&self, to: SocketAddr, msg: SwimMessage) -> Result<(), SwimError> { + let inner = self.fabric.inner.lock().await; + if inner.dropped_edges.contains(&(self.addr, to)) { + return Ok(()); // silent drop + } + let Some(tx) = inner.inboxes.get(&to).cloned() else { + return Ok(()); // peer not bound; silent drop + }; + drop(inner); + // Peer's inbox is full → silently drop (UDP semantics).
+ let _ = tx.try_send((self.addr, msg)); + Ok(()) + } + + async fn recv(&self) -> Result<(SocketAddr, SwimMessage), SwimError> { + let mut rx = self.inbox.lock().await; + rx.recv().await.ok_or(SwimError::TransportClosed) + } + + fn local_addr(&self) -> SocketAddr { + self.addr + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::swim::incarnation::Incarnation; + use crate::swim::wire::{Ping, ProbeId}; + use nodedb_types::NodeId; + use std::net::{IpAddr, Ipv4Addr}; + + fn addr(p: u16) -> SocketAddr { + SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), p) + } + + fn ping() -> SwimMessage { + SwimMessage::Ping(Ping { + probe_id: ProbeId::new(1), + from: NodeId::new("a"), + incarnation: Incarnation::ZERO, + piggyback: vec![], + }) + } + + #[tokio::test] + async fn send_and_recv_roundtrip() { + let fab = TransportFabric::new(); + let a = fab.bind(addr(7000)).await; + let b = fab.bind(addr(7001)).await; + a.send(addr(7001), ping()).await.expect("send"); + let (from, msg) = b.recv().await.expect("recv"); + assert_eq!(from, addr(7000)); + assert!(matches!(msg, SwimMessage::Ping(_))); + } + + #[tokio::test] + async fn dropped_edge_silently_discards() { + let fab = TransportFabric::new(); + let a = fab.bind(addr(7000)).await; + let _b = fab.bind(addr(7001)).await; + fab.drop_edge(addr(7000), addr(7001)).await; + a.send(addr(7001), ping()).await.expect("send"); + // Recv should time out — nothing delivered. 
+ let got = tokio::time::timeout(std::time::Duration::from_millis(20), _b.recv()).await; + assert!(got.is_err(), "dropped edge should not deliver"); + } + + #[tokio::test] + async fn unbound_peer_silently_discards() { + let fab = TransportFabric::new(); + let a = fab.bind(addr(7000)).await; + a.send(addr(7999), ping()).await.expect("send to void"); + } + + #[tokio::test] + async fn remove_endpoint_closes_recv() { + let fab = TransportFabric::new(); + let b = fab.bind(addr(7001)).await; + fab.remove(addr(7001)).await; + // The bound transport still holds its Receiver — sender half is + // removed from the fabric, so future sends from other endpoints + // will now silently drop. The existing inbox is still drainable. + let _ = b; // silence unused + } + + #[tokio::test] + async fn local_addr_returns_bind() { + let fab = TransportFabric::new(); + let a = fab.bind(addr(7000)).await; + assert_eq!(a.local_addr(), addr(7000)); + } +} diff --git a/nodedb-cluster/src/swim/error.rs b/nodedb-cluster/src/swim/error.rs index 76031efd..8d93a11e 100644 --- a/nodedb-cluster/src/swim/error.rs +++ b/nodedb-cluster/src/swim/error.rs @@ -58,6 +58,17 @@ pub enum SwimError { /// causes: truncated datagram, version skew, random UDP noise. #[error("swim: decode failure: {detail}")] Decode { detail: String }, + + /// Transport backend has been closed; no further I/O is possible. + /// Returned by [`super::detector::Transport::recv`] on shutdown. + #[error("swim: transport closed")] + TransportClosed, + + /// The in-flight probe map is full. Should never happen in practice — + /// the detector caps concurrent probes at a few tens — but the error + /// exists so a runaway bug cannot corrupt the detector state. 
+ #[error("swim: probe inflight table overflow")] + ProbeInflightOverflow, } impl From<SwimError> for crate::error::ClusterError { diff --git a/nodedb-cluster/src/swim/mod.rs b/nodedb-cluster/src/swim/mod.rs index 0a051435..edf5fe97 100644 --- a/nodedb-cluster/src/swim/mod.rs +++ b/nodedb-cluster/src/swim/mod.rs @@ -21,6 +21,7 @@ //! the state-merge rule — that every later sub-batch builds on. pub mod config; +pub mod detector; pub mod error; pub mod incarnation; pub mod member; @@ -28,6 +29,9 @@ pub mod membership; pub mod wire; pub use config::SwimConfig; +pub use detector::{ + FailureDetector, InMemoryTransport, ProbeScheduler, Transport, TransportFabric, +}; pub use error::SwimError; pub use incarnation::Incarnation; pub use member::{Member, MemberState}; From 2b4a27c653e642161c9058cfe9a96c3392141299 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 04:47:51 +0800 Subject: [PATCH 2/5] chore: bump workspace crates to 0.0.3 and update lockfile Reflects the async-trait addition to nodedb-cluster and advances all workspace member versions from 0.0.2 to 0.0.3.
--- Cargo.lock | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1ebfddf5..8ae08271 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3645,7 +3645,7 @@ dependencies = [ [[package]] name = "nodedb" -version = "0.0.2" +version = "0.0.3" dependencies = [ "aes-gcm", "anyhow", @@ -3735,7 +3735,7 @@ dependencies = [ [[package]] name = "nodedb-bridge" -version = "0.0.2" +version = "0.0.3" dependencies = [ "fluxbench", "libc", @@ -3747,7 +3747,7 @@ dependencies = [ [[package]] name = "nodedb-client" -version = "0.0.2" +version = "0.0.3" dependencies = [ "async-trait", "nodedb-types", @@ -3765,8 +3765,9 @@ dependencies = [ [[package]] name = "nodedb-cluster" -version = "0.0.2" +version = "0.0.3" dependencies = [ + "async-trait", "crc32c", "nexar", "nodedb-raft", @@ -3791,7 +3792,7 @@ dependencies = [ [[package]] name = "nodedb-codec" -version = "0.0.2" +version = "0.0.3" dependencies = [ "lz4_flex 0.11.6", "pco", @@ -3807,7 +3808,7 @@ dependencies = [ [[package]] name = "nodedb-columnar" -version = "0.0.2" +version = "0.0.3" dependencies = [ "crc32c", "nodedb-codec", @@ -3824,7 +3825,7 @@ dependencies = [ [[package]] name = "nodedb-crdt" -version = "0.0.2" +version = "0.0.3" dependencies = [ "hmac 0.12.1", "loro", @@ -3837,7 +3838,7 @@ dependencies = [ [[package]] name = "nodedb-fts" -version = "0.0.2" +version = "0.0.3" dependencies = [ "icu_segmenter", "lindera", @@ -3852,7 +3853,7 @@ dependencies = [ [[package]] name = "nodedb-graph" -version = "0.0.2" +version = "0.0.3" dependencies = [ "nodedb-types", "rkyv 0.8.15", @@ -3866,7 +3867,7 @@ dependencies = [ [[package]] name = "nodedb-mem" -version = "0.0.2" +version = "0.0.3" dependencies = [ "fluxbench", "libc", @@ -3881,7 +3882,7 @@ dependencies = [ [[package]] name = "nodedb-query" -version = "0.0.2" +version = "0.0.3" dependencies = [ "nodedb-fts", "nodedb-spatial", @@ -3897,7 +3898,7 @@ dependencies = [ [[package]] name = "nodedb-raft" 
-version = "0.0.2" +version = "0.0.3" dependencies = [ "rand 0.9.4", "rkyv 0.8.15", @@ -3913,7 +3914,7 @@ dependencies = [ [[package]] name = "nodedb-spatial" -version = "0.0.2" +version = "0.0.3" dependencies = [ "h3o", "nodedb-types", @@ -3928,7 +3929,7 @@ dependencies = [ [[package]] name = "nodedb-sql" -version = "0.0.2" +version = "0.0.3" dependencies = [ "nodedb-types", "sqlparser", @@ -3937,7 +3938,7 @@ dependencies = [ [[package]] name = "nodedb-strict" -version = "0.0.2" +version = "0.0.3" dependencies = [ "arrow", "nodedb-types", @@ -3951,7 +3952,7 @@ dependencies = [ [[package]] name = "nodedb-types" -version = "0.0.2" +version = "0.0.3" dependencies = [ "nanoid", "nodedb-codec", @@ -3970,7 +3971,7 @@ dependencies = [ [[package]] name = "nodedb-vector" -version = "0.0.2" +version = "0.0.3" dependencies = [ "libc", "memmap2", @@ -3987,7 +3988,7 @@ dependencies = [ [[package]] name = "nodedb-wal" -version = "0.0.2" +version = "0.0.3" dependencies = [ "aes-gcm", "crc32c", From 95f8260c519956962e3d3bb2afc1d855c089b832 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 05:17:56 +0800 Subject: [PATCH 3/5] feat(swim): implement gossip dissemination with piggyback propagation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the dissemination module (DisseminationQueue, PendingUpdate, apply_and_disseminate) that carries membership deltas as piggyback payloads on every outgoing probe datagram. Each outgoing Ping, PingReq, and Ack now attaches up to max_piggyback rumours selected from the queue. A rumour is retired after it has been forwarded ceil(lambda * log2(n+1)) times, matching the bound from the SWIM:GREED paper (Das et al. §4.3) that guarantees with high probability every live member receives each delta. Inbound piggyback is ingested before dispatching the message so that a self-refutation incarnation bump is reflected in the outgoing Ack of the same round-trip. 
SwimConfig gains max_piggyback and fanout_lambda fields with validation; ProbeRound and DetectorRunner are wired to pass them through. Integration tests cover cross-node delta propagation and self-refutation via piggyback. --- nodedb-cluster/src/swim/config.rs | 51 +++++ .../src/swim/detector/probe_round.rs | 32 ++- nodedb-cluster/src/swim/detector/runner.rs | 177 ++++++++++++++-- nodedb-cluster/src/swim/detector/suspicion.rs | 2 + .../src/swim/dissemination/apply.rs | 152 ++++++++++++++ .../src/swim/dissemination/entry.rs | 130 ++++++++++++ nodedb-cluster/src/swim/dissemination/mod.rs | 15 ++ .../src/swim/dissemination/queue.rs | 197 ++++++++++++++++++ nodedb-cluster/src/swim/mod.rs | 2 + 9 files changed, 743 insertions(+), 15 deletions(-) create mode 100644 nodedb-cluster/src/swim/dissemination/apply.rs create mode 100644 nodedb-cluster/src/swim/dissemination/entry.rs create mode 100644 nodedb-cluster/src/swim/dissemination/mod.rs create mode 100644 nodedb-cluster/src/swim/dissemination/queue.rs diff --git a/nodedb-cluster/src/swim/config.rs b/nodedb-cluster/src/swim/config.rs index 7341463a..1623ae84 100644 --- a/nodedb-cluster/src/swim/config.rs +++ b/nodedb-cluster/src/swim/config.rs @@ -39,6 +39,17 @@ pub struct SwimConfig { /// Seed incarnation for a freshly-booted local node. Always `0` in /// production; exposed for deterministic unit tests. pub initial_incarnation: Incarnation, + + /// Maximum number of membership deltas to piggyback on a single + /// outgoing SWIM datagram. Caps per-message bandwidth and bounds + /// the encoded payload size below a UDP MTU. + pub max_piggyback: usize, + + /// Gossip fanout multiplier (`lambda` in Das §4.3). The + /// dissemination queue drops a rumour after it has been carried + /// on `ceil(fanout_lambda * log2(n+1))` outgoing messages, which + /// with high probability reaches every member. 
+ pub fanout_lambda: u32, } impl SwimConfig { @@ -51,6 +62,8 @@ impl SwimConfig { suspicion_mult: 4, min_suspicion: Duration::from_secs(2), initial_incarnation: Incarnation::ZERO, + max_piggyback: 6, + fanout_lambda: 3, } } @@ -88,6 +101,18 @@ impl SwimConfig { reason: "must be non-zero", }); } + if self.max_piggyback == 0 { + return Err(SwimError::InvalidConfig { + field: "max_piggyback", + reason: "must be at least 1", + }); + } + if self.fanout_lambda == 0 { + return Err(SwimError::InvalidConfig { + field: "fanout_lambda", + reason: "must be at least 1", + }); + } Ok(()) } } @@ -171,4 +196,30 @@ mod tests { }) )); } + + #[test] + fn zero_max_piggyback_rejected() { + let mut cfg = SwimConfig::production(); + cfg.max_piggyback = 0; + assert!(matches!( + cfg.validate(), + Err(SwimError::InvalidConfig { + field: "max_piggyback", + .. + }) + )); + } + + #[test] + fn zero_fanout_lambda_rejected() { + let mut cfg = SwimConfig::production(); + cfg.fanout_lambda = 0; + assert!(matches!( + cfg.validate(), + Err(SwimError::InvalidConfig { + field: "fanout_lambda", + .. + }) + )); + } } diff --git a/nodedb-cluster/src/swim/detector/probe_round.rs b/nodedb-cluster/src/swim/detector/probe_round.rs index 8d444c98..882a6a78 100644 --- a/nodedb-cluster/src/swim/detector/probe_round.rs +++ b/nodedb-cluster/src/swim/detector/probe_round.rs @@ -28,6 +28,7 @@ use crate::swim::wire::{Ping, PingReq, ProbeId, SwimMessage}; use super::scheduler::ProbeScheduler; use super::transport::Transport; +use crate::swim::dissemination::DisseminationQueue; use crate::swim::membership::MembershipList; /// Upper bound on concurrent inflight probes. 
The detector only issues a @@ -98,8 +99,11 @@ pub struct ProbeRound<'a, F: Fn() -> ProbeId> { pub membership: &'a MembershipList, pub transport: &'a Arc<dyn Transport>, pub inflight: &'a Arc<InflightProbes>, + pub dissemination: &'a Arc<DisseminationQueue>, pub probe_timeout: Duration, pub k_indirect: usize, + pub max_piggyback: usize, + pub fanout_lambda: u32, pub next_probe_id: F, pub local_incarnation: Incarnation, } @@ -113,12 +117,17 @@ impl<'a, F: Fn() -> ProbeId> ProbeRound<'a, F> { membership, transport, inflight, + dissemination, probe_timeout, k_indirect, + max_piggyback, + fanout_lambda, next_probe_id, local_incarnation, } = self; + + let fanout = DisseminationQueue::fanout_threshold(membership.len(), fanout_lambda); + let Some((target_id, target_addr)) = scheduler.next_target(membership) else { return Ok(ProbeOutcome::Idle); }; @@ -134,7 +143,7 @@ probe_id: direct_id, from: local.clone(), incarnation: local_incarnation, - piggyback: vec![], + piggyback: dissemination.take_for_message(max_piggyback, fanout), }), ) .await?; @@ -170,7 +179,7 @@ from: local.clone(), target: target_id.clone(), target_addr: target_addr.to_string(), - piggyback: vec![], + piggyback: dissemination.take_for_message(max_piggyback, fanout), }), ) .await?; @@ -251,6 +260,8 @@ mod tests { suspicion_mult: 4, min_suspicion: Duration::from_millis(500), initial_incarnation: Incarnation::ZERO, + max_piggyback: 6, + fanout_lambda: 3, } } @@ -287,13 +298,17 @@ let list = membership_with_peers("local", 7000, &[]).await; let mut sched = ProbeScheduler::with_seed(1); let inflight = Arc::new(InflightProbes::new()); + let dissemination = Arc::new(DisseminationQueue::new()); let outcome = ProbeRound { scheduler: &mut sched, membership: &list, transport: &local, inflight: &inflight, + dissemination: &dissemination, probe_timeout: cfg().probe_timeout, k_indirect: 2, + max_piggyback: 6, + fanout_lambda: 3, next_probe_id: pid_gen(1),
local_incarnation: Incarnation::ZERO, } @@ -314,13 +329,17 @@ mod tests { let list = membership_with_peers("local", 7000, &[("n1", 7001, MemberState::Alive)]).await; let mut sched = ProbeScheduler::with_seed(1); let inflight = Arc::new(InflightProbes::new()); + let dissemination = Arc::new(DisseminationQueue::new()); let outcome = ProbeRound { scheduler: &mut sched, membership: &list, transport: &local, inflight: &inflight, + dissemination: &dissemination, probe_timeout: cfg().probe_timeout, k_indirect: 2, + max_piggyback: 6, + fanout_lambda: 3, next_probe_id: pid_gen(1), local_incarnation: Incarnation::ZERO, } @@ -367,13 +386,17 @@ mod tests { } }); + let dissemination = Arc::new(DisseminationQueue::new()); let outcome = ProbeRound { scheduler: &mut sched, membership: &list, transport: &local, inflight: &inflight, + dissemination: &dissemination, probe_timeout: cfg().probe_timeout, k_indirect: 2, + max_piggyback: 6, + fanout_lambda: 3, next_probe_id: pid_gen(1), local_incarnation: Incarnation::ZERO, } @@ -436,13 +459,17 @@ mod tests { } }); + let dissemination = Arc::new(DisseminationQueue::new()); let outcome = ProbeRound { scheduler: &mut sched, membership: &list, transport: &local, inflight: &inflight, + dissemination: &dissemination, probe_timeout: cfg().probe_timeout, k_indirect: 2, + max_piggyback: 6, + fanout_lambda: 3, next_probe_id: pid_gen(1), local_incarnation: Incarnation::ZERO, } @@ -468,5 +495,4 @@ mod tests { .expect_err("full"); assert!(matches!(err, SwimError::ProbeInflightOverflow)); } - } diff --git a/nodedb-cluster/src/swim/detector/runner.rs b/nodedb-cluster/src/swim/detector/runner.rs index b60475ce..23997583 100644 --- a/nodedb-cluster/src/swim/detector/runner.rs +++ b/nodedb-cluster/src/swim/detector/runner.rs @@ -13,11 +13,12 @@ use tokio::sync::{Mutex, watch}; use tokio::time::{Instant, interval}; use crate::swim::config::SwimConfig; +use crate::swim::dissemination::{DisseminationQueue, apply_and_disseminate}; use 
crate::swim::error::SwimError; use crate::swim::incarnation::Incarnation; use crate::swim::member::MemberState; use crate::swim::member::record::MemberUpdate; -use crate::swim::membership::MembershipList; +use crate::swim::membership::{MembershipList, MergeOutcome}; use crate::swim::wire::{Ack, Ping, PingReq, ProbeId, SwimMessage}; use super::probe_round::{InflightProbes, ProbeOutcome, ProbeRound}; @@ -37,6 +38,7 @@ pub struct FailureDetector { scheduler: Mutex<ProbeScheduler>, suspicion: Mutex<SuspicionTimer>, inflight: Arc<InflightProbes>, + dissemination: Arc<DisseminationQueue>, probe_counter: AtomicU64, local_incarnation: Mutex<Incarnation>, } @@ -58,11 +60,34 @@ impl FailureDetector { scheduler: Mutex::new(scheduler), suspicion: Mutex::new(SuspicionTimer::new()), inflight: Arc::new(InflightProbes::new()), + dissemination: Arc::new(DisseminationQueue::new()), probe_counter: AtomicU64::new(0), local_incarnation: Mutex::new(initial_inc), } } + + /// Shared reference to the dissemination queue. Tests use it to + /// enqueue synthetic rumours without constructing a full message. + pub fn dissemination(&self) -> &Arc<DisseminationQueue> { + &self.dissemination + } + + /// Ingest every piggyback entry attached to an inbound datagram. + /// Applies each update to the membership list via + /// [`apply_and_disseminate`] and, on a self-refutation, bumps the + /// local incarnation so subsequent probes advertise the new value. + async fn ingest_piggyback(&self, piggyback: &[MemberUpdate]) { + for update in piggyback { + let outcome = apply_and_disseminate(&self.membership, &self.dissemination, update); + if let MergeOutcome::SelfRefute { new_incarnation } = outcome { + let mut guard = self.local_incarnation.lock().await; + if new_incarnation > *guard { + *guard = new_incarnation; + } + } + } + } + + /// Exposed for tests that need to route a synthetic message into the + /// inflight table without going through the transport.
#[cfg(test)] @@ -109,12 +134,13 @@ impl FailureDetector { let expired = self.suspicion.lock().await.drain_expired(now); for node_id in expired { if let Some(member) = self.membership.get(&node_id) { - self.membership.apply(&MemberUpdate { + let dead_update = MemberUpdate { node_id: node_id.clone(), addr: member.addr.to_string(), state: MemberState::Dead, incarnation: member.incarnation, - }); + }; + apply_and_disseminate(&self.membership, &self.dissemination, &dead_update); } } @@ -126,8 +152,11 @@ impl FailureDetector { membership: &self.membership, transport: &self.transport, inflight: &self.inflight, + dissemination: &self.dissemination, probe_timeout: self.cfg.probe_timeout, k_indirect: self.cfg.indirect_probes as usize, + max_piggyback: self.cfg.max_piggyback, + fanout_lambda: self.cfg.fanout_lambda, next_probe_id: || self.next_probe_id(), local_incarnation: local_inc, } @@ -139,12 +168,13 @@ impl FailureDetector { Ok(ProbeOutcome::Idle) | Ok(ProbeOutcome::Acked { .. }) => {} Ok(ProbeOutcome::Suspect { target }) => { if let Some(member) = self.membership.get(&target) { - self.membership.apply(&MemberUpdate { + let suspect_update = MemberUpdate { node_id: target.clone(), addr: member.addr.to_string(), state: MemberState::Suspect, incarnation: member.incarnation, - }); + }; + apply_and_disseminate(&self.membership, &self.dissemination, &suspect_update); let cluster_size = self.membership.len(); self.suspicion.lock().await.arm( target, @@ -159,6 +189,9 @@ impl FailureDetector { } async fn on_incoming(&self, from_addr: SocketAddr, msg: SwimMessage) { + // Every datagram carries piggyback; ingest before dispatching so + // a self-refutation bump is reflected in the outgoing Ack below. 
+ self.ingest_piggyback(msg.piggyback()).await; match msg { SwimMessage::Ping(ping) => self.handle_ping(from_addr, ping).await, SwimMessage::PingReq(req) => self.handle_ping_req(from_addr, req).await, @@ -177,15 +210,15 @@ impl FailureDetector { async fn handle_ping(&self, from_addr: SocketAddr, ping: Ping) { let local_inc = *self.local_incarnation.lock().await; - // Any self-refutation bump from piggyback is handled by - // `MembershipList::apply`; E-γ does not yet ingest piggyback - // (that's E-δ) but the reply incarnation still reflects any - // local bump the detector already performed. + let fanout = + DisseminationQueue::fanout_threshold(self.membership.len(), self.cfg.fanout_lambda); let ack = SwimMessage::Ack(Ack { probe_id: ping.probe_id, from: self.membership.local_node_id().clone(), incarnation: local_inc, - piggyback: vec![], + piggyback: self + .dissemination + .take_for_message(self.cfg.max_piggyback, fanout), }); let _ = self.transport.send(from_addr, ack).await; } @@ -208,7 +241,11 @@ impl FailureDetector { let local_inc = *self.local_incarnation.lock().await; let transport = Arc::clone(&self.transport); let inflight = Arc::clone(&self.inflight); + let dissemination = Arc::clone(&self.dissemination); let timeout_dur = self.cfg.probe_timeout; + let max_piggyback = self.cfg.max_piggyback; + let fanout = + DisseminationQueue::fanout_threshold(self.membership.len(), self.cfg.fanout_lambda); let original_probe_id = req.probe_id; tokio::spawn(async move { @@ -219,7 +256,7 @@ impl FailureDetector { probe_id: forward_id, from: local_node.clone(), incarnation: local_inc, - piggyback: vec![], + piggyback: dissemination.take_for_message(max_piggyback, fanout), }), ) .await; @@ -233,7 +270,7 @@ impl FailureDetector { probe_id: original_probe_id, from: ack.from, incarnation: ack.incarnation, - piggyback: vec![], + piggyback: dissemination.take_for_message(max_piggyback, fanout), }); let _ = transport.send(requester_addr, relay).await; } @@ -278,6 +315,8 @@ mod 
tests { suspicion_mult: 4, min_suspicion: Duration::from_millis(500), initial_incarnation: Incarnation::ZERO, + max_piggyback: 6, + fanout_lambda: 3, } } @@ -431,4 +470,118 @@ mod tests { let _ = sd_a.send(true); let _ = tokio::time::timeout(Duration::from_millis(100), h_a).await; } + + /// Enqueue a synthetic rumour about a never-probed peer on node A's + /// dissemination queue, then let the 3-node mesh run a few probe + /// rounds. Nodes B and C must observe the delta via piggyback. + #[tokio::test(start_paused = true)] + async fn piggyback_propagates_delta_to_peers() { + let fab = TransportFabric::new(); + let peers_of = |me: &str| { + ["a", "b", "c"] + .iter() + .filter(|p| **p != me) + .map(|p| { + let port = match *p { + "a" => 7050, + "b" => 7051, + "c" => 7052, + _ => unreachable!(), + }; + (p.to_string(), port) + }) + .collect::>() + }; + let (det_a, sd_a, h_a) = spawn_node(&fab, "a", 7050, &peers_of("a")).await; + let (det_b, sd_b, h_b) = spawn_node(&fab, "b", 7051, &peers_of("b")).await; + let (det_c, sd_c, h_c) = spawn_node(&fab, "c", 7052, &peers_of("c")).await; + + // Synthetic rumour: "ghost" is an Alive peer A learned about + // out of band. It is NOT in B or C's membership initially. + det_a.dissemination().enqueue(MemberUpdate { + node_id: NodeId::new("ghost"), + addr: "127.0.0.1:9999".to_string(), + state: MemberState::Alive, + incarnation: Incarnation::new(1), + }); + // A's list has to know about ghost too, otherwise the outgoing + // piggyback is still correct but there's nothing asserting the + // local state. Apply it now. + det_a.membership.apply(&MemberUpdate { + node_id: NodeId::new("ghost"), + addr: "127.0.0.1:9999".to_string(), + state: MemberState::Alive, + incarnation: Incarnation::new(1), + }); + + // Run enough probe rounds for gossip to reach B and C. 
+ for _ in 0..20 { + tokio::time::advance(cfg().probe_interval).await; + tokio::task::yield_now().await; + } + + assert!( + det_b.membership.get(&NodeId::new("ghost")).is_some(), + "B must learn about ghost via piggyback" + ); + assert!( + det_c.membership.get(&NodeId::new("ghost")).is_some(), + "C must learn about ghost via piggyback" + ); + + let _ = sd_a.send(true); + let _ = sd_b.send(true); + let _ = sd_c.send(true); + let _ = tokio::time::timeout(Duration::from_millis(200), h_a).await; + let _ = tokio::time::timeout(Duration::from_millis(200), h_b).await; + let _ = tokio::time::timeout(Duration::from_millis(200), h_c).await; + } + + /// A receives a Ping whose piggyback claims A is Suspect. A must + /// bump its own incarnation and enqueue an Alive refutation. + #[tokio::test(start_paused = true)] + async fn self_refute_bumps_incarnation_via_piggyback() { + let fab = TransportFabric::new(); + let (det_a, sd_a, h_a) = spawn_node(&fab, "a", 7060, &[]).await; + let probe = Arc::new(fab.bind(addr(7061)).await); + + // Send a ping whose piggyback suspects "a" at inc 7. + probe + .send( + addr(7060), + SwimMessage::Ping(Ping { + probe_id: ProbeId::new(1), + from: NodeId::new("probe"), + incarnation: Incarnation::ZERO, + piggyback: vec![MemberUpdate { + node_id: NodeId::new("a"), + addr: addr(7060).to_string(), + state: MemberState::Suspect, + incarnation: Incarnation::new(7), + }], + }), + ) + .await + .unwrap(); + + // Drain the Ack so the detector actually processes recv. + let (_from, _ack) = tokio::time::timeout(Duration::from_millis(50), probe.recv()) + .await + .expect("did not time out") + .expect("recv"); + + // A's local incarnation must now be > 7. + let bumped = *det_a.local_incarnation.lock().await; + assert!( + bumped > Incarnation::new(7), + "local incarnation {bumped:?} did not refute rumoured Suspect(7)" + ); + // A's membership view for itself is Alive at the bumped value. 
+ let me = det_a.membership.get(&NodeId::new("a")).expect("self"); + assert_eq!(me.state, MemberState::Alive); + assert!(me.incarnation > Incarnation::new(7)); + + let _ = sd_a.send(true); + let _ = tokio::time::timeout(Duration::from_millis(100), h_a).await; + } } diff --git a/nodedb-cluster/src/swim/detector/suspicion.rs b/nodedb-cluster/src/swim/detector/suspicion.rs index 3947641a..143c8bcc 100644 --- a/nodedb-cluster/src/swim/detector/suspicion.rs +++ b/nodedb-cluster/src/swim/detector/suspicion.rs @@ -92,6 +92,8 @@ mod tests { suspicion_mult: 4, min_suspicion: Duration::from_millis(500), initial_incarnation: crate::swim::incarnation::Incarnation::ZERO, + max_piggyback: 6, + fanout_lambda: 3, } } diff --git a/nodedb-cluster/src/swim/dissemination/apply.rs b/nodedb-cluster/src/swim/dissemination/apply.rs new file mode 100644 index 00000000..13d095bb --- /dev/null +++ b/nodedb-cluster/src/swim/dissemination/apply.rs @@ -0,0 +1,152 @@ +//! `apply_and_disseminate` — keep the dissemination queue consistent +//! with every [`MembershipList::apply`] outcome. +//! +//! The rules, per Lifeguard: +//! +//! - `Insert` / `Apply` → enqueue the incoming update (it is now our +//! new truth, and the rest of the cluster should learn about it). +//! - `SelfRefute { new_incarnation }` → enqueue a refutation `Alive` +//! update for the local node at the bumped incarnation. This is the +//! critical path for recovering from a false suspicion. +//! - `Refute` → enqueue the **stored** view of the contested node so +//! the sender (and anyone it gossips to) learns the newer truth. +//! - `Ignore` / `TerminalLeft` → no-op. + +use crate::swim::member::MemberState; +use crate::swim::member::record::MemberUpdate; +use crate::swim::membership::{MembershipList, MergeOutcome}; + +use super::queue::DisseminationQueue; + +/// Apply `update` to `list` and reflect the outcome in `queue`. +/// Returns the raw merge outcome so callers can still branch on it. 
+pub fn apply_and_disseminate( + list: &MembershipList, + queue: &DisseminationQueue, + update: &MemberUpdate, +) -> MergeOutcome { + let outcome = list.apply(update); + match &outcome { + MergeOutcome::Insert | MergeOutcome::Apply => { + queue.enqueue(update.clone()); + } + MergeOutcome::SelfRefute { new_incarnation } => { + if let Some(local) = list.get(list.local_node_id()) { + queue.enqueue(MemberUpdate { + node_id: local.node_id.clone(), + addr: local.addr.to_string(), + state: MemberState::Alive, + incarnation: *new_incarnation, + }); + } + } + MergeOutcome::Refute => { + if let Some(stored) = list.get(&update.node_id) { + queue.enqueue(MemberUpdate::from(&stored)); + } + } + MergeOutcome::Ignore | MergeOutcome::TerminalLeft => {} + } + outcome +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::swim::incarnation::Incarnation; + use nodedb_types::NodeId; + use std::net::{IpAddr, Ipv4Addr, SocketAddr}; + + fn addr(p: u16) -> SocketAddr { + SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), p) + } + + fn local_list() -> MembershipList { + MembershipList::new_local(NodeId::new("local"), addr(7000), Incarnation::ZERO) + } + + fn upd(id: &str, state: MemberState, inc: u64) -> MemberUpdate { + MemberUpdate { + node_id: NodeId::new(id), + addr: addr(7001).to_string(), + state, + incarnation: Incarnation::new(inc), + } + } + + #[test] + fn insert_enqueues() { + let list = local_list(); + let q = DisseminationQueue::new(); + let out = apply_and_disseminate(&list, &q, &upd("n1", MemberState::Alive, 0)); + assert_eq!(out, MergeOutcome::Insert); + assert_eq!(q.len(), 1); + } + + #[test] + fn apply_enqueues() { + let list = local_list(); + let q = DisseminationQueue::new(); + apply_and_disseminate(&list, &q, &upd("n1", MemberState::Alive, 0)); + q.clear(); + let out = apply_and_disseminate(&list, &q, &upd("n1", MemberState::Suspect, 1)); + assert_eq!(out, MergeOutcome::Apply); + assert_eq!(q.len(), 1); + } + + #[test] + fn ignore_leaves_queue_empty() { + let list = 
local_list(); + let q = DisseminationQueue::new(); + apply_and_disseminate(&list, &q, &upd("n1", MemberState::Alive, 3)); + q.clear(); + let out = apply_and_disseminate(&list, &q, &upd("n1", MemberState::Alive, 3)); + assert_eq!(out, MergeOutcome::Ignore); + assert!(q.is_empty()); + } + + #[test] + fn refute_enqueues_stored_view() { + let list = local_list(); + let q = DisseminationQueue::new(); + apply_and_disseminate(&list, &q, &upd("n1", MemberState::Alive, 5)); + q.clear(); + let out = apply_and_disseminate(&list, &q, &upd("n1", MemberState::Suspect, 3)); + assert_eq!(out, MergeOutcome::Refute); + // Queue should now hold the stored (newer) view of n1 at inc=5. + let taken = q.take_for_message(4, 8); + assert_eq!(taken.len(), 1); + assert_eq!(taken[0].incarnation, Incarnation::new(5)); + assert_eq!(taken[0].state, MemberState::Alive); + } + + #[test] + fn self_refute_enqueues_bumped_local_update() { + let list = local_list(); + let q = DisseminationQueue::new(); + let out = apply_and_disseminate(&list, &q, &upd("local", MemberState::Suspect, 3)); + match out { + MergeOutcome::SelfRefute { new_incarnation } => { + assert_eq!(new_incarnation, Incarnation::new(4)); + } + other => panic!("expected SelfRefute, got {other:?}"), + } + let taken = q.take_for_message(4, 8); + assert_eq!(taken.len(), 1); + assert_eq!(taken[0].node_id.as_str(), "local"); + assert_eq!(taken[0].state, MemberState::Alive); + assert_eq!(taken[0].incarnation, Incarnation::new(4)); + } + + #[test] + fn terminal_left_is_noop_on_queue() { + let list = local_list(); + let q = DisseminationQueue::new(); + apply_and_disseminate(&list, &q, &upd("n1", MemberState::Alive, 0)); + apply_and_disseminate(&list, &q, &upd("n1", MemberState::Left, 1)); + q.clear(); + let out = apply_and_disseminate(&list, &q, &upd("n1", MemberState::Alive, 99)); + assert_eq!(out, MergeOutcome::TerminalLeft); + assert!(q.is_empty()); + } +} diff --git a/nodedb-cluster/src/swim/dissemination/entry.rs 
b/nodedb-cluster/src/swim/dissemination/entry.rs new file mode 100644 index 00000000..8a761c0f --- /dev/null +++ b/nodedb-cluster/src/swim/dissemination/entry.rs @@ -0,0 +1,130 @@ +//! `PendingUpdate` — one membership delta waiting to be gossiped. +//! +//! Each entry tracks how many outgoing datagrams have already carried it +//! so the queue can pick the least-disseminated rumours first and drop +//! them after `fanout_lambda * log(n)` sends (Lifeguard §4.4). + +use std::cmp::Ordering; + +use crate::swim::member::record::MemberUpdate; + +/// A membership delta plus its dissemination bookkeeping. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PendingUpdate { + pub update: MemberUpdate, + pub sent_count: u32, +} + +impl PendingUpdate { + /// Fresh entry with `sent_count = 0`. + pub fn new(update: MemberUpdate) -> Self { + Self { + update, + sent_count: 0, + } + } + + /// Bump the send counter. Used after the queue emits this entry on + /// an outgoing datagram. + pub fn record_sent(&mut self) { + self.sent_count = self.sent_count.saturating_add(1); + } +} + +/// Ordering: **lower `sent_count` is "greater"** so a `BinaryHeap` +/// (max-heap) pops the least-disseminated entry first. Tie-breaks on +/// `(node_id, incarnation)` keep the order deterministic for tests. +impl Ord for PendingUpdate { + fn cmp(&self, other: &Self) -> Ordering { + // Max-heap semantics: the "greatest" entry pops first. + // - Lower `sent_count` should pop first → reverse that field. + // - Alphabetically earlier `node_id` should pop first on ties + // → reverse that field too. + // - Higher `incarnation` pops first on further ties (fresher + // rumour wins) → natural order. 
+ other + .sent_count + .cmp(&self.sent_count) + .then_with(|| { + other + .update + .node_id + .as_str() + .cmp(self.update.node_id.as_str()) + }) + .then_with(|| self.update.incarnation.cmp(&other.update.incarnation)) + } +} + +impl PartialOrd for PendingUpdate { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::swim::incarnation::Incarnation; + use crate::swim::member::MemberState; + use nodedb_types::NodeId; + use std::collections::BinaryHeap; + + fn upd(id: &str, inc: u64) -> MemberUpdate { + MemberUpdate { + node_id: NodeId::new(id), + addr: "127.0.0.1:7000".to_string(), + state: MemberState::Alive, + incarnation: Incarnation::new(inc), + } + } + + #[test] + fn record_sent_increments() { + let mut p = PendingUpdate::new(upd("n1", 0)); + assert_eq!(p.sent_count, 0); + p.record_sent(); + p.record_sent(); + assert_eq!(p.sent_count, 2); + } + + #[test] + fn record_sent_saturates() { + let mut p = PendingUpdate { + update: upd("n1", 0), + sent_count: u32::MAX, + }; + p.record_sent(); + assert_eq!(p.sent_count, u32::MAX); + } + + #[test] + fn binary_heap_pops_least_sent_first() { + let mut heap = BinaryHeap::new(); + heap.push(PendingUpdate { + update: upd("a", 0), + sent_count: 3, + }); + heap.push(PendingUpdate { + update: upd("b", 0), + sent_count: 0, + }); + heap.push(PendingUpdate { + update: upd("c", 0), + sent_count: 1, + }); + assert_eq!(heap.pop().unwrap().update.node_id.as_str(), "b"); + assert_eq!(heap.pop().unwrap().update.node_id.as_str(), "c"); + assert_eq!(heap.pop().unwrap().update.node_id.as_str(), "a"); + } + + #[test] + fn tie_break_is_deterministic() { + let mut heap = BinaryHeap::new(); + heap.push(PendingUpdate::new(upd("b", 1))); + heap.push(PendingUpdate::new(upd("a", 1))); + // Same sent_count; alphabetical tie-break pops "a" first. 
+ assert_eq!(heap.pop().unwrap().update.node_id.as_str(), "a"); + assert_eq!(heap.pop().unwrap().update.node_id.as_str(), "b"); + } +} diff --git a/nodedb-cluster/src/swim/dissemination/mod.rs b/nodedb-cluster/src/swim/dissemination/mod.rs new file mode 100644 index 00000000..28da92ba --- /dev/null +++ b/nodedb-cluster/src/swim/dissemination/mod.rs @@ -0,0 +1,15 @@ +//! Piggyback dissemination queue. +//! +//! SWIM spreads membership deltas by attaching a small number of recent +//! rumours to every probe datagram. This module owns the queue, the +//! decay rule (Lifeguard §4.4 "least-disseminated first, drop after +//! lambda * log(n) sends"), and the thin wrapper that keeps the queue +//! consistent with [`super::MembershipList::apply`] outcomes. + +pub mod apply; +pub mod entry; +pub mod queue; + +pub use apply::apply_and_disseminate; +pub use entry::PendingUpdate; +pub use queue::DisseminationQueue; diff --git a/nodedb-cluster/src/swim/dissemination/queue.rs b/nodedb-cluster/src/swim/dissemination/queue.rs new file mode 100644 index 00000000..bba9c580 --- /dev/null +++ b/nodedb-cluster/src/swim/dissemination/queue.rs @@ -0,0 +1,197 @@ +//! `DisseminationQueue` — bounded, decaying piggyback buffer. +//! +//! The queue is a `HashMap` so that a fresher +//! rumour about the same node **replaces** its stale predecessor in +//! place — otherwise the queue would balloon with outdated tombstones. +//! +//! Emission picks the `max_piggyback` least-disseminated entries, +//! increments their send counters, and drops any entry that has now +//! reached the `lambda_log_n` fanout threshold. + +use std::collections::HashMap; +use std::sync::Mutex; + +use nodedb_types::NodeId; + +use crate::swim::member::record::MemberUpdate; + +use super::entry::PendingUpdate; + +/// Bounded dissemination buffer keyed by `NodeId`. +#[derive(Debug, Default)] +pub struct DisseminationQueue { + inner: Mutex>, +} + +impl DisseminationQueue { + /// Fresh empty queue. 
+ pub fn new() -> Self { + Self::default() + } + + /// Insert or replace the entry for `update.node_id`. The send + /// counter resets to 0 so a fresh rumour is always gossiped anew. + pub fn enqueue(&self, update: MemberUpdate) { + let mut guard = self.inner.lock().expect("dissemination lock poisoned"); + guard.insert(update.node_id.clone(), PendingUpdate::new(update)); + } + + /// Total number of rumours currently in the queue. + pub fn len(&self) -> usize { + self.inner + .lock() + .expect("dissemination lock poisoned") + .len() + } + + /// True when the queue is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Drop every rumour. Used by tests and on shutdown. + pub fn clear(&self) { + self.inner + .lock() + .expect("dissemination lock poisoned") + .clear(); + } + + /// Return up to `max` least-disseminated updates for a single + /// outgoing message. Increments each returned entry's `sent_count` + /// and drops entries whose new count has reached `lambda_log_n`. + /// + /// `lambda_log_n` is `ceil(lambda * log2(cluster_size + 1))` — + /// computed by the caller because it depends on the current + /// membership size and the [`super::super::config::SwimConfig`] + /// `fanout_lambda` knob. + pub fn take_for_message(&self, max: usize, lambda_log_n: u32) -> Vec { + if max == 0 { + return Vec::new(); + } + let mut guard = self.inner.lock().expect("dissemination lock poisoned"); + + // Sort a snapshot of keys by (sent_count, node_id) ascending; + // we can't use BinaryHeap directly without cloning because the + // values need to be mutated in place after the decision. 
+ let mut keys: Vec = guard.keys().cloned().collect(); + keys.sort_by(|a, b| { + let pa = &guard[a]; + let pb = &guard[b]; + pa.sent_count + .cmp(&pb.sent_count) + .then_with(|| a.as_str().cmp(b.as_str())) + .then_with(|| pa.update.incarnation.cmp(&pb.update.incarnation)) + }); + keys.truncate(max); + + let mut out = Vec::with_capacity(keys.len()); + for k in keys { + if let Some(pending) = guard.get_mut(&k) { + pending.record_sent(); + out.push(pending.update.clone()); + if pending.sent_count >= lambda_log_n { + guard.remove(&k); + } + } + } + out + } + + /// Compute `ceil(lambda * log2(cluster_size + 1))`. Exposed so the + /// runner can pass the result straight into [`take_for_message`]. + pub fn fanout_threshold(cluster_size: usize, lambda: u32) -> u32 { + let n = (cluster_size + 1).max(2) as f64; + (lambda as f64 * n.log2()).ceil() as u32 + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::swim::incarnation::Incarnation; + use crate::swim::member::MemberState; + + fn upd(id: &str, inc: u64) -> MemberUpdate { + MemberUpdate { + node_id: NodeId::new(id), + addr: "127.0.0.1:7000".to_string(), + state: MemberState::Alive, + incarnation: Incarnation::new(inc), + } + } + + #[test] + fn enqueue_replaces_by_node_id() { + let q = DisseminationQueue::new(); + q.enqueue(upd("n1", 1)); + q.enqueue(upd("n1", 5)); + assert_eq!(q.len(), 1); + // Taking it returns the latest incarnation. 
+ let out = q.take_for_message(10, 4); + assert_eq!(out.len(), 1); + assert_eq!(out[0].incarnation, Incarnation::new(5)); + } + + #[test] + fn take_caps_at_max() { + let q = DisseminationQueue::new(); + q.enqueue(upd("n1", 0)); + q.enqueue(upd("n2", 0)); + q.enqueue(upd("n3", 0)); + let out = q.take_for_message(2, 10); + assert_eq!(out.len(), 2); + } + + #[test] + fn take_zero_max_returns_empty() { + let q = DisseminationQueue::new(); + q.enqueue(upd("n1", 0)); + let out = q.take_for_message(0, 4); + assert!(out.is_empty()); + } + + #[test] + fn entries_drop_after_fanout_threshold() { + let q = DisseminationQueue::new(); + q.enqueue(upd("n1", 0)); + // threshold = 2 → second take should drain and drop. + let _ = q.take_for_message(1, 2); + assert_eq!(q.len(), 1); + let _ = q.take_for_message(1, 2); + assert_eq!(q.len(), 0); + } + + #[test] + fn least_disseminated_wins() { + let q = DisseminationQueue::new(); + q.enqueue(upd("a", 0)); + // Drain "a" twice so its sent_count reaches 2. + let _ = q.take_for_message(1, 10); + let _ = q.take_for_message(1, 10); + // Now enqueue a fresh "b" with sent_count=0. + q.enqueue(upd("b", 0)); + // Next take should pick "b" (count=0) over "a" (count=2). + let out = q.take_for_message(1, 10); + assert_eq!(out[0].node_id.as_str(), "b"); + } + + #[test] + fn fanout_threshold_formula() { + // 7-node cluster, lambda=3 → ceil(3 * log2(8)) = 9. + assert_eq!(DisseminationQueue::fanout_threshold(7, 3), 9); + // 1-node cluster, lambda=3 → ceil(3 * log2(2)) = 3. + assert_eq!(DisseminationQueue::fanout_threshold(1, 3), 3); + // 0-node cluster, lambda=3 → ceil(3 * log2(2)) = 3 (clamped). 
+ assert_eq!(DisseminationQueue::fanout_threshold(0, 3), 3); + } + + #[test] + fn clear_empties_queue() { + let q = DisseminationQueue::new(); + q.enqueue(upd("n1", 0)); + q.enqueue(upd("n2", 0)); + q.clear(); + assert!(q.is_empty()); + } +} diff --git a/nodedb-cluster/src/swim/mod.rs b/nodedb-cluster/src/swim/mod.rs index edf5fe97..8112e9db 100644 --- a/nodedb-cluster/src/swim/mod.rs +++ b/nodedb-cluster/src/swim/mod.rs @@ -22,6 +22,7 @@ pub mod config; pub mod detector; +pub mod dissemination; pub mod error; pub mod incarnation; pub mod member; @@ -32,6 +33,7 @@ pub use config::SwimConfig; pub use detector::{ FailureDetector, InMemoryTransport, ProbeScheduler, Transport, TransportFabric, }; +pub use dissemination::{DisseminationQueue, PendingUpdate, apply_and_disseminate}; pub use error::SwimError; pub use incarnation::Incarnation; pub use member::{Member, MemberState}; From 44a0d4b031379c2a0a9ca377d8c343b9ab909cdd Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 06:00:16 +0800 Subject: [PATCH 4/5] refactor(swim): split transport into per-impl modules, add UDP transport MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract `InMemoryTransport` into its own file and add a real `UdpTransport` alongside it, with the shared `Transport` trait and `TransportFabric` living in a `transport/mod.rs` hub. The old monolithic `transport.rs` held the trait, the in-memory fabric, and placeholder comments for UDP; keeping them together made the file grow as soon as UDP landed. The new layout mirrors the established one-concern-per-file rule: transport/mod.rs — re-exports + shared trait transport/in_memory.rs — test fabric (mpsc channels, drop injection) transport/udp.rs — production UDP transport (tokio UdpSocket) `UdpTransport` is now re-exported from the `swim` public API so cluster startup and integration tests can construct one without reaching into internal modules. 
--- nodedb-cluster/src/swim/detector/mod.rs | 2 +- .../{transport.rs => transport/in_memory.rs} | 47 +---- .../src/swim/detector/transport/mod.rs | 45 +++++ .../src/swim/detector/transport/udp.rs | 188 ++++++++++++++++++ 4 files changed, 241 insertions(+), 41 deletions(-) rename nodedb-cluster/src/swim/detector/{transport.rs => transport/in_memory.rs} (73%) create mode 100644 nodedb-cluster/src/swim/detector/transport/mod.rs create mode 100644 nodedb-cluster/src/swim/detector/transport/udp.rs diff --git a/nodedb-cluster/src/swim/detector/mod.rs b/nodedb-cluster/src/swim/detector/mod.rs index 08750ec9..829d285a 100644 --- a/nodedb-cluster/src/swim/detector/mod.rs +++ b/nodedb-cluster/src/swim/detector/mod.rs @@ -17,4 +17,4 @@ pub use probe_round::{ProbeOutcome, ProbeRound}; pub use runner::FailureDetector; pub use scheduler::ProbeScheduler; pub use suspicion::SuspicionTimer; -pub use transport::{InMemoryTransport, Transport, TransportFabric}; +pub use transport::{InMemoryTransport, Transport, TransportFabric, UdpTransport}; diff --git a/nodedb-cluster/src/swim/detector/transport.rs b/nodedb-cluster/src/swim/detector/transport/in_memory.rs similarity index 73% rename from nodedb-cluster/src/swim/detector/transport.rs rename to nodedb-cluster/src/swim/detector/transport/in_memory.rs index 42e919af..8af47c3e 100644 --- a/nodedb-cluster/src/swim/detector/transport.rs +++ b/nodedb-cluster/src/swim/detector/transport/in_memory.rs @@ -1,45 +1,16 @@ -//! SWIM transport abstraction. -//! -//! The detector talks to the network exclusively through the [`Transport`] -//! trait. Two impls exist in the crate: -//! -//! 1. [`InMemoryTransport`] — a tokio-mpsc fabric used by every E-γ unit -//! test. Supports per-edge drop and partition injection so tests can -//! deterministically simulate unreachable peers. -//! 2. The real UDP transport — lands in E-ε, not in this file. -//! -//! The trait is `Send + Sync` and its methods are `async`. Errors are -//! 
typed [`SwimError::TransportClosed`] variants so callers never see -//! raw `io::Error`. - -use std::collections::HashMap; +//! Test-only tokio-mpsc fabric implementing [`super::Transport`]. + +use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use std::sync::Arc; use async_trait::async_trait; use tokio::sync::{Mutex, mpsc}; +use super::Transport; use crate::swim::error::SwimError; use crate::swim::wire::SwimMessage; -/// Abstract SWIM transport. Implementations may be unreliable (UDP-like); -/// the detector assumes nothing about ordering or delivery guarantees. -#[async_trait] -pub trait Transport: Send + Sync { - /// Send a single SWIM datagram to `to`. Errors indicate the transport - /// itself is broken, not that the peer is unreachable — an unreachable - /// peer is modelled as a silent drop. - async fn send(&self, to: SocketAddr, msg: SwimMessage) -> Result<(), SwimError>; - - /// Block until the next inbound datagram is available. Returns - /// [`SwimError::TransportClosed`] when the transport is shut down. - async fn recv(&self) -> Result<(SocketAddr, SwimMessage), SwimError>; - - /// The local bind address — returned so callers can include it in - /// outgoing messages without plumbing the address through separately. - fn local_addr(&self) -> SocketAddr; -} - /// Test-only tokio-mpsc fabric that hosts multiple [`InMemoryTransport`] /// endpoints sharing the same address space. #[derive(Debug, Default)] @@ -56,7 +27,7 @@ struct FabricInner { /// Inbound queue per bound address. inboxes: HashMap>, /// Set of (from, to) pairs whose datagrams are silently dropped. - dropped_edges: std::collections::HashSet<(SocketAddr, SocketAddr)>, + dropped_edges: HashSet<(SocketAddr, SocketAddr)>, } impl TransportFabric { @@ -69,7 +40,7 @@ impl TransportFabric { /// Bind a new endpoint on the fabric. Panics only if `addr` is already /// bound in the fabric (test-only assertion — production transport - /// lives in E-ε). + /// is [`super::UdpTransport`]). 
pub async fn bind(self: &Arc, addr: SocketAddr) -> InMemoryTransport { let (tx, rx) = mpsc::channel(1024); let mut guard = self.inner.lock().await; @@ -171,7 +142,6 @@ mod tests { let _b = fab.bind(addr(7001)).await; fab.drop_edge(addr(7000), addr(7001)).await; a.send(addr(7001), ping()).await.expect("send"); - // Recv should time out — nothing delivered. let got = tokio::time::timeout(std::time::Duration::from_millis(20), _b.recv()).await; assert!(got.is_err(), "dropped edge should not deliver"); } @@ -188,10 +158,7 @@ mod tests { let fab = TransportFabric::new(); let b = fab.bind(addr(7001)).await; fab.remove(addr(7001)).await; - // The bound transport still holds its Receiver — sender half is - // removed from the fabric, so future sends from other endpoints - // will now silently drop. The existing inbox is still drainable. - let _ = b; // silence unused + let _ = b; } #[tokio::test] diff --git a/nodedb-cluster/src/swim/detector/transport/mod.rs b/nodedb-cluster/src/swim/detector/transport/mod.rs new file mode 100644 index 00000000..7e5a7533 --- /dev/null +++ b/nodedb-cluster/src/swim/detector/transport/mod.rs @@ -0,0 +1,45 @@ +//! SWIM transport abstraction. +//! +//! The detector talks to the network exclusively through the [`Transport`] +//! trait. Two production-facing impls exist: +//! +//! 1. [`in_memory::InMemoryTransport`] — a tokio-mpsc fabric used by every +//! unit test. Supports per-edge drop and partition injection so tests +//! can deterministically simulate unreachable peers. +//! 2. [`udp::UdpTransport`] — the real wire-level transport that binds a +//! `tokio::net::UdpSocket` and framing-encodes every datagram via +//! [`crate::swim::wire::encode`]. +//! +//! The trait is `Send + Sync` and its methods are `async`. Errors are +//! typed [`SwimError`] variants so callers never see raw `io::Error`. 
+ +pub mod in_memory; +pub mod udp; + +use std::net::SocketAddr; + +use async_trait::async_trait; + +use crate::swim::error::SwimError; +use crate::swim::wire::SwimMessage; + +pub use in_memory::{InMemoryTransport, TransportFabric}; +pub use udp::UdpTransport; + +/// Abstract SWIM transport. Implementations may be unreliable (UDP-like); +/// the detector assumes nothing about ordering or delivery guarantees. +#[async_trait] +pub trait Transport: Send + Sync { + /// Send a single SWIM datagram to `to`. Errors indicate the transport + /// itself is broken, not that the peer is unreachable — an unreachable + /// peer is modelled as a silent drop. + async fn send(&self, to: SocketAddr, msg: SwimMessage) -> Result<(), SwimError>; + + /// Block until the next inbound datagram is available. Returns + /// [`SwimError::TransportClosed`] when the transport is shut down. + async fn recv(&self) -> Result<(SocketAddr, SwimMessage), SwimError>; + + /// The local bind address — returned so callers can include it in + /// outgoing messages without plumbing the address through separately. + fn local_addr(&self) -> SocketAddr; +} diff --git a/nodedb-cluster/src/swim/detector/transport/udp.rs b/nodedb-cluster/src/swim/detector/transport/udp.rs new file mode 100644 index 00000000..d7853c25 --- /dev/null +++ b/nodedb-cluster/src/swim/detector/transport/udp.rs @@ -0,0 +1,188 @@ +//! Real UDP transport for SWIM. +//! +//! Binds a single `tokio::net::UdpSocket` and implements [`super::Transport`] +//! by framing every outbound `SwimMessage` with the zerompk wire codec +//! and parsing every inbound datagram the same way. Malformed inbound +//! bytes surface as [`SwimError::Decode`] but do not close the socket — +//! the failure detector's recv loop treats non-`TransportClosed` errors +//! as transient and keeps running. +//! +//! Datagram size is capped by [`RECV_BUF_BYTES`], which must be large +//! enough to hold a zerompk-encoded `SwimMessage` at the configured +//! 
`max_piggyback` budget. 64 KiB is the IPv4 UDP maximum and is +//! comfortably larger than any realistic SWIM payload. + +use std::net::SocketAddr; +use std::sync::Arc; + +use async_trait::async_trait; +use tokio::net::UdpSocket; +use tokio::sync::Mutex; + +use super::Transport; +use crate::swim::error::SwimError; +use crate::swim::wire::{self, SwimMessage}; + +/// IPv4 UDP maximum datagram size. zerompk-encoded SWIM messages with a +/// 6-entry piggyback fit in well under 2 KiB, so 64 KiB is abundant. +pub const RECV_BUF_BYTES: usize = 65_536; + +/// SWIM datagram transport backed by a real UDP socket. +#[derive(Debug)] +pub struct UdpTransport { + socket: Arc, + local_addr: SocketAddr, + /// Serializes access to the recv buffer. `recv_from` takes `&self` + /// on `UdpSocket`, but we need a reusable buffer without allocating + /// 64 KiB per datagram; the mutex guards that buffer. + recv_buf: Mutex>, +} + +impl UdpTransport { + /// Bind to `addr`. Passing `127.0.0.1:0` picks an ephemeral port, + /// which [`UdpTransport::local_addr`] then reports. + pub async fn bind(addr: SocketAddr) -> Result { + let socket = UdpSocket::bind(addr).await.map_err(|e| SwimError::Encode { + detail: format!("udp bind {addr}: {e}"), + })?; + let local_addr = socket.local_addr().map_err(|e| SwimError::Encode { + detail: format!("udp local_addr: {e}"), + })?; + Ok(Self { + socket: Arc::new(socket), + local_addr, + recv_buf: Mutex::new(vec![0u8; RECV_BUF_BYTES]), + }) + } +} + +#[async_trait] +impl Transport for UdpTransport { + async fn send(&self, to: SocketAddr, msg: SwimMessage) -> Result<(), SwimError> { + let bytes = wire::encode(&msg)?; + // UDP send_to is atomic per datagram; partial sends aren't a + // thing, so we only have to handle the error case. 
+ self.socket + .send_to(&bytes, to) + .await + .map(|_| ()) + .map_err(|e| SwimError::Encode { + detail: format!("udp send_to {to}: {e}"), + }) + } + + async fn recv(&self) -> Result<(SocketAddr, SwimMessage), SwimError> { + let mut buf = self.recv_buf.lock().await; + let (n, from) = + self.socket + .recv_from(&mut buf[..]) + .await + .map_err(|e| SwimError::Decode { + detail: format!("udp recv_from: {e}"), + })?; + let msg = wire::decode(&buf[..n])?; + Ok((from, msg)) + } + + fn local_addr(&self) -> SocketAddr { + self.local_addr + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::swim::incarnation::Incarnation; + use crate::swim::wire::{Ack, Ping, ProbeId}; + use nodedb_types::NodeId; + use std::net::{IpAddr, Ipv4Addr}; + use std::time::Duration; + + fn any_loopback() -> SocketAddr { + SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 0) + } + + #[tokio::test] + async fn bind_and_report_local_addr() { + let t = UdpTransport::bind(any_loopback()).await.expect("bind"); + assert_eq!(t.local_addr().ip(), IpAddr::V4(Ipv4Addr::LOCALHOST)); + assert_ne!(t.local_addr().port(), 0); + } + + #[tokio::test] + async fn send_and_recv_roundtrip_between_real_sockets() { + let a = UdpTransport::bind(any_loopback()).await.expect("bind a"); + let b = UdpTransport::bind(any_loopback()).await.expect("bind b"); + let ping = SwimMessage::Ping(Ping { + probe_id: ProbeId::new(42), + from: NodeId::new("a"), + incarnation: Incarnation::new(3), + piggyback: vec![], + }); + a.send(b.local_addr(), ping.clone()).await.expect("send"); + let (from, msg) = tokio::time::timeout(Duration::from_secs(1), b.recv()) + .await + .expect("recv timed out") + .expect("recv ok"); + assert_eq!(from, a.local_addr()); + assert_eq!(msg, ping); + } + + #[tokio::test] + async fn recv_decodes_ack_variant() { + let a = UdpTransport::bind(any_loopback()).await.expect("bind a"); + let b = UdpTransport::bind(any_loopback()).await.expect("bind b"); + let ack = SwimMessage::Ack(Ack { + probe_id: 
ProbeId::new(1), + from: NodeId::new("b"), + incarnation: Incarnation::new(7), + piggyback: vec![], + }); + a.send(b.local_addr(), ack.clone()).await.expect("send"); + let (_from, msg) = tokio::time::timeout(Duration::from_secs(1), b.recv()) + .await + .expect("recv timed out") + .expect("recv ok"); + assert_eq!(msg, ack); + } + + #[tokio::test] + async fn decode_error_on_garbage_datagram() { + // Bind one socket, send raw garbage to it from a second + // loopback socket, then call `recv` through the transport + // wrapper. The malformed bytes must surface as SwimError::Decode + // rather than close the socket. + let victim = UdpTransport::bind(any_loopback()).await.expect("bind"); + let sender = tokio::net::UdpSocket::bind(any_loopback()) + .await + .expect("sender bind"); + sender + .send_to(&[0xff_u8; 8], victim.local_addr()) + .await + .expect("send garbage"); + let err = tokio::time::timeout(Duration::from_secs(1), victim.recv()) + .await + .expect("recv timed out") + .expect_err("decode should fail"); + assert!(matches!(err, SwimError::Decode { .. })); + + // Follow up with a valid datagram to prove the socket still + // works after a bad one. 
+ let sender_transport = UdpTransport::bind(any_loopback()).await.expect("bind"); + let ping = SwimMessage::Ping(Ping { + probe_id: ProbeId::new(1), + from: NodeId::new("x"), + incarnation: Incarnation::ZERO, + piggyback: vec![], + }); + sender_transport + .send(victim.local_addr(), ping.clone()) + .await + .expect("send valid"); + let (_from, msg) = tokio::time::timeout(Duration::from_secs(1), victim.recv()) + .await + .expect("recv timed out") + .expect("recv ok"); + assert_eq!(msg, ping); + } +} From 087f5ed02c7923bc705e17356268e3bfe92596f8 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 06:00:36 +0800 Subject: [PATCH 5/5] feat(swim): add spawn entry-point, SwimHandle, and optional UDP config field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Callers previously had to wire up MembershipList, DisseminationQueue, FailureDetector, and the run-loop task themselves — a four-step sequence with easy ordering mistakes. `spawn_swim` collapses this into one call: it seeds the membership list from the provided address list, validates config, starts the detector task, and returns a `SwimHandle` that owns shutdown plumbing and exposes the shared membership and dissemination queue. `BootstrapConfig` gains `swim_udp_addr: Option` so operators can opt into SWIM by supplying a bind address. `None` keeps the existing behaviour (cluster boots without SWIM; membership is observed only through Raft). All existing callsites are updated with `swim_udp_addr: None`. `SwimHandle` and `spawn_swim` are re-exported from `nodedb_cluster` so dependent crates do not need to reach into swim internals. Adds a real-UDP integration test (`swim_udp_convergence`) that boots three SWIM nodes on ephemeral loopback ports and asserts they converge to a full Alive view before a member is shut down and the remainder observe it as Suspect/Dead. 
---
 nodedb-cluster/src/bootstrap/bootstrap_fn.rs |   1 +
 nodedb-cluster/src/bootstrap/config.rs       |   7 +
 nodedb-cluster/src/bootstrap/join.rs         |   2 +
 nodedb-cluster/src/bootstrap/probe.rs        |   1 +
 nodedb-cluster/src/bootstrap/restart.rs      |   1 +
 nodedb-cluster/src/lib.rs                    |   5 +-
 nodedb-cluster/src/swim/bootstrap.rs         | 246 +++++++++++++++++++
 nodedb-cluster/src/swim/mod.rs               |   4 +-
 nodedb-cluster/tests/common/mod.rs           |   1 +
 nodedb-cluster/tests/swim_udp_convergence.rs | 142 +++++++++++
 nodedb/src/control/cluster/init.rs           |   1 +
 11 files changed, 409 insertions(+), 2 deletions(-)
 create mode 100644 nodedb-cluster/src/swim/bootstrap.rs
 create mode 100644 nodedb-cluster/tests/swim_udp_convergence.rs

diff --git a/nodedb-cluster/src/bootstrap/bootstrap_fn.rs b/nodedb-cluster/src/bootstrap/bootstrap_fn.rs
index 2db01945..a09b2e2c 100644
--- a/nodedb-cluster/src/bootstrap/bootstrap_fn.rs
+++ b/nodedb-cluster/src/bootstrap/bootstrap_fn.rs
@@ -101,6 +101,7 @@ mod tests {
             data_dir: _dir.path().to_path_buf(),
             force_bootstrap: false,
             join_retry: Default::default(),
+            swim_udp_addr: None,
         };
 
         let state = bootstrap(&config, &catalog).unwrap();
diff --git a/nodedb-cluster/src/bootstrap/config.rs b/nodedb-cluster/src/bootstrap/config.rs
index eb2a8611..933198c5 100644
--- a/nodedb-cluster/src/bootstrap/config.rs
+++ b/nodedb-cluster/src/bootstrap/config.rs
@@ -84,6 +84,13 @@ pub struct ClusterConfig {
     /// (`8` attempts, `32 s` ceiling). Tests override this with a
     /// faster policy.
     pub join_retry: JoinRetryPolicy,
+    /// Optional UDP bind address for the SWIM failure detector. `None`
+    /// disables SWIM entirely — cluster startup then relies solely on
+    /// the existing raft transport for membership observations. When
+    /// `Some`, the operator is expected to spawn SWIM separately via
+    /// [`crate::spawn_swim`] after the cluster is up and feed the
+    /// seed list from `seed_nodes`.
+    pub swim_udp_addr: Option<SocketAddr>,
 }
 
 /// Result of cluster startup — everything needed to run the Raft loop.
diff --git a/nodedb-cluster/src/bootstrap/join.rs b/nodedb-cluster/src/bootstrap/join.rs index cad1dfad..afe6ad7a 100644 --- a/nodedb-cluster/src/bootstrap/join.rs +++ b/nodedb-cluster/src/bootstrap/join.rs @@ -449,6 +449,7 @@ mod tests { data_dir: _dir1.path().to_path_buf(), force_bootstrap: false, join_retry: Default::default(), + swim_udp_addr: None, }; let state1 = bootstrap(&config1, &catalog1).unwrap(); @@ -497,6 +498,7 @@ mod tests { data_dir: _dir2.path().to_path_buf(), force_bootstrap: false, join_retry: Default::default(), + swim_udp_addr: None, }; let lifecycle = ClusterLifecycleTracker::new(); diff --git a/nodedb-cluster/src/bootstrap/probe.rs b/nodedb-cluster/src/bootstrap/probe.rs index 389f3994..1688c87d 100644 --- a/nodedb-cluster/src/bootstrap/probe.rs +++ b/nodedb-cluster/src/bootstrap/probe.rs @@ -222,6 +222,7 @@ mod tests { data_dir: std::env::temp_dir(), force_bootstrap: false, join_retry: Default::default(), + swim_udp_addr: None, } } diff --git a/nodedb-cluster/src/bootstrap/restart.rs b/nodedb-cluster/src/bootstrap/restart.rs index a6e7577a..3306142a 100644 --- a/nodedb-cluster/src/bootstrap/restart.rs +++ b/nodedb-cluster/src/bootstrap/restart.rs @@ -111,6 +111,7 @@ mod tests { data_dir: _dir.path().to_path_buf(), force_bootstrap: false, join_retry: Default::default(), + swim_udp_addr: None, }; // Bootstrap first. 
diff --git a/nodedb-cluster/src/lib.rs b/nodedb-cluster/src/lib.rs index bf114e35..ea340b52 100644 --- a/nodedb-cluster/src/lib.rs +++ b/nodedb-cluster/src/lib.rs @@ -78,4 +78,7 @@ pub use lifecycle::{ pub use rdma_transport::{RdmaConfig, RdmaTransport}; pub use rebalance_scheduler::{NodeMetrics, RebalanceScheduler, RebalanceTrigger, SchedulerConfig}; pub use shard_split::{SplitPlan, SplitStrategy, plan_graph_split, plan_vector_split}; -pub use swim::{Incarnation, Member, MemberState, MembershipList, SwimConfig, SwimError}; +pub use swim::{ + Incarnation, Member, MemberState, MembershipList, SwimConfig, SwimError, SwimHandle, + UdpTransport, spawn as spawn_swim, +}; diff --git a/nodedb-cluster/src/swim/bootstrap.rs b/nodedb-cluster/src/swim/bootstrap.rs new file mode 100644 index 00000000..943190d1 --- /dev/null +++ b/nodedb-cluster/src/swim/bootstrap.rs @@ -0,0 +1,246 @@ +//! SWIM subsystem bootstrap. +//! +//! [`spawn`] is the one-stop entry point callers (cluster startup or +//! tests) use to stand up a running failure detector: +//! +//! 1. Constructs a [`MembershipList`] containing the local node at +//! incarnation 0. +//! 2. Seeds the list with an `Alive` entry for every address in +//! `seeds`, using a synthetic `NodeId` of the form `"seed:"`. +//! The first successful probe replaces the placeholder with the +//! peer's real node id via the normal merge path. +//! 3. Validates [`SwimConfig`] and constructs a [`FailureDetector`]. +//! 4. Spawns the detector's run loop on a fresh tokio task. +//! 5. Returns a [`SwimHandle`] the caller can use to read membership, +//! access the dissemination queue, and shut the detector down. 
+
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use nodedb_types::NodeId;
+use tokio::sync::watch;
+use tokio::task::JoinHandle;
+
+use super::config::SwimConfig;
+use super::detector::{FailureDetector, ProbeScheduler, Transport};
+use super::dissemination::DisseminationQueue;
+use super::error::SwimError;
+use super::incarnation::Incarnation;
+use super::member::MemberState;
+use super::member::record::MemberUpdate;
+use super::membership::MembershipList;
+
+/// Owns a running SWIM detector and its shutdown plumbing.
+///
+/// Dropping `SwimHandle` leaks the background task — callers should
+/// always invoke [`SwimHandle::shutdown`] to request graceful drain.
+pub struct SwimHandle {
+    detector: Arc<FailureDetector>,
+    membership: Arc<MembershipList>,
+    shutdown_tx: watch::Sender<bool>,
+    join: JoinHandle<()>,
+}
+
+impl SwimHandle {
+    /// Shared reference to the detector (for metrics, debugging, or
+    /// injecting synthetic rumours in tests).
+    pub fn detector(&self) -> &Arc<FailureDetector> {
+        &self.detector
+    }
+
+    /// Shared reference to the membership list. Clone cheaply; the
+    /// underlying `Arc` is identical to the detector's view.
+    pub fn membership(&self) -> &Arc<MembershipList> {
+        &self.membership
+    }
+
+    /// Shared reference to the dissemination queue. Used by callers
+    /// that want to enqueue rumours from outside SWIM (e.g. the raft
+    /// layer announcing a conf change).
+    pub fn dissemination(&self) -> &Arc<DisseminationQueue> {
+        self.detector.dissemination()
+    }
+
+    /// Signal the detector to shut down and await its task to finish.
+    /// Returns whatever error the join handle surfaced (normally none).
+    pub async fn shutdown(self) {
+        let _ = self.shutdown_tx.send(true);
+        let _ = self.join.await;
+    }
+}
+
+/// Bring up a SWIM failure detector.
+///
+/// * `cfg` — validated [`SwimConfig`]. An invalid config returns
+///   [`SwimError::InvalidConfig`] before any task is spawned.
+/// * `local_id` — this node's canonical id.
+/// * `local_addr` — the socket address the transport is already bound
+/// to.
The membership list stores it verbatim for peers to echo back
+/// in probe responses.
+/// * `seeds` — initial peer addresses. Empty list is legal and yields a
+///   solo-cluster detector that does nothing interesting until a peer
+///   arrives via an external join.
+/// * `transport` — any [`Transport`] impl (UDP in production, the
+///   in-memory fabric in tests).
+pub async fn spawn(
+    cfg: SwimConfig,
+    local_id: NodeId,
+    local_addr: SocketAddr,
+    seeds: Vec<SocketAddr>,
+    transport: Arc<dyn Transport>,
+) -> Result<SwimHandle, SwimError> {
+    cfg.validate()?;
+
+    let membership = Arc::new(MembershipList::new_local(
+        local_id.clone(),
+        local_addr,
+        cfg.initial_incarnation,
+    ));
+
+    // Seed the membership table so the first probe round has somewhere
+    // to go. Placeholder ids are replaced on the first ack.
+    for seed_addr in &seeds {
+        if *seed_addr == local_addr {
+            continue;
+        }
+        membership.apply(&MemberUpdate {
+            node_id: NodeId::new(format!("seed:{seed_addr}")),
+            addr: seed_addr.to_string(),
+            state: MemberState::Alive,
+            incarnation: Incarnation::ZERO,
+        });
+    }
+
+    let initial_inc = cfg.initial_incarnation;
+    let detector = Arc::new(FailureDetector::new(
+        cfg,
+        Arc::clone(&membership),
+        transport,
+        ProbeScheduler::new(),
+    ));
+
+    // Prime the dissemination queue with our own Alive record so the
+    // first outgoing probes advertise our canonical NodeId + addr to
+    // every seed. Without this, seed placeholders would never be
+    // replaced with real ids until some peer independently learned
+    // our identity — which is not reliable from seed bootstrap alone.
+    detector.dissemination().enqueue(MemberUpdate {
+        node_id: local_id.clone(),
+        addr: local_addr.to_string(),
+        state: MemberState::Alive,
+        incarnation: initial_inc,
+    });
+
+    let (shutdown_tx, shutdown_rx) = watch::channel(false);
+    let join = tokio::spawn({
+        let detector = Arc::clone(&detector);
+        async move { detector.run(shutdown_rx).await }
+    });
+
+    Ok(SwimHandle {
+        detector,
+        membership,
+        shutdown_tx,
+        join,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::swim::detector::TransportFabric;
+    use std::net::{IpAddr, Ipv4Addr};
+    use std::time::Duration;
+
+    fn addr(p: u16) -> SocketAddr {
+        SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), p)
+    }
+
+    fn cfg() -> SwimConfig {
+        SwimConfig {
+            probe_interval: Duration::from_millis(100),
+            probe_timeout: Duration::from_millis(40),
+            indirect_probes: 2,
+            suspicion_mult: 4,
+            min_suspicion: Duration::from_millis(500),
+            initial_incarnation: Incarnation::ZERO,
+            max_piggyback: 6,
+            fanout_lambda: 3,
+        }
+    }
+
+    #[tokio::test]
+    async fn spawn_solo_cluster_has_only_local() {
+        let fab = TransportFabric::new();
+        let transport: Arc<dyn Transport> = Arc::new(fab.bind(addr(7100)).await);
+        let handle = spawn(cfg(), NodeId::new("a"), addr(7100), vec![], transport)
+            .await
+            .expect("spawn");
+        assert_eq!(handle.membership().len(), 1);
+        assert!(handle.membership().is_solo());
+        handle.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn spawn_seeds_populates_membership() {
+        let fab = TransportFabric::new();
+        let transport: Arc<dyn Transport> = Arc::new(fab.bind(addr(7110)).await);
+        let handle = spawn(
+            cfg(),
+            NodeId::new("a"),
+            addr(7110),
+            vec![addr(7111), addr(7112)],
+            transport,
+        )
+        .await
+        .expect("spawn");
+        assert_eq!(handle.membership().len(), 3);
+        handle.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn spawn_skips_local_addr_in_seeds() {
+        let fab = TransportFabric::new();
+        let transport: Arc<dyn Transport> = Arc::new(fab.bind(addr(7120)).await);
+        let handle = spawn(
+            cfg(),
+            NodeId::new("a"),
+            addr(7120),
vec![addr(7120), addr(7121)],
+            transport,
+        )
+        .await
+        .expect("spawn");
+        // Local + one real seed = 2.
+        assert_eq!(handle.membership().len(), 2);
+        handle.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn invalid_config_rejected_before_task_spawned() {
+        let fab = TransportFabric::new();
+        let transport: Arc<dyn Transport> = Arc::new(fab.bind(addr(7130)).await);
+        let mut bad = cfg();
+        bad.probe_timeout = bad.probe_interval; // violates the strict-less rule
+        let res = spawn(bad, NodeId::new("a"), addr(7130), vec![], transport).await;
+        match res {
+            Err(SwimError::InvalidConfig { .. }) => {}
+            Err(other) => panic!("expected InvalidConfig, got {other:?}"),
+            Ok(_) => panic!("expected InvalidConfig error"),
+        }
+    }
+
+    #[tokio::test]
+    async fn shutdown_joins_promptly() {
+        let fab = TransportFabric::new();
+        let transport: Arc<dyn Transport> = Arc::new(fab.bind(addr(7140)).await);
+        let handle = spawn(cfg(), NodeId::new("a"), addr(7140), vec![], transport)
+            .await
+            .expect("spawn");
+        let start = std::time::Instant::now();
+        tokio::time::timeout(Duration::from_millis(500), handle.shutdown())
+            .await
+            .expect("shutdown did not join within budget");
+        assert!(start.elapsed() < Duration::from_millis(500));
+    }
+}
diff --git a/nodedb-cluster/src/swim/mod.rs b/nodedb-cluster/src/swim/mod.rs
index 8112e9db..2500e5f7 100644
--- a/nodedb-cluster/src/swim/mod.rs
+++ b/nodedb-cluster/src/swim/mod.rs
@@ -20,6 +20,7 @@
 //! It exposes the pure data model — member states, incarnation numbers, and
 //! the state-merge rule — that every later sub-batch builds on.
+pub mod bootstrap; pub mod config; pub mod detector; pub mod dissemination; @@ -29,9 +30,10 @@ pub mod member; pub mod membership; pub mod wire; +pub use bootstrap::{SwimHandle, spawn}; pub use config::SwimConfig; pub use detector::{ - FailureDetector, InMemoryTransport, ProbeScheduler, Transport, TransportFabric, + FailureDetector, InMemoryTransport, ProbeScheduler, Transport, TransportFabric, UdpTransport, }; pub use dissemination::{DisseminationQueue, PendingUpdate, apply_and_disseminate}; pub use error::SwimError; diff --git a/nodedb-cluster/tests/common/mod.rs b/nodedb-cluster/tests/common/mod.rs index 7b88768b..b2c1d137 100644 --- a/nodedb-cluster/tests/common/mod.rs +++ b/nodedb-cluster/tests/common/mod.rs @@ -186,6 +186,7 @@ impl TestNode { max_attempts: 8, max_backoff_secs: 2, }, + swim_udp_addr: None, }; let lifecycle = ClusterLifecycleTracker::new(); diff --git a/nodedb-cluster/tests/swim_udp_convergence.rs b/nodedb-cluster/tests/swim_udp_convergence.rs new file mode 100644 index 00000000..254653b4 --- /dev/null +++ b/nodedb-cluster/tests/swim_udp_convergence.rs @@ -0,0 +1,142 @@ +//! Real-UDP convergence test for the SWIM failure detector. +//! +//! Spawns three SWIM instances on ephemeral loopback UDP ports, each +//! seeded with the other two, and asserts: +//! +//! 1. Every node converges to a 3-member `Alive` view within 5 seconds. +//! 2. When one node shuts its detector down, the remaining two observe +//! the silent peer as `Suspect` or `Dead` within another 5 seconds. +//! +//! Uses real wall-clock time and a fast probe cadence (50 ms probe +//! interval) so the whole test finishes well under a second on a warm +//! build. No mocks, no paused time. 
+ +use std::net::{IpAddr, Ipv4Addr, SocketAddr}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use nodedb_cluster::swim::Transport; +use nodedb_cluster::{SwimConfig, SwimHandle, UdpTransport, spawn_swim}; +use nodedb_types::NodeId; + +fn any_loopback() -> SocketAddr { + SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 0) +} + +fn fast_cfg() -> SwimConfig { + SwimConfig { + probe_interval: Duration::from_millis(50), + probe_timeout: Duration::from_millis(20), + indirect_probes: 2, + suspicion_mult: 2, + min_suspicion: Duration::from_millis(150), + initial_incarnation: nodedb_cluster::Incarnation::ZERO, + max_piggyback: 6, + fanout_lambda: 3, + } +} + +async fn poll(deadline: Duration, mut pred: F) -> bool +where + F: FnMut() -> bool, +{ + let start = Instant::now(); + while start.elapsed() < deadline { + if pred() { + return true; + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + pred() +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn three_node_udp_mesh_converges_and_detects_failure() { + // Bind three ephemeral UDP sockets. + let t_a = Arc::new(UdpTransport::bind(any_loopback()).await.expect("bind a")); + let t_b = Arc::new(UdpTransport::bind(any_loopback()).await.expect("bind b")); + let t_c = Arc::new(UdpTransport::bind(any_loopback()).await.expect("bind c")); + let addr_a = t_a.local_addr(); + let addr_b = t_b.local_addr(); + let addr_c = t_c.local_addr(); + + // Spawn each detector seeded with the other two addresses. 
+ let h_a: SwimHandle = spawn_swim( + fast_cfg(), + NodeId::new("a"), + addr_a, + vec![addr_b, addr_c], + t_a, + ) + .await + .expect("spawn a"); + let h_b: SwimHandle = spawn_swim( + fast_cfg(), + NodeId::new("b"), + addr_b, + vec![addr_a, addr_c], + t_b, + ) + .await + .expect("spawn b"); + let h_c: SwimHandle = spawn_swim( + fast_cfg(), + NodeId::new("c"), + addr_c, + vec![addr_a, addr_b], + t_c, + ) + .await + .expect("spawn c"); + + // Converge: each node must observe at least 3 alive members (self + // + 2 peers). The seed placeholders start as Alive so this check + // is already true at t=0, but after a probe round the placeholder + // IDs get replaced by the real node IDs via the `from` field on + // the Ack. We assert that at least one of {a,b,c} *by real id* + // is present on every node. + let converged = poll(Duration::from_secs(5), || { + let real_ids = [NodeId::new("a"), NodeId::new("b"), NodeId::new("c")]; + let check = |h: &SwimHandle| { + real_ids + .iter() + .filter(|id| h.membership().get(id).is_some()) + .count() + >= 3 + }; + check(&h_a) && check(&h_b) && check(&h_c) + }) + .await; + assert!(converged, "3-node UDP mesh failed to converge within 5s"); + + // Shut down node B. Do it cleanly so B stops responding to probes. + h_b.shutdown().await; + + // A and C must now observe B as Suspect or Dead within 5s. + // We check by real id, so this assertion survives even if the + // merged entry's node_id ends up being "b" or "seed:". 
+ let detected = poll(Duration::from_secs(5), || { + let b_id = NodeId::new("b"); + let state_of = |h: &SwimHandle| { + h.membership() + .get(&b_id) + .map(|m| m.state) + .filter(|s| { + matches!( + s, + nodedb_cluster::MemberState::Suspect | nodedb_cluster::MemberState::Dead + ) + }) + .is_some() + }; + state_of(&h_a) && state_of(&h_c) + }) + .await; + assert!( + detected, + "A and C failed to mark B as Suspect/Dead within 5s after B shut down" + ); + + h_a.shutdown().await; + h_c.shutdown().await; +} diff --git a/nodedb/src/control/cluster/init.rs b/nodedb/src/control/cluster/init.rs index 056baa0d..ef06fe4b 100644 --- a/nodedb/src/control/cluster/init.rs +++ b/nodedb/src/control/cluster/init.rs @@ -74,6 +74,7 @@ pub async fn init_cluster_with_transport( data_dir: data_dir.to_path_buf(), force_bootstrap: config.force_bootstrap, join_retry: join_retry_policy_from_env(), + swim_udp_addr: None, }; let lifecycle = nodedb_cluster::ClusterLifecycleTracker::new();