From 9f592082905de4f78c93c1178fdc3d3db31263aa Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 08:00:22 +0800 Subject: [PATCH 01/24] chore: bump workspace version to 0.0.4 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 34822136..5d18c12c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.0.3" +version = "0.0.4" edition = "2024" rust-version = "1.94" license = "BUSL-1.1" From ce3dc9da5720aa5d2d36588e75d71a523b15bb4e Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 08:18:03 +0800 Subject: [PATCH 02/24] feat(swim): add MembershipSubscriber hook for state transition events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a `MembershipSubscriber` trait that the failure detector calls after every accepted member state transition (insert, Alive → Suspect, Suspect → Dead, any → Left). Subscribers are synchronous and must not block; they are suitable for cheap in-memory bookkeeping. Key changes: - `subscriber.rs`: defines `MembershipSubscriber` with a single `on_state_change(node_id, old, new)` method. - `detector/runner.rs`: adds `with_subscribers` constructor and `apply_and_notify` helper that wraps `apply_and_disseminate` with before/after state snapshots; the no-subscriber path is zero-overhead (early return on empty slice). - `bootstrap.rs`: adds `spawn_with_subscribers` alongside the existing `spawn` entry-point so callers can inject subscribers without breaking the common no-observer signature. - `mod.rs`: re-exports `MembershipSubscriber` and `spawn_with_subscribers`. 
--- nodedb-cluster/src/swim/bootstrap.rs | 18 ++++++++- nodedb-cluster/src/swim/detector/runner.rs | 43 ++++++++++++++++++++-- nodedb-cluster/src/swim/mod.rs | 2 + nodedb-cluster/src/swim/subscriber.rs | 30 +++++++++++++++ 4 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 nodedb-cluster/src/swim/subscriber.rs diff --git a/nodedb-cluster/src/swim/bootstrap.rs b/nodedb-cluster/src/swim/bootstrap.rs index 943190d1..739e6ab1 100644 --- a/nodedb-cluster/src/swim/bootstrap.rs +++ b/nodedb-cluster/src/swim/bootstrap.rs @@ -29,6 +29,7 @@ use super::incarnation::Incarnation; use super::member::MemberState; use super::member::record::MemberUpdate; use super::membership::MembershipList; +use super::subscriber::MembershipSubscriber; /// Owns a running SWIM detector and its shutdown plumbing. /// @@ -88,6 +89,20 @@ pub async fn spawn( local_addr: SocketAddr, seeds: Vec, transport: Arc, +) -> Result { + spawn_with_subscribers(cfg, local_id, local_addr, seeds, transport, Vec::new()).await +} + +/// Same as [`spawn`] but installs the given [`MembershipSubscriber`]s +/// on the detector before its run loop starts, so every state +/// transition is observed from the very first probe round. 
+pub async fn spawn_with_subscribers( + cfg: SwimConfig, + local_id: NodeId, + local_addr: SocketAddr, + seeds: Vec, + transport: Arc, + subscribers: Vec>, ) -> Result { cfg.validate()?; @@ -112,11 +127,12 @@ pub async fn spawn( } let initial_inc = cfg.initial_incarnation; - let detector = Arc::new(FailureDetector::new( + let detector = Arc::new(FailureDetector::with_subscribers( cfg, Arc::clone(&membership), transport, ProbeScheduler::new(), + subscribers, )); // Prime the dissemination queue with our own Alive record so the diff --git a/nodedb-cluster/src/swim/detector/runner.rs b/nodedb-cluster/src/swim/detector/runner.rs index 23997583..df805320 100644 --- a/nodedb-cluster/src/swim/detector/runner.rs +++ b/nodedb-cluster/src/swim/detector/runner.rs @@ -19,6 +19,7 @@ use crate::swim::incarnation::Incarnation; use crate::swim::member::MemberState; use crate::swim::member::record::MemberUpdate; use crate::swim::membership::{MembershipList, MergeOutcome}; +use crate::swim::subscriber::MembershipSubscriber; use crate::swim::wire::{Ack, Ping, PingReq, ProbeId, SwimMessage}; use super::probe_round::{InflightProbes, ProbeOutcome, ProbeRound}; @@ -41,6 +42,7 @@ pub struct FailureDetector { dissemination: Arc, probe_counter: AtomicU64, local_incarnation: Mutex, + subscribers: Vec>, } impl FailureDetector { @@ -51,6 +53,18 @@ impl FailureDetector { membership: Arc, transport: Arc, scheduler: ProbeScheduler, + ) -> Self { + Self::with_subscribers(cfg, membership, transport, scheduler, Vec::new()) + } + + /// Construct with a list of [`MembershipSubscriber`]s that will be + /// notified on every member state transition. 
+ pub fn with_subscribers( + cfg: SwimConfig, + membership: Arc, + transport: Arc, + scheduler: ProbeScheduler, + subscribers: Vec>, ) -> Self { let initial_inc = cfg.initial_incarnation; Self { @@ -63,7 +77,30 @@ impl FailureDetector { dissemination: Arc::new(DisseminationQueue::new()), probe_counter: AtomicU64::new(0), local_incarnation: Mutex::new(initial_inc), + subscribers, + } + } + + /// Apply an update via [`apply_and_disseminate`] while notifying + /// every subscriber of any resulting state transition. Returns the + /// raw [`MergeOutcome`] so callers can still react to + /// `SelfRefute` etc. + fn apply_and_notify(&self, update: &MemberUpdate) -> MergeOutcome { + let old_state = self.membership.get(&update.node_id).map(|m| m.state); + let outcome = apply_and_disseminate(&self.membership, &self.dissemination, update); + if self.subscribers.is_empty() { + return outcome; + } + let new_state = match self.membership.get(&update.node_id) { + Some(m) => m.state, + None => return outcome, + }; + if old_state != Some(new_state) { + for sub in &self.subscribers { + sub.on_state_change(&update.node_id, old_state, new_state); + } } + outcome } /// Shared reference to the dissemination queue. Tests use it to @@ -78,7 +115,7 @@ impl FailureDetector { /// local incarnation so subsequent probes advertise the new value. 
async fn ingest_piggyback(&self, piggyback: &[MemberUpdate]) { for update in piggyback { - let outcome = apply_and_disseminate(&self.membership, &self.dissemination, update); + let outcome = self.apply_and_notify(update); if let MergeOutcome::SelfRefute { new_incarnation } = outcome { let mut guard = self.local_incarnation.lock().await; if new_incarnation > *guard { @@ -140,7 +177,7 @@ impl FailureDetector { state: MemberState::Dead, incarnation: member.incarnation, }; - apply_and_disseminate(&self.membership, &self.dissemination, &dead_update); + self.apply_and_notify(&dead_update); } } @@ -174,7 +211,7 @@ impl FailureDetector { state: MemberState::Suspect, incarnation: member.incarnation, }; - apply_and_disseminate(&self.membership, &self.dissemination, &suspect_update); + self.apply_and_notify(&suspect_update); let cluster_size = self.membership.len(); self.suspicion.lock().await.arm( target, diff --git a/nodedb-cluster/src/swim/mod.rs b/nodedb-cluster/src/swim/mod.rs index 2500e5f7..622e1fd9 100644 --- a/nodedb-cluster/src/swim/mod.rs +++ b/nodedb-cluster/src/swim/mod.rs @@ -28,6 +28,7 @@ pub mod error; pub mod incarnation; pub mod member; pub mod membership; +pub mod subscriber; pub mod wire; pub use bootstrap::{SwimHandle, spawn}; @@ -40,4 +41,5 @@ pub use error::SwimError; pub use incarnation::Incarnation; pub use member::{Member, MemberState}; pub use membership::{MembershipList, MembershipSnapshot, merge_update}; +pub use subscriber::MembershipSubscriber; pub use wire::{Ack, Nack, NackReason, Ping, PingReq, ProbeId, SwimMessage}; diff --git a/nodedb-cluster/src/swim/subscriber.rs b/nodedb-cluster/src/swim/subscriber.rs new file mode 100644 index 00000000..e7a20746 --- /dev/null +++ b/nodedb-cluster/src/swim/subscriber.rs @@ -0,0 +1,30 @@ +//! `MembershipSubscriber` — hook fired whenever SWIM observes a +//! member state transition. +//! +//! The failure detector invokes every registered subscriber *after* +//! 
applying an update to the [`MembershipList`](super::membership::MembershipList) +//! and dissemination queue, so subscribers see the post-merge view. +//! +//! Subscribers are synchronous and must not block — they typically do +//! cheap in-memory bookkeeping (e.g. clearing a routing leader hint). +//! Heavier work belongs on a dedicated task the subscriber spawns +//! itself. +//! +//! ## Lifecycle +//! +//! - `old = None` means "first time we've seen this node" (insert). +//! - `old = Some(state)` means the member existed and transitioned to +//! a strictly different `new` state. The detector never calls the +//! hook for no-op reapplies. +//! - `Left` is terminal — after it fires once the member is gone. + +use nodedb_types::NodeId; + +use super::member::MemberState; + +/// Hook trait for observers that react to SWIM membership changes. +pub trait MembershipSubscriber: Send + Sync { + /// Called after the membership list has accepted a state change + /// for `node_id`. `old` is `None` on first-time insert. + fn on_state_change(&self, node_id: &NodeId, old: Option, new: MemberState); +} From c1ab17d7999465f64a5fea3fa2df53ab2785d5aa Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 08:18:23 +0800 Subject: [PATCH 03/24] feat(cluster): invalidate routing leader hints on SWIM liveness events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `RoutingLivenessHook`, a `MembershipSubscriber` that clears the leader hint for every Raft group whose leaseholder transitions to Suspect, Dead, or Left. After the hint is cleared, the next query against those vShards gets `NotLeader`, triggers leader re-discovery, and updates the routing table — limiting clients to at most one retry on node failure. `NodeIdResolver` is a closure type that maps SWIM `NodeId` to the numeric routing-table id, keeping the hook storage-agnostic. 
Seed placeholders and transient learners that the routing table has not yet registered are silently ignored. The integration test (`tests/swim_routing_invalidation.rs`) runs three real UDP-backed SWIM nodes, shuts down the group leader, and asserts that the hook clears the routing hint within a few suspicion timeouts. --- Cargo.lock | 36 ++-- nodedb-cluster/src/lib.rs | 7 +- nodedb-cluster/src/routing_liveness.rs | 187 ++++++++++++++++++ .../tests/swim_routing_invalidation.rs | 159 +++++++++++++++ 4 files changed, 369 insertions(+), 20 deletions(-) create mode 100644 nodedb-cluster/src/routing_liveness.rs create mode 100644 nodedb-cluster/tests/swim_routing_invalidation.rs diff --git a/Cargo.lock b/Cargo.lock index b17b653b..69390359 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3645,7 +3645,7 @@ dependencies = [ [[package]] name = "nodedb" -version = "0.0.3" +version = "0.0.4" dependencies = [ "aes-gcm", "anyhow", @@ -3735,7 +3735,7 @@ dependencies = [ [[package]] name = "nodedb-bridge" -version = "0.0.3" +version = "0.0.4" dependencies = [ "fluxbench", "libc", @@ -3747,7 +3747,7 @@ dependencies = [ [[package]] name = "nodedb-client" -version = "0.0.3" +version = "0.0.4" dependencies = [ "async-trait", "nodedb-types", @@ -3765,7 +3765,7 @@ dependencies = [ [[package]] name = "nodedb-cluster" -version = "0.0.3" +version = "0.0.4" dependencies = [ "async-trait", "crc32c", @@ -3792,7 +3792,7 @@ dependencies = [ [[package]] name = "nodedb-codec" -version = "0.0.3" +version = "0.0.4" dependencies = [ "lz4_flex 0.11.6", "pco", @@ -3808,7 +3808,7 @@ dependencies = [ [[package]] name = "nodedb-columnar" -version = "0.0.3" +version = "0.0.4" dependencies = [ "crc32c", "nodedb-codec", @@ -3825,7 +3825,7 @@ dependencies = [ [[package]] name = "nodedb-crdt" -version = "0.0.3" +version = "0.0.4" dependencies = [ "hmac 0.12.1", "loro", @@ -3838,7 +3838,7 @@ dependencies = [ [[package]] name = "nodedb-fts" -version = "0.0.3" +version = "0.0.4" dependencies = [ "icu_segmenter", 
"lindera", @@ -3853,7 +3853,7 @@ dependencies = [ [[package]] name = "nodedb-graph" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nodedb-types", "rkyv 0.8.15", @@ -3867,7 +3867,7 @@ dependencies = [ [[package]] name = "nodedb-mem" -version = "0.0.3" +version = "0.0.4" dependencies = [ "fluxbench", "libc", @@ -3882,7 +3882,7 @@ dependencies = [ [[package]] name = "nodedb-query" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nodedb-fts", "nodedb-spatial", @@ -3898,7 +3898,7 @@ dependencies = [ [[package]] name = "nodedb-raft" -version = "0.0.3" +version = "0.0.4" dependencies = [ "rand 0.9.4", "rkyv 0.8.15", @@ -3914,7 +3914,7 @@ dependencies = [ [[package]] name = "nodedb-spatial" -version = "0.0.3" +version = "0.0.4" dependencies = [ "h3o", "nodedb-types", @@ -3929,7 +3929,7 @@ dependencies = [ [[package]] name = "nodedb-sql" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nodedb-query", "nodedb-types", @@ -3939,7 +3939,7 @@ dependencies = [ [[package]] name = "nodedb-strict" -version = "0.0.3" +version = "0.0.4" dependencies = [ "arrow", "nodedb-types", @@ -3953,7 +3953,7 @@ dependencies = [ [[package]] name = "nodedb-types" -version = "0.0.3" +version = "0.0.4" dependencies = [ "nanoid", "nodedb-codec", @@ -3972,7 +3972,7 @@ dependencies = [ [[package]] name = "nodedb-vector" -version = "0.0.3" +version = "0.0.4" dependencies = [ "libc", "memmap2", @@ -3989,7 +3989,7 @@ dependencies = [ [[package]] name = "nodedb-wal" -version = "0.0.3" +version = "0.0.4" dependencies = [ "aes-gcm", "crc32c", diff --git a/nodedb-cluster/src/lib.rs b/nodedb-cluster/src/lib.rs index ea340b52..169864ad 100644 --- a/nodedb-cluster/src/lib.rs +++ b/nodedb-cluster/src/lib.rs @@ -29,6 +29,7 @@ pub mod readiness; pub mod rebalance; pub mod rebalance_scheduler; pub mod routing; +pub mod routing_liveness; pub mod rpc_codec; pub mod shard_split; pub mod swim; @@ -56,6 +57,7 @@ pub use multi_raft::{GroupStatus, MultiRaft}; pub use raft_loop::{CommitApplier, 
RaftLoop, VShardEnvelopeHandler}; pub use rebalance::{RebalancePlan, compute_plan, plan_to_requests}; pub use routing::RoutingTable; +pub use routing_liveness::{NodeIdResolver, RoutingLivenessHook}; pub use rpc_codec::RaftRpc; pub use topology::{ClusterTopology, NodeInfo, NodeState}; pub use transport::{NexarTransport, RaftRpcHandler}; @@ -78,7 +80,8 @@ pub use lifecycle::{ pub use rdma_transport::{RdmaConfig, RdmaTransport}; pub use rebalance_scheduler::{NodeMetrics, RebalanceScheduler, RebalanceTrigger, SchedulerConfig}; pub use shard_split::{SplitPlan, SplitStrategy, plan_graph_split, plan_vector_split}; +pub use swim::bootstrap::spawn_with_subscribers as spawn_swim_with_subscribers; pub use swim::{ - Incarnation, Member, MemberState, MembershipList, SwimConfig, SwimError, SwimHandle, - UdpTransport, spawn as spawn_swim, + Incarnation, Member, MemberState, MembershipList, MembershipSubscriber, SwimConfig, SwimError, + SwimHandle, UdpTransport, spawn as spawn_swim, }; diff --git a/nodedb-cluster/src/routing_liveness.rs b/nodedb-cluster/src/routing_liveness.rs new file mode 100644 index 00000000..49cb22f1 --- /dev/null +++ b/nodedb-cluster/src/routing_liveness.rs @@ -0,0 +1,187 @@ +//! Liveness-driven routing invalidation (checklist item E.2). +//! +//! [`RoutingLivenessHook`] is a [`MembershipSubscriber`] that clears +//! the leader hint for every Raft group whose leaseholder has just +//! been marked `Suspect`, `Dead`, or `Left` by the SWIM failure +//! detector. After the hook fires, the next query that consults the +//! routing table observes `leader == 0` (the "no leader known" +//! sentinel) and falls through to a fresh leader discovery via the +//! existing NotLeader-triggered election path — which is exactly the +//! behaviour the checklist requires: +//! +//! > 1. The routing cache invalidates all vShards whose leaseholder +//! > was that node. +//! > 2. The next query against those vShards gets `NotLeader`, +//! 
> triggers a leader election, and updates the routing table. +//! > 3. Clients see at most one retry. +//! +//! The hook is storage-agnostic: it holds `Arc>` +//! and a resolver closure that maps the string-keyed SWIM `NodeId` +//! to the numeric `u64` id used throughout the rest of the cluster +//! crate. Wiring layers (start_cluster, tests) supply the resolver +//! appropriate to their topology source. +//! +//! The hook is intentionally sync and cheap — a single `RwLock::write`, +//! a linear scan over group_members, and `set_leader(gid, 0)` for +//! each affected group. No I/O, no spawning. That keeps it safe to +//! call directly from the detector run loop. + +use std::sync::{Arc, RwLock}; + +use nodedb_types::NodeId; +use tracing::debug; + +use crate::routing::RoutingTable; +use crate::swim::MemberState; +use crate::swim::subscriber::MembershipSubscriber; + +/// Resolver mapping SWIM `NodeId` → numeric `u64` routing-table id. +/// +/// Returns `None` for members SWIM knows about but the routing table +/// does not (placeholder `seed:` entries before the first real +/// probe, transient learners, etc.). Those are silently ignored. +pub type NodeIdResolver = Arc Option + Send + Sync>; + +/// Clears the leader hint for every group led by a node that SWIM +/// has marked Suspect/Dead/Left. +pub struct RoutingLivenessHook { + routing: Arc>, + resolver: NodeIdResolver, +} + +impl RoutingLivenessHook { + pub fn new(routing: Arc>, resolver: NodeIdResolver) -> Self { + Self { routing, resolver } + } +} + +impl MembershipSubscriber for RoutingLivenessHook { + fn on_state_change(&self, node_id: &NodeId, _old: Option, new: MemberState) { + // Alive transitions are a no-op: the next query will refresh + // the leader hint naturally on NotLeader. We only invalidate + // when a leader has observably stopped being reachable. 
+ if !matches!( + new, + MemberState::Suspect | MemberState::Dead | MemberState::Left + ) { + return; + } + + let Some(numeric_id) = (self.resolver)(node_id) else { + // SWIM knows about a node the routing table doesn't — a + // seed placeholder, a learner mid-join, or a node that + // was never registered. Nothing to invalidate. + return; + }; + + let mut rt = self.routing.write().unwrap_or_else(|p| p.into_inner()); + let affected: Vec = rt + .group_members() + .iter() + .filter(|(_, info)| info.leader == numeric_id) + .map(|(gid, _)| *gid) + .collect(); + for gid in &affected { + rt.set_leader(*gid, 0); + } + if !affected.is_empty() { + debug!( + ?node_id, + ?new, + numeric_id, + groups_invalidated = affected.len(), + "routing liveness hook cleared leader hints" + ); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn rt_with_leaders(pairs: &[(u64, u64)], rf: usize) -> Arc> { + // Build a routing table with `pairs.len()` groups where group + // `gid` has leader `leader`. Uses the uniform constructor to + // pick a membership, then overrides the leader. 
+ let nodes: Vec = pairs.iter().map(|(_, l)| *l).collect(); + let mut rt = RoutingTable::uniform(pairs.len() as u64, &nodes, rf); + for (gid, leader) in pairs { + rt.set_leader(*gid, *leader); + } + Arc::new(RwLock::new(rt)) + } + + fn resolver_for(map: &'static [(&'static str, u64)]) -> NodeIdResolver { + Arc::new(move |nid: &NodeId| { + map.iter() + .find(|(s, _)| *s == nid.as_str()) + .map(|(_, n)| *n) + }) + } + + #[test] + fn dead_transition_clears_leader_for_owned_groups() { + let rt = rt_with_leaders(&[(0, 1), (1, 2), (2, 1), (3, 3)], 1); + let hook = + RoutingLivenessHook::new(rt.clone(), resolver_for(&[("a", 1), ("b", 2), ("c", 3)])); + + hook.on_state_change( + &NodeId::new("a"), + Some(MemberState::Alive), + MemberState::Dead, + ); + + let guard = rt.read().unwrap(); + assert_eq!(guard.group_info(0).unwrap().leader, 0); + assert_eq!(guard.group_info(1).unwrap().leader, 2); + assert_eq!(guard.group_info(2).unwrap().leader, 0); + assert_eq!(guard.group_info(3).unwrap().leader, 3); + } + + #[test] + fn suspect_transition_also_invalidates() { + let rt = rt_with_leaders(&[(0, 7)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("x", 7)])); + hook.on_state_change( + &NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Suspect, + ); + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 0); + } + + #[test] + fn alive_transition_is_noop() { + let rt = rt_with_leaders(&[(0, 5)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("q", 5)])); + hook.on_state_change(&NodeId::new("q"), None, MemberState::Alive); + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 5); + } + + #[test] + fn unresolved_node_id_is_ignored() { + let rt = rt_with_leaders(&[(0, 1)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("a", 1)])); + // NodeId "seed:127.0.0.1:9000" is not in the resolver map. 
+ hook.on_state_change( + &NodeId::new("seed:127.0.0.1:9000"), + Some(MemberState::Alive), + MemberState::Dead, + ); + // Leader untouched because the resolver returned None. + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 1); + } + + #[test] + fn left_is_also_invalidating() { + let rt = rt_with_leaders(&[(0, 2)], 1); + let hook = RoutingLivenessHook::new(rt.clone(), resolver_for(&[("b", 2)])); + hook.on_state_change( + &NodeId::new("b"), + Some(MemberState::Alive), + MemberState::Left, + ); + assert_eq!(rt.read().unwrap().group_info(0).unwrap().leader, 0); + } +} diff --git a/nodedb-cluster/tests/swim_routing_invalidation.rs b/nodedb-cluster/tests/swim_routing_invalidation.rs new file mode 100644 index 00000000..f062d6f1 --- /dev/null +++ b/nodedb-cluster/tests/swim_routing_invalidation.rs @@ -0,0 +1,159 @@ +//! E.2 — Liveness drives routing invalidation. +//! +//! Three UDP-backed SWIM nodes form a full mesh. A shared +//! `RoutingTable` declares node B as the leader of group 0. A +//! `RoutingLivenessHook` subscribed to node A's detector is wired to +//! that routing table. When B is shut down, A's detector must observe +//! the Suspect→Dead transition and the hook must clear the leader +//! hint for group 0 within a few suspicion timeouts. 
+ +use std::sync::{Arc, RwLock}; +use std::time::{Duration, Instant}; + +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::routing_liveness::{NodeIdResolver, RoutingLivenessHook}; +use nodedb_cluster::swim::Transport; +use nodedb_cluster::swim::bootstrap::spawn_with_subscribers; +use nodedb_cluster::{ + Incarnation, MembershipSubscriber, SwimConfig, SwimHandle, UdpTransport, spawn_swim, +}; +use nodedb_types::NodeId; + +fn fast_cfg() -> SwimConfig { + SwimConfig { + probe_interval: Duration::from_millis(50), + probe_timeout: Duration::from_millis(20), + indirect_probes: 2, + suspicion_mult: 3, + min_suspicion: Duration::from_millis(150), + initial_incarnation: Incarnation::ZERO, + max_piggyback: 6, + fanout_lambda: 3, + } +} + +fn resolver_static() -> NodeIdResolver { + Arc::new(|nid: &NodeId| match nid.as_str() { + "a" => Some(1), + "b" => Some(2), + "c" => Some(3), + _ => None, + }) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn swim_dead_leader_clears_routing_hint() { + // --- Build three real UDP transports on ephemeral ports. --- + let t_a = Arc::new( + UdpTransport::bind("127.0.0.1:0".parse().unwrap()) + .await + .unwrap(), + ); + let t_b = Arc::new( + UdpTransport::bind("127.0.0.1:0".parse().unwrap()) + .await + .unwrap(), + ); + let t_c = Arc::new( + UdpTransport::bind("127.0.0.1:0".parse().unwrap()) + .await + .unwrap(), + ); + let addr_a = t_a.local_addr(); + let addr_b = t_b.local_addr(); + let addr_c = t_c.local_addr(); + + // --- Shared routing table: 4 groups, leader = node b (id=2) for groups 0 and 2. --- + let rt = Arc::new(RwLock::new(RoutingTable::uniform(4, &[1, 2, 3], 3))); + { + let mut guard = rt.write().unwrap(); + guard.set_leader(0, 2); + guard.set_leader(1, 1); + guard.set_leader(2, 2); + guard.set_leader(3, 3); + } + + // --- Hook node A to the routing table. 
--- + let hook: Arc = + Arc::new(RoutingLivenessHook::new(rt.clone(), resolver_static())); + + let h_a: SwimHandle = spawn_with_subscribers( + fast_cfg(), + NodeId::new("a"), + addr_a, + vec![addr_b, addr_c], + t_a.clone() as Arc, + vec![hook], + ) + .await + .expect("spawn a"); + let h_b: SwimHandle = spawn_swim( + fast_cfg(), + NodeId::new("b"), + addr_b, + vec![addr_a, addr_c], + t_b.clone() as Arc, + ) + .await + .expect("spawn b"); + let h_c: SwimHandle = spawn_swim( + fast_cfg(), + NodeId::new("c"), + addr_c, + vec![addr_a, addr_b], + t_c.clone() as Arc, + ) + .await + .expect("spawn c"); + + // --- Wait for A to learn about B (real id, not placeholder). --- + let deadline = Instant::now() + Duration::from_secs(5); + loop { + let seen = h_a.membership().get(&NodeId::new("b")).is_some(); + if seen || Instant::now() >= deadline { + assert!(seen, "A never learned B's real NodeId"); + break; + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + + // --- Sanity: group 0 still led by node 2. --- + { + let guard = rt.read().unwrap(); + assert_eq!(guard.group_info(0).unwrap().leader, 2); + } + + // --- Shut B down and wait for A to invalidate the leader hint. --- + h_b.shutdown().await; + + let deadline = Instant::now() + Duration::from_secs(5); + loop { + let cleared = { + let guard = rt.read().unwrap(); + guard.group_info(0).unwrap().leader == 0 && guard.group_info(2).unwrap().leader == 0 + }; + if cleared { + break; + } + if Instant::now() >= deadline { + panic!("routing hook never cleared leader hints for groups led by B"); + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + + // Groups led by A must be untouched — A is still alive and probing. + // We do NOT assert on groups led by C: under real UDP races the + // detector may transiently flag C as Suspect while B is being + // demoted, which is the correct behaviour of the hook. 
+ { + let guard = rt.read().unwrap(); + assert_eq!( + guard.group_info(1).unwrap().leader, + 1, + "group led by local node A must not be invalidated" + ); + } + + h_a.shutdown().await; + h_c.shutdown().await; +} From 7056676cc97a70027f867fc874660b8de2a875df Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 08:56:24 +0800 Subject: [PATCH 04/24] feat(cluster): add graceful node decommission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a `decommission` module with four sub-modules: - `safety` — `check_can_decommission` validates that removing a node from every Raft group it belongs to still satisfies the configured replication factor. - `flow` — `DecommissionFlow` drives the ordered sequence of metadata proposals (start → leadership transfers → member removal → finish) with per-step error propagation. - `coordinator` — `DecommissionCoordinator` owns the flow and exposes a single `run()` entry point for callers that hold the metadata Raft handle. - `observer` — `DecommissionObserver` watches committed entries and unblocks a waiting coordinator when the final `FinishDecommission` entry is applied. Supporting changes: - `metadata_group::entry` — add `RoutingChange::RemoveMember` so the decommission flow can strip a draining node from every group it belongs to without a compatibility break. - `metadata_group::applier` — add `with_live_state()` builder that attaches live `ClusterTopology` and `RoutingTable` handles; committed `TopologyChange` and `RoutingChange` entries now mutate them in place in addition to updating the in-memory history log. - `routing` — add `remove_group_member()` with unit tests covering voter removal, learner-only removal, and unknown-group no-ops. - `lifecycle::plan_decommission` — thin wrapper over `decommission::plan_full_decommission`; existing call sites are unchanged. - `lib` — re-export public decommission surface. 
Integration test `decommission_flow` exercises the full round-trip on a three-node in-process cluster: safety check, coordinator run, and topology/routing state after commit. --- .../src/decommission/coordinator.rs | 222 +++++++++++++++++ nodedb-cluster/src/decommission/flow.rs | 227 ++++++++++++++++++ nodedb-cluster/src/decommission/mod.rs | 35 +++ nodedb-cluster/src/decommission/observer.rs | 196 +++++++++++++++ nodedb-cluster/src/decommission/safety.rs | 172 +++++++++++++ nodedb-cluster/src/lib.rs | 9 + nodedb-cluster/src/lifecycle.rs | 61 ++--- nodedb-cluster/src/metadata_group/applier.rs | 174 +++++++++++++- nodedb-cluster/src/metadata_group/entry.rs | 7 + nodedb-cluster/src/routing.rs | 47 ++++ nodedb-cluster/tests/decommission_flow.rs | 153 ++++++++++++ 11 files changed, 1260 insertions(+), 43 deletions(-) create mode 100644 nodedb-cluster/src/decommission/coordinator.rs create mode 100644 nodedb-cluster/src/decommission/flow.rs create mode 100644 nodedb-cluster/src/decommission/mod.rs create mode 100644 nodedb-cluster/src/decommission/observer.rs create mode 100644 nodedb-cluster/src/decommission/safety.rs create mode 100644 nodedb-cluster/tests/decommission_flow.rs diff --git a/nodedb-cluster/src/decommission/coordinator.rs b/nodedb-cluster/src/decommission/coordinator.rs new file mode 100644 index 00000000..4f62c4aa --- /dev/null +++ b/nodedb-cluster/src/decommission/coordinator.rs @@ -0,0 +1,222 @@ +//! `DecommissionCoordinator` — drives a [`DecommissionPlan`] through +//! the metadata Raft group one entry at a time. +//! +//! The coordinator is a stateless-looking actor: it owns the plan, +//! a [`MetadataProposer`] (the injection seam for tests and for +//! whichever Raft driver is wired up at runtime), and an index +//! counter. On every call to [`DecommissionCoordinator::run`] it +//! proposes each entry in order, waiting for each to commit before +//! advancing. A proposer failure aborts the run at the failed step — +//! 
the caller can retry by constructing a fresh coordinator from +//! the same plan, because every step is idempotent at the metadata +//! layer (the cache and live-state appliers skip already-applied +//! indexes). +//! +//! The coordinator does not own a timer or a shutdown channel — it +//! is a one-shot sequence. Higher-level supervisors handle retries +//! and cancellation. + +use async_trait::async_trait; +use tracing::{debug, info}; + +use crate::error::Result; +use crate::metadata_group::MetadataEntry; + +use super::flow::DecommissionPlan; + +/// Injection seam: proposes a single metadata entry through the +/// metadata Raft group and waits for it to commit. Returns the +/// applied index on success so the coordinator can tell it apart +/// from older commits. +#[async_trait] +pub trait MetadataProposer: Send + Sync { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result; +} + +// Blanket impl so callers can pass `Arc` wherever a `MetadataProposer` +// is required without having to write a forwarding impl for every +// wrapper type. Defined here (rather than in the consumer crate) to +// avoid orphan-rule issues for downstream test impls. +#[async_trait] +impl MetadataProposer for std::sync::Arc { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result { + (**self).propose_and_wait(entry).await + } +} + +/// Drives a [`DecommissionPlan`] to completion. +pub struct DecommissionCoordinator { + plan: DecommissionPlan, + proposer: P, +} + +/// Outcome of a successful coordinator run. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DecommissionRunResult { + pub node_id: u64, + pub entries_committed: usize, + pub last_applied_index: u64, +} + +impl DecommissionCoordinator

{ + pub fn new(plan: DecommissionPlan, proposer: P) -> Self { + Self { plan, proposer } + } + + /// Propose every entry in the plan sequentially, waiting for + /// each commit. Returns the total number of entries committed + /// and the final applied index. + pub async fn run(self) -> Result { + let node_id = self.plan.node_id; + let total = self.plan.entries.len(); + info!(node_id, steps = total, "decommission coordinator starting"); + let mut last_applied = 0u64; + for (step, entry) in self.plan.entries.into_iter().enumerate() { + debug!(node_id, step, total, "proposing decommission entry"); + last_applied = self.proposer.propose_and_wait(entry).await?; + } + info!( + node_id, + entries_committed = total, + last_applied, + "decommission coordinator finished" + ); + Ok(DecommissionRunResult { + node_id, + entries_committed: total, + last_applied_index: last_applied, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::decommission::flow::plan_full_decommission; + use crate::error::ClusterError; + use crate::metadata_group::{RoutingChange, TopologyChange}; + use crate::routing::RoutingTable; + use crate::topology::{ClusterTopology, NodeInfo, NodeState}; + use std::net::SocketAddr; + use std::sync::atomic::{AtomicU64, Ordering}; + use std::sync::{Arc, Mutex}; + + struct RecordingProposer { + committed: Mutex>, + counter: AtomicU64, + } + + impl RecordingProposer { + fn new() -> Arc { + Arc::new(Self { + committed: Mutex::new(Vec::new()), + counter: AtomicU64::new(0), + }) + } + } + + #[async_trait] + impl MetadataProposer for RecordingProposer { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result { + let idx = self.counter.fetch_add(1, Ordering::SeqCst) + 1; + self.committed.lock().unwrap().push(entry); + Ok(idx) + } + } + + struct FailingProposer { + fail_after: usize, + counter: AtomicU64, + } + + #[async_trait] + impl MetadataProposer for FailingProposer { + async fn propose_and_wait(&self, _entry: MetadataEntry) -> Result { + let 
n = self.counter.fetch_add(1, Ordering::SeqCst); + if n as usize >= self.fail_after { + return Err(ClusterError::Transport { + detail: "injected failure".into(), + }); + } + Ok(n + 1) + } + } + + fn three_node_plan() -> DecommissionPlan { + let mut t = ClusterTopology::new(); + for (i, id) in [1u64, 2, 3].iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + let routing = RoutingTable::uniform(2, &[1, 2, 3], 3); + plan_full_decommission(1, &t, &routing, 2).unwrap() + } + + #[tokio::test] + async fn coordinator_proposes_every_entry_in_order() { + let plan = three_node_plan(); + let expected = plan.entries.clone(); + let proposer = RecordingProposer::new(); + let coord = DecommissionCoordinator::new(plan, proposer.clone()); + let result = coord.run().await.unwrap(); + + assert_eq!(result.node_id, 1); + assert_eq!(result.entries_committed, expected.len()); + let committed = proposer.committed.lock().unwrap().clone(); + assert_eq!(committed, expected); + } + + #[tokio::test] + async fn coordinator_aborts_on_proposer_error() { + let plan = three_node_plan(); + let proposer = FailingProposer { + fail_after: 2, + counter: AtomicU64::new(0), + }; + let coord = DecommissionCoordinator::new(plan, proposer); + let err = coord.run().await.unwrap_err(); + assert!(err.to_string().contains("injected failure")); + } + + #[tokio::test] + async fn coordinator_reports_last_applied_index() { + let plan = three_node_plan(); + let proposer = RecordingProposer::new(); + let coord = DecommissionCoordinator::new(plan, proposer.clone()); + let result = coord.run().await.unwrap(); + // The recording proposer returns monotonically increasing + // indexes starting from 1; the last one equals the total + // entry count. 
+ assert_eq!(result.last_applied_index, result.entries_committed as u64); + } + + /// Sanity: the plan's shape is preserved end to end — the + /// recording proposer sees the same `StartDecommission` / + /// `FinishDecommission` / `Leave` bookends. + #[tokio::test] + async fn coordinator_preserves_bookends() { + let plan = three_node_plan(); + let proposer = RecordingProposer::new(); + let coord = DecommissionCoordinator::new(plan, proposer.clone()); + coord.run().await.unwrap(); + + let committed = proposer.committed.lock().unwrap().clone(); + assert!(matches!( + committed.first(), + Some(MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 1 } + )) + )); + assert!(matches!( + committed.last(), + Some(MetadataEntry::TopologyChange(TopologyChange::Leave { + node_id: 1 + })) + )); + // At least one RemoveMember for the target. + assert!(committed.iter().any(|e| matches!( + e, + MetadataEntry::RoutingChange(RoutingChange::RemoveMember { node_id: 1, .. }) + ))); + } +} diff --git a/nodedb-cluster/src/decommission/flow.rs b/nodedb-cluster/src/decommission/flow.rs new file mode 100644 index 00000000..aa2c30c3 --- /dev/null +++ b/nodedb-cluster/src/decommission/flow.rs @@ -0,0 +1,227 @@ +//! Decommission flow — emit the full ordered sequence of metadata +//! entries that move a node from `Active` to fully removed. +//! +//! [`plan_full_decommission`] is pure: given a snapshot of topology +//! and routing, it returns the exact list of +//! [`MetadataEntry`](crate::metadata_group::MetadataEntry) values the +//! coordinator will propose through the metadata Raft group, in the +//! order they must commit. The flow is deterministic — two nodes +//! looking at the same snapshot produce byte-identical plans, which +//! means a failed coordinator can be resumed from any consistent +//! snapshot without needing per-plan state to be replicated. 
+ +use crate::error::Result; +use crate::metadata_group::{MetadataEntry, RoutingChange, TopologyChange}; +use crate::routing::RoutingTable; +use crate::topology::ClusterTopology; + +use super::safety::check_can_decommission; + +/// Output of [`plan_full_decommission`] — the caller proposes +/// `entries` in order, waiting for each to commit before moving on. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DecommissionPlan { + pub node_id: u64, + pub entries: Vec, +} + +/// Build the complete decommission plan for `node_id`. +/// +/// Steps (in the order they appear in the returned `entries`): +/// +/// 1. `TopologyChange::StartDecommission` — flip the target to +/// `Draining`. +/// 2. `RoutingChange::LeadershipTransfer` — for every group the +/// target currently leads, hand leadership to another voter. +/// 3. `RoutingChange::RemoveMember` — strip the target out of every +/// group's member (and learner) list. +/// 4. `TopologyChange::FinishDecommission` — flip the target to +/// `Decommissioned`. +/// 5. `TopologyChange::Leave` — remove the target from topology +/// entirely so future peer lookups return `NodeNotFound`. +/// +/// The safety gate in [`check_can_decommission`] runs first and +/// returns an error without producing a plan if any group would drop +/// below the configured replication factor. +pub fn plan_full_decommission( + node_id: u64, + topology: &ClusterTopology, + routing: &RoutingTable, + replication_factor: usize, +) -> Result { + check_can_decommission(node_id, topology, routing, replication_factor)?; + + let mut entries = Vec::new(); + entries.push(MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id }, + )); + + // Collect a stable, sorted group_id ordering so the plan is + // reproducible across HashMap iterations. 
+ let mut group_ids: Vec = routing + .group_members() + .iter() + .filter(|(_, info)| info.members.contains(&node_id) || info.learners.contains(&node_id)) + .map(|(gid, _)| *gid) + .collect(); + group_ids.sort_unstable(); + + // 2. Leadership transfers for every group the target currently leads. + for gid in &group_ids { + let info = routing + .group_info(*gid) + .expect("group id came from routing snapshot"); + if info.leader != node_id { + continue; + } + if let Some(&new_leader) = info.members.iter().find(|&&m| m != node_id) { + entries.push(MetadataEntry::RoutingChange( + RoutingChange::LeadershipTransfer { + group_id: *gid, + new_leader_node_id: new_leader, + }, + )); + } + } + + // 3. Remove the target from every group's member and learner sets. + for gid in &group_ids { + entries.push(MetadataEntry::RoutingChange(RoutingChange::RemoveMember { + group_id: *gid, + node_id, + })); + } + + // 4. Finish decommission (topology state → Decommissioned). + entries.push(MetadataEntry::TopologyChange( + TopologyChange::FinishDecommission { node_id }, + )); + + // 5. Leave — remove from topology entirely. + entries.push(MetadataEntry::TopologyChange(TopologyChange::Leave { + node_id, + })); + + Ok(DecommissionPlan { node_id, entries }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::{NodeInfo, NodeState}; + use std::net::SocketAddr; + + fn topo(nodes: &[u64]) -> ClusterTopology { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let addr: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, addr, NodeState::Active)); + } + t + } + + #[test] + fn plan_shape_matches_spec() { + let t = topo(&[1, 2, 3]); + // 2 groups, RF=3 (each group has all 3 nodes). Decommission + // 1 with RF=2 (the surviving quorum). 
+ let routing = RoutingTable::uniform(2, &[1, 2, 3], 3); + let plan = plan_full_decommission(1, &t, &routing, 2).unwrap(); + assert_eq!(plan.node_id, 1); + + // First entry: StartDecommission. + assert!(matches!( + plan.entries.first(), + Some(MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 1 } + )) + )); + + // Last two entries: FinishDecommission, Leave. + let n = plan.entries.len(); + assert!(matches!( + plan.entries[n - 2], + MetadataEntry::TopologyChange(TopologyChange::FinishDecommission { node_id: 1 }) + )); + assert!(matches!( + plan.entries[n - 1], + MetadataEntry::TopologyChange(TopologyChange::Leave { node_id: 1 }) + )); + + // Every group the target is in must get a RemoveMember. + let remove_count = plan + .entries + .iter() + .filter(|e| { + matches!( + e, + MetadataEntry::RoutingChange(RoutingChange::RemoveMember { node_id: 1, .. }) + ) + }) + .count(); + assert_eq!(remove_count, 2); + } + + #[test] + fn plan_emits_leadership_transfer_when_target_leads() { + let t = topo(&[1, 2, 3]); + let mut routing = RoutingTable::uniform(2, &[1, 2, 3], 3); + routing.set_leader(0, 1); + routing.set_leader(1, 2); + let plan = plan_full_decommission(1, &t, &routing, 2).unwrap(); + // Exactly one LeadershipTransfer for group 0. 
+ let transfers: Vec<_> = plan + .entries + .iter() + .filter_map(|e| match e { + MetadataEntry::RoutingChange(RoutingChange::LeadershipTransfer { + group_id, + new_leader_node_id, + }) => Some((*group_id, *new_leader_node_id)), + _ => None, + }) + .collect(); + assert_eq!(transfers.len(), 1); + assert_eq!(transfers[0].0, 0); + assert_ne!(transfers[0].1, 1, "new leader must not be the target"); + } + + #[test] + fn plan_is_deterministic() { + let t = topo(&[1, 2, 3]); + let routing = RoutingTable::uniform(4, &[1, 2, 3], 3); + let p1 = plan_full_decommission(2, &t, &routing, 2).unwrap(); + let p2 = plan_full_decommission(2, &t, &routing, 2).unwrap(); + assert_eq!(p1.entries, p2.entries); + } + + #[test] + fn plan_rejected_when_safety_fails() { + let t = topo(&[1, 2]); + let routing = RoutingTable::uniform(2, &[1, 2], 2); + let err = plan_full_decommission(1, &t, &routing, 2).unwrap_err(); + assert!(err.to_string().contains("replication factor")); + } + + #[test] + fn plan_skips_groups_target_is_not_in() { + let t = topo(&[1, 2, 3]); + let mut routing = RoutingTable::uniform(4, &[1, 2, 3], 3); + routing.set_group_members(0, vec![2, 3]); + routing.set_group_members(1, vec![2, 3]); + routing.set_group_members(2, vec![1, 2, 3]); + routing.set_group_members(3, vec![1, 2, 3]); + let plan = plan_full_decommission(1, &t, &routing, 2).unwrap(); + let removes: Vec = plan + .entries + .iter() + .filter_map(|e| match e { + MetadataEntry::RoutingChange(RoutingChange::RemoveMember { group_id, .. }) => { + Some(*group_id) + } + _ => None, + }) + .collect(); + assert_eq!(removes, vec![2, 3]); + } +} diff --git a/nodedb-cluster/src/decommission/mod.rs b/nodedb-cluster/src/decommission/mod.rs new file mode 100644 index 00000000..3b0bd5c4 --- /dev/null +++ b/nodedb-cluster/src/decommission/mod.rs @@ -0,0 +1,35 @@ +//! Decommission flow — graceful removal of a node from the cluster. +//! +//! Decommission is a multi-step, metadata-raft-replicated process: +//! +//! 1. 
**Safety gate** — [`safety::check_can_decommission`] refuses the +//! decommission if any Raft group the target is in would drop below +//! the configured replication factor after its removal. This is +//! the only correctness-critical check — once it passes, every +//! subsequent step is just routing/topology bookkeeping. +//! 2. **Plan** — [`flow::plan_full_decommission`] emits the full ordered +//! sequence of [`MetadataEntry`](crate::metadata_group::MetadataEntry) +//! values the coordinator will propose: `StartDecommission`, any +//! required leadership transfers, a `RemoveMember` per group, then +//! `FinishDecommission` and `Leave`. +//! 3. **Propose** (`coordinator.rs`) — stateful actor +//! proposes each entry in order through a `MetadataProposer` trait, +//! waiting for the applied index to advance past each commit before +//! advancing its own state. +//! 4. **Observe** (`observer.rs`) — the target node +//! watches its own topology state and fires a cooperative shutdown +//! signal when it transitions to `Decommissioned`. +//! +//! Steps 1 and 2 are pure, side-effect-free functions, so the +//! flow can be exhaustively unit-tested independently of the +//! stateful coordinator and observer above. + +pub mod coordinator; +pub mod flow; +pub mod observer; +pub mod safety; + +pub use coordinator::{DecommissionCoordinator, DecommissionRunResult, MetadataProposer}; +pub use flow::{DecommissionPlan, plan_full_decommission}; +pub use observer::DecommissionObserver; +pub use safety::{DecommissionSafetyError, check_can_decommission}; diff --git a/nodedb-cluster/src/decommission/observer.rs b/nodedb-cluster/src/decommission/observer.rs new file mode 100644 index 00000000..d3034c80 --- /dev/null +++ b/nodedb-cluster/src/decommission/observer.rs @@ -0,0 +1,196 @@ +//! `DecommissionObserver` — local-node self-shutdown signal. +//! +//! The coordinator proposes a full decommission plan through the +//! metadata Raft group.
Every node (including the target itself) +//! applies the resulting entries through `CacheApplier`, which, when +//! attached with [`CacheApplier::with_live_state`](crate::metadata_group::CacheApplier::with_live_state), +//! cascades topology state transitions into the live +//! `Arc>` handle. +//! +//! The observer polls that handle for the *local* node id. Once the +//! node's own state reaches `Decommissioned` — or the node has been +//! removed from topology entirely by a committed `Leave` — the +//! observer flips a `tokio::sync::watch` channel to `true`, which is +//! the cooperative shutdown signal every long-lived background task +//! on this node is already listening on. +//! +//! This is the last link in the decommission chain: once the watch +//! is flipped, the raft loops, SWIM detector, reachability driver, +//! and transport accept loops all drain and exit on their own. + +use std::sync::{Arc, RwLock}; +use std::time::Duration; + +use tokio::sync::watch; +use tokio::time::interval; +use tracing::{info, warn}; + +use crate::topology::{ClusterTopology, NodeState}; + +/// Periodically checks the local node's topology state and fires a +/// shutdown signal on `Decommissioned` or removal. +pub struct DecommissionObserver { + topology: Arc>, + local_node_id: u64, + shutdown_tx: watch::Sender, + poll_interval: Duration, +} + +impl DecommissionObserver { + /// Build an observer and return it alongside the receiver half of + /// its shutdown watch channel. Every subsystem that wants to + /// cooperatively drain on decommission can call + /// [`watch::Receiver::clone`] on the returned receiver. + pub fn new( + topology: Arc>, + local_node_id: u64, + poll_interval: Duration, + ) -> (Self, watch::Receiver) { + let (shutdown_tx, shutdown_rx) = watch::channel(false); + ( + Self { + topology, + local_node_id, + shutdown_tx, + poll_interval, + }, + shutdown_rx, + ) + } + + /// Single check. 
Returns `true` iff the observer fired the + /// shutdown signal during this call (or had already fired it + /// previously — the watch is level-triggered, not edge). + pub fn check_once(&self) -> bool { + if *self.shutdown_tx.borrow() { + return true; + } + let topo = self.topology.read().unwrap_or_else(|p| p.into_inner()); + let should_fire = match topo.get_node(self.local_node_id) { + Some(node) => node.state == NodeState::Decommissioned, + // Node is gone from topology — either a committed `Leave` + // (post-decommission) or manual removal. Either way, we + // are no longer part of the cluster. + None => true, + }; + if should_fire { + info!( + local_node_id = self.local_node_id, + "decommission observer firing local shutdown signal" + ); + if let Err(e) = self.shutdown_tx.send(true) { + warn!(error = %e, "shutdown watch receivers all dropped"); + } + return true; + } + false + } + + /// Run the observer's poll loop until `cancel` flips to `true`. + /// Exits immediately after firing its own shutdown signal — + /// there is nothing more to watch. + pub async fn run(self, mut cancel: watch::Receiver) { + let mut tick = interval(self.poll_interval); + tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + loop { + tokio::select! 
{ + biased; + changed = cancel.changed() => { + if changed.is_ok() && *cancel.borrow() { + return; + } + } + _ = tick.tick() => { + if self.check_once() { + return; + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::NodeInfo; + use std::net::SocketAddr; + + fn topo_with(node_id: u64, state: NodeState) -> Arc> { + let mut t = ClusterTopology::new(); + let addr: SocketAddr = "127.0.0.1:9000".parse().unwrap(); + t.add_node(NodeInfo::new(node_id, addr, state)); + Arc::new(RwLock::new(t)) + } + + #[test] + fn check_once_does_not_fire_while_active() { + let topo = topo_with(5, NodeState::Active); + let (obs, _rx) = DecommissionObserver::new(topo, 5, Duration::from_millis(10)); + assert!(!obs.check_once()); + } + + #[test] + fn check_once_fires_on_decommissioned_state() { + let topo = topo_with(5, NodeState::Active); + let (obs, mut rx) = DecommissionObserver::new(topo.clone(), 5, Duration::from_millis(10)); + assert!(!obs.check_once()); + topo.write() + .unwrap() + .set_state(5, NodeState::Decommissioned); + assert!(obs.check_once()); + assert!(*rx.borrow_and_update()); + } + + #[test] + fn check_once_fires_when_node_removed_from_topology() { + let topo = topo_with(5, NodeState::Active); + let (obs, _rx) = DecommissionObserver::new(topo.clone(), 5, Duration::from_millis(10)); + topo.write().unwrap().remove_node(5); + assert!(obs.check_once()); + } + + #[test] + fn check_once_is_idempotent_after_firing() { + let topo = topo_with(5, NodeState::Decommissioned); + let (obs, _rx) = DecommissionObserver::new(topo, 5, Duration::from_millis(10)); + assert!(obs.check_once()); + // Second call sees the fired signal and reports true again. 
+ assert!(obs.check_once()); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_fires_shutdown_and_exits() { + let topo = topo_with(5, NodeState::Active); + let (obs, mut rx) = DecommissionObserver::new(topo.clone(), 5, Duration::from_millis(50)); + let (_cancel_tx, cancel_rx) = watch::channel(false); + let handle = tokio::spawn(async move { obs.run(cancel_rx).await }); + + // Advance twice — first tick = no-op, then flip state. + tokio::time::advance(Duration::from_millis(60)).await; + tokio::task::yield_now().await; + topo.write() + .unwrap() + .set_state(5, NodeState::Decommissioned); + tokio::time::advance(Duration::from_millis(60)).await; + tokio::task::yield_now().await; + + let _ = tokio::time::timeout(Duration::from_millis(500), handle) + .await + .expect("observer run loop did not exit"); + assert!(*rx.borrow_and_update()); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_exits_on_cancel_without_firing() { + let topo = topo_with(5, NodeState::Active); + let (obs, rx) = DecommissionObserver::new(topo, 5, Duration::from_millis(50)); + let (cancel_tx, cancel_rx) = watch::channel(false); + let handle = tokio::spawn(async move { obs.run(cancel_rx).await }); + let _ = cancel_tx.send(true); + let _ = tokio::time::timeout(Duration::from_millis(500), handle) + .await + .expect("cancel did not end run loop"); + assert!(!*rx.borrow()); + } +} diff --git a/nodedb-cluster/src/decommission/safety.rs b/nodedb-cluster/src/decommission/safety.rs new file mode 100644 index 00000000..91533a34 --- /dev/null +++ b/nodedb-cluster/src/decommission/safety.rs @@ -0,0 +1,172 @@ +//! Decommission safety gate. +//! +//! Before the coordinator proposes a single metadata entry, it must +//! prove that removing the target node from every Raft group it +//! belongs to will leave each group with at least `replication_factor` +//! voting members. Dropping below RF silently is a data-loss bug — +//! this module is the only place that decision is made. 
+ +use crate::error::{ClusterError, Result}; +use crate::routing::RoutingTable; +use crate::topology::{ClusterTopology, NodeState}; + +/// Why a decommission request was rejected. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum DecommissionSafetyError { + /// The target node id does not exist in the topology. + NodeNotFound { node_id: u64 }, + /// The node is already past the point of decommission. + AlreadyDecommissioned { node_id: u64 }, + /// Removing the node would leave this group below `replication_factor` + /// voters. The decommission must wait until a new voter has been + /// added to the group (via rebalance / migration executor). + WouldViolateReplicationFactor { + node_id: u64, + group_id: u64, + current_voters: usize, + replication_factor: usize, + }, +} + +impl std::fmt::Display for DecommissionSafetyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NodeNotFound { node_id } => { + write!(f, "node {node_id} not found in topology") + } + Self::AlreadyDecommissioned { node_id } => { + write!(f, "node {node_id} is already decommissioned") + } + Self::WouldViolateReplicationFactor { + node_id, + group_id, + current_voters, + replication_factor, + } => write!( + f, + "removing node {node_id} from group {group_id} \ + would leave {} voter(s), below replication factor {replication_factor}", + current_voters.saturating_sub(1) + ), + } + } +} + +impl std::error::Error for DecommissionSafetyError {} + +impl From for ClusterError { + fn from(value: DecommissionSafetyError) -> Self { + ClusterError::Transport { + detail: value.to_string(), + } + } +} + +/// Verify that node `node_id` can be safely stripped out of every +/// group it participates in without dropping any group below +/// `replication_factor` voters. +/// +/// This check is purely structural — it looks at the current routing +/// table, not the live cluster. 
Callers must re-run it immediately +/// before proposing each step if the topology may have shifted since +/// the plan was computed. +pub fn check_can_decommission( + node_id: u64, + topology: &ClusterTopology, + routing: &RoutingTable, + replication_factor: usize, +) -> Result<()> { + let node = topology + .get_node(node_id) + .ok_or(DecommissionSafetyError::NodeNotFound { node_id })?; + + if node.state == NodeState::Decommissioned { + return Err(DecommissionSafetyError::AlreadyDecommissioned { node_id }.into()); + } + + for (group_id, info) in routing.group_members() { + if !info.members.contains(&node_id) { + continue; + } + let current_voters = info.members.len(); + // After removal the group would have `current_voters - 1` + // voters. Require that to be at least `replication_factor`. + if current_voters.saturating_sub(1) < replication_factor { + return Err(DecommissionSafetyError::WouldViolateReplicationFactor { + node_id, + group_id: *group_id, + current_voters, + replication_factor, + } + .into()); + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::NodeInfo; + use std::net::SocketAddr; + + fn topo(nodes: &[u64]) -> ClusterTopology { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let addr: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, addr, NodeState::Active)); + } + t + } + + #[test] + fn rejects_unknown_node() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(2, &[1, 2, 3], 3); + let err = check_can_decommission(99, &t, &r, 2).unwrap_err(); + assert!(err.to_string().contains("99")); + } + + #[test] + fn rejects_already_decommissioned() { + let mut t = topo(&[1, 2, 3]); + t.set_state(1, NodeState::Decommissioned); + let r = RoutingTable::uniform(2, &[1, 2, 3], 3); + let err = check_can_decommission(1, &t, &r, 2).unwrap_err(); + assert!(err.to_string().contains("already decommissioned")); + } + + #[test] + fn 
rejects_when_rf_would_be_violated() { + let t = topo(&[1, 2]); + // RF=2 with only 2 nodes → every group has exactly 2 voters. + // Removing either one would leave 1 voter (< RF=2). + let r = RoutingTable::uniform(2, &[1, 2], 2); + let err = check_can_decommission(1, &t, &r, 2).unwrap_err(); + assert!(err.to_string().contains("replication factor")); + } + + #[test] + fn accepts_when_extra_voter_available() { + let t = topo(&[1, 2, 3]); + // Every group holds all 3 nodes as voters, so removing one + // leaves 2 — exactly the requested RF=2. The safety check + // only looks at current membership; it never reasons about + // candidate replacement voters elsewhere in the topology. + let r = RoutingTable::uniform(2, &[1, 2, 3], 3); + check_can_decommission(1, &t, &r, 2).unwrap(); + } + + #[test] + fn skips_groups_target_is_not_member_of() { + let t = topo(&[1, 2, 3]); + // Node 1 is only in group 0, node 2 is only in group 1. + let mut r = RoutingTable::uniform(2, &[1, 2, 3], 3); + r.set_group_members(0, vec![1, 3]); + r.set_group_members(1, vec![2, 3]); + // Decommission 1 with RF=1 → group 0 drops to [3], group 1 + // untouched.
+ check_can_decommission(1, &t, &r, 1).unwrap(); + } +} diff --git a/nodedb-cluster/src/lib.rs b/nodedb-cluster/src/lib.rs index 169864ad..e38e1591 100644 --- a/nodedb-cluster/src/lib.rs +++ b/nodedb-cluster/src/lib.rs @@ -4,6 +4,7 @@ pub mod circuit_breaker; pub mod cluster_info; pub mod conf_change; pub mod cross_shard_txn; +pub mod decommission; pub mod distributed_document; pub mod distributed_graph; pub mod distributed_join; @@ -25,6 +26,7 @@ pub mod quic_transport; pub mod raft_loop; pub mod raft_storage; pub mod rdma_transport; +pub mod reachability; pub mod readiness; pub mod rebalance; pub mod rebalance_scheduler; @@ -44,6 +46,10 @@ pub use cluster_info::{ ClusterInfoSnapshot, ClusterObserver, GroupSnapshot, GroupStatusProvider, PeerSnapshot, }; pub use conf_change::{ConfChange, ConfChangeType}; +pub use decommission::{ + DecommissionCoordinator, DecommissionObserver, DecommissionPlan, DecommissionRunResult, + DecommissionSafetyError, MetadataProposer, check_can_decommission, plan_full_decommission, +}; pub use error::{ClusterError, Result}; pub use forward::{NoopPlanExecutor, PlanExecutor}; pub use ghost::{GhostStub, GhostTable}; @@ -55,6 +61,9 @@ pub use migration_executor::{ }; pub use multi_raft::{GroupStatus, MultiRaft}; pub use raft_loop::{CommitApplier, RaftLoop, VShardEnvelopeHandler}; +pub use reachability::{ + NoopProber, ReachabilityDriver, ReachabilityDriverConfig, ReachabilityProber, TransportProber, +}; pub use rebalance::{RebalancePlan, compute_plan, plan_to_requests}; pub use routing::RoutingTable; pub use routing_liveness::{NodeIdResolver, RoutingLivenessHook}; diff --git a/nodedb-cluster/src/lifecycle.rs b/nodedb-cluster/src/lifecycle.rs index 28dd6bd2..43966b8c 100644 --- a/nodedb-cluster/src/lifecycle.rs +++ b/nodedb-cluster/src/lifecycle.rs @@ -15,7 +15,7 @@ use tracing::{info, warn}; use crate::error::{ClusterError, Result}; -use crate::metadata_group::{MetadataEntry, RoutingChange, TopologyChange}; +use 
crate::metadata_group::{MetadataEntry, TopologyChange}; use crate::routing::RoutingTable; use crate::topology::{ClusterTopology, NodeInfo, NodeState}; @@ -27,55 +27,34 @@ pub struct DecommissionResult { pub completed: bool, } -/// Plan a node decommission: compute which vShards to migrate and where. -/// -/// Produces a sequence of [`MetadataEntry`] values to be proposed against -/// the metadata Raft group in order. Steps: -/// 1. Start decommission (topology transition). -/// 2. Transfer leadership of all Raft groups led by this node. +/// Plan a node decommission — thin wrapper over +/// [`crate::decommission::plan_full_decommission`] that returns the +/// full ordered sequence of metadata entries. Kept as a public +/// convenience for older call sites; new code should use the +/// `decommission` module directly. pub fn plan_decommission( node_id: u64, topology: &ClusterTopology, routing: &RoutingTable, ) -> Result> { - let node = topology.get_node(node_id).ok_or(ClusterError::Transport { - detail: format!("node {node_id} not found in topology"), - })?; - - if node.state == NodeState::Decommissioned { - return Err(ClusterError::Transport { - detail: format!("node {node_id} is already decommissioned"), - }); - } - - let mut entries = Vec::new(); - - // Step 1: Start decommission. - entries.push(MetadataEntry::TopologyChange( - TopologyChange::StartDecommission { node_id }, - )); - - // Step 2: Leadership transfers for groups led by this node. 
- for group_id in routing.group_ids() { - if let Some(info) = routing.group_info(group_id) - && info.leader == node_id - && let Some(&new_leader) = info.members.iter().find(|&&m| m != node_id) - { - entries.push(MetadataEntry::RoutingChange( - RoutingChange::LeadershipTransfer { - group_id, - new_leader_node_id: new_leader, - }, - )); - } - } - + // Historical callers assumed the full-cluster RF; derive a safe + // lower bound from the smallest existing group so the check is + // never stricter than the cluster is already running under. + let rf = routing + .group_members() + .values() + .map(|info| info.members.len()) + .min() + .unwrap_or(1) + .saturating_sub(1) + .max(1); + let plan = crate::decommission::plan_full_decommission(node_id, topology, routing, rf)?; info!( node_id, - metadata_entries = entries.len(), + metadata_entries = plan.entries.len(), "decommission plan computed" ); - Ok(entries) + Ok(plan.entries) } /// Check if a node can be safely removed from the cluster. diff --git a/nodedb-cluster/src/metadata_group/applier.rs b/nodedb-cluster/src/metadata_group/applier.rs index fd169f04..8f5db1db 100644 --- a/nodedb-cluster/src/metadata_group/applier.rs +++ b/nodedb-cluster/src/metadata_group/applier.rs @@ -1,12 +1,16 @@ //! [`MetadataApplier`] trait: the contract raft_loop uses to dispatch //! committed entries on the metadata group (group 0). +use std::net::SocketAddr; use std::sync::{Arc, RwLock}; use tracing::warn; use crate::metadata_group::cache::MetadataCache; use crate::metadata_group::codec::decode_entry; +use crate::metadata_group::entry::{MetadataEntry, RoutingChange, TopologyChange}; +use crate::routing::RoutingTable; +use crate::topology::{ClusterTopology, NodeInfo, NodeState}; /// Applies committed metadata entries to local state. /// @@ -29,16 +33,108 @@ pub trait MetadataApplier: Send + Sync + 'static { #[derive(Clone)] pub struct CacheApplier { cache: Arc>, + /// Optional live topology handle. 
When set, committed + /// `TopologyChange` entries mutate this handle in place so the + /// rest of the process sees the new state immediately — decommission + /// state transitions, joiner promotion, and `Leave` removal all + /// flow through here. + live_topology: Option>>, + /// Optional live routing table handle. When set, committed + /// `RoutingChange` entries (leadership transfer, member removal, + /// vshard reassignment) mutate this handle in place. + live_routing: Option>>, } impl CacheApplier { pub fn new(cache: Arc>) -> Self { - Self { cache } + Self { + cache, + live_topology: None, + live_routing: None, + } + } + + /// Extend this applier with live topology/routing handles. When + /// set, committed `TopologyChange` and `RoutingChange` entries + /// mutate the handles in place in addition to the in-memory + /// history log kept in `MetadataCache`. Backward-compatible: + /// existing callers that don't attach handles see no behaviour + /// change. + pub fn with_live_state( + mut self, + topology: Arc>, + routing: Arc>, + ) -> Self { + self.live_topology = Some(topology); + self.live_routing = Some(routing); + self } pub fn cache(&self) -> Arc> { self.cache.clone() } + + /// Mutate the live topology handle (if attached) in response to + /// a committed `TopologyChange`. Silent no-op when no handle is + /// set — backward-compatible with older test wiring. 
+ fn apply_topology_change(&self, change: &TopologyChange) { + let Some(live) = &self.live_topology else { + return; + }; + let mut topo = live.write().unwrap_or_else(|p| p.into_inner()); + match change { + TopologyChange::Join { node_id, addr } => { + if topo.contains(*node_id) { + return; + } + let parsed: SocketAddr = addr.parse().unwrap_or_else(|_| { + warn!(node_id, addr, "join: invalid address, using placeholder"); + SocketAddr::from(([0, 0, 0, 0], 0)) + }); + topo.join_as_learner(NodeInfo::new(*node_id, parsed, NodeState::Joining)); + } + TopologyChange::PromoteToVoter { node_id } => { + topo.promote_to_voter(*node_id); + } + TopologyChange::StartDecommission { node_id } => { + topo.set_state(*node_id, NodeState::Draining); + } + TopologyChange::FinishDecommission { node_id } => { + topo.set_state(*node_id, NodeState::Decommissioned); + } + TopologyChange::Leave { node_id } => { + topo.remove_node(*node_id); + } + } + } + + /// Mutate the live routing handle (if attached) in response to + /// a committed `RoutingChange`. + fn apply_routing_change(&self, change: &RoutingChange) { + let Some(live) = &self.live_routing else { + return; + }; + let mut rt = live.write().unwrap_or_else(|p| p.into_inner()); + match change { + RoutingChange::ReassignVShard { + vshard_id, + new_group_id, + new_leaseholder_node_id, + } => { + rt.reassign_vshard(*vshard_id, *new_group_id); + rt.set_leader(*new_group_id, *new_leaseholder_node_id); + } + RoutingChange::LeadershipTransfer { + group_id, + new_leader_node_id, + } => { + rt.set_leader(*group_id, *new_leader_node_id); + } + RoutingChange::RemoveMember { group_id, node_id } => { + rt.remove_group_member(*group_id, *node_id); + } + } + } } impl MetadataApplier for CacheApplier { @@ -54,7 +150,15 @@ impl MetadataApplier for CacheApplier { continue; } match decode_entry(data) { - Ok(entry) => guard.apply(*index, &entry), + Ok(entry) => { + guard.apply(*index, &entry); + // Cascade to live state (if attached). 
+ match &entry { + MetadataEntry::TopologyChange(change) => self.apply_topology_change(change), + MetadataEntry::RoutingChange(change) => self.apply_routing_change(change), + _ => {} + } + } Err(e) => warn!(index = *index, error = %e, "metadata decode failed"), } } @@ -120,6 +224,72 @@ mod tests { assert_eq!(guard.catalog_entries_applied, 1); } + #[test] + fn cache_applier_mutates_live_topology_on_start_decommission() { + use crate::topology::{ClusterTopology, NodeInfo, NodeState}; + use std::net::SocketAddr; + + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let mut t = ClusterTopology::new(); + let addr: SocketAddr = "127.0.0.1:9000".parse().unwrap(); + t.add_node(NodeInfo::new(7, addr, NodeState::Active)); + let topology = Arc::new(RwLock::new(t)); + let routing = Arc::new(RwLock::new(crate::routing::RoutingTable::uniform( + 1, + &[7], + 1, + ))); + let applier = + CacheApplier::new(cache.clone()).with_live_state(topology.clone(), routing.clone()); + + let bytes = encode_entry(&MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 7 }, + )) + .unwrap(); + applier.apply(&[(1, bytes)]); + + let topo = topology.read().unwrap(); + assert_eq!(topo.get_node(7).unwrap().state, NodeState::Draining); + } + + #[test] + fn cache_applier_mutates_live_routing_on_remove_member() { + use crate::metadata_group::entry::RoutingChange; + + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let topology = Arc::new(RwLock::new(crate::topology::ClusterTopology::new())); + let routing = Arc::new(RwLock::new(crate::routing::RoutingTable::uniform( + 1, + &[1, 2, 3], + 3, + ))); + let applier = + CacheApplier::new(cache.clone()).with_live_state(topology.clone(), routing.clone()); + + let bytes = encode_entry(&MetadataEntry::RoutingChange(RoutingChange::RemoveMember { + group_id: 0, + node_id: 2, + })) + .unwrap(); + applier.apply(&[(1, bytes)]); + + let rt = routing.read().unwrap(); + assert!(!rt.group_info(0).unwrap().members.contains(&2)); + 
} + + #[test] + fn cache_applier_without_live_state_stays_log_only() { + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let applier = CacheApplier::new(cache.clone()); + let bytes = encode_entry(&MetadataEntry::TopologyChange( + TopologyChange::StartDecommission { node_id: 5 }, + )) + .unwrap(); + // Must not panic and must still advance the applied index. + let last = applier.apply(&[(1, bytes)]); + assert_eq!(last, 1); + } + #[test] fn noop_applier_advances_watermark() { let noop = NoopMetadataApplier; diff --git a/nodedb-cluster/src/metadata_group/entry.rs b/nodedb-cluster/src/metadata_group/entry.rs index 301a9bf8..c751796e 100644 --- a/nodedb-cluster/src/metadata_group/entry.rs +++ b/nodedb-cluster/src/metadata_group/entry.rs @@ -123,4 +123,11 @@ pub enum RoutingChange { group_id: u64, new_leader_node_id: u64, }, + /// Remove a node from a Raft group's member and learner sets. + /// + /// Used by the decommission flow to strip a draining node out of + /// every group it belongs to. Proposing this is only safe once + /// `safety::check_can_decommission` has confirmed the group will + /// still satisfy the configured replication factor. + RemoveMember { group_id: u64, node_id: u64 }, } diff --git a/nodedb-cluster/src/routing.rs b/nodedb-cluster/src/routing.rs index fdaaab83..1587bd37 100644 --- a/nodedb-cluster/src/routing.rs +++ b/nodedb-cluster/src/routing.rs @@ -155,6 +155,28 @@ impl RoutingTable { } } + /// Remove a node from a group's voter and learner lists. If the + /// removed node was the current leader hint, the hint is cleared + /// so the next query drives a fresh discovery. Returns `true` if + /// the group existed and anything was actually removed. + /// + /// The caller is responsible for safety: dropping below the + /// configured replication factor must be gated by + /// `decommission::safety::check_can_decommission`. 
+ pub fn remove_group_member(&mut self, group_id: u64, node_id: u64) -> bool { + let Some(info) = self.group_members.get_mut(&group_id) else { + return false; + }; + let before_members = info.members.len(); + let before_learners = info.learners.len(); + info.members.retain(|&id| id != node_id); + info.learners.retain(|&id| id != node_id); + if info.leader == node_id { + info.leader = 0; + } + info.members.len() != before_members || info.learners.len() != before_learners + } + /// Update the learner list for a Raft group. pub fn set_group_learners(&mut self, group_id: u64, learners: Vec) { if let Some(info) = self.group_members.get_mut(&group_id) { @@ -274,6 +296,31 @@ mod tests { assert_eq!(rt.leader_for_vshard(0).unwrap(), 99); } + #[test] + fn remove_group_member_strips_voter_and_clears_leader() { + let mut rt = RoutingTable::uniform(2, &[1, 2, 3], 3); + rt.set_leader(0, 2); + assert!(rt.remove_group_member(0, 2)); + let info = rt.group_info(0).unwrap(); + assert!(!info.members.contains(&2)); + assert_eq!(info.leader, 0, "leader hint should be cleared"); + } + + #[test] + fn remove_group_member_strips_learner_only() { + let mut rt = RoutingTable::uniform(2, &[1, 2, 3], 3); + rt.add_group_learner(0, 9); + assert!(rt.remove_group_member(0, 9)); + let info = rt.group_info(0).unwrap(); + assert!(!info.learners.contains(&9)); + } + + #[test] + fn remove_group_member_unknown_group_returns_false() { + let mut rt = RoutingTable::uniform(1, &[1, 2], 2); + assert!(!rt.remove_group_member(99, 1)); + } + #[test] fn vshard_not_mapped() { let rt = RoutingTable::uniform(2, &[1, 2], 2); diff --git a/nodedb-cluster/tests/decommission_flow.rs b/nodedb-cluster/tests/decommission_flow.rs new file mode 100644 index 00000000..ef317a9d --- /dev/null +++ b/nodedb-cluster/tests/decommission_flow.rs @@ -0,0 +1,153 @@ +//! End-to-end decommission flow. +//! +//! Wires every piece of the decommission subsystem together without +//! standing up a real metadata Raft group: +//! +//! 
- `CacheApplier::with_live_state` holds shared topology + routing. +//! - A direct in-memory `MetadataProposer` encodes each proposed +//! entry, feeds it straight into the applier with a synthetic +//! monotonically-increasing index, and returns the index — i.e. a +//! "propose and wait for commit" that is instantaneous. +//! - `DecommissionCoordinator` walks a `plan_full_decommission` +//! output through that proposer. +//! - `DecommissionObserver` watches the local topology for the +//! target's state transition and fires its shutdown watch. +//! +//! The real metadata Raft path is already exercised by +//! `metadata_replication.rs`; this test focuses on the decommission +//! state machine end to end: plan → propose → apply → live state +//! → observer signal. + +use std::net::SocketAddr; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; + +use nodedb_cluster::decommission::{ + DecommissionCoordinator, DecommissionObserver, MetadataProposer, plan_full_decommission, +}; +use nodedb_cluster::error::Result; +use nodedb_cluster::metadata_group::{CacheApplier, MetadataApplier, MetadataCache, encode_entry}; +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::topology::{ClusterTopology, NodeInfo, NodeState}; +use nodedb_cluster::{DecommissionRunResult, MetadataEntry}; + +/// In-memory proposer that encodes every entry and immediately feeds +/// it through an attached `CacheApplier`, returning a synthetic +/// monotonically-increasing index. This is the "one-node metadata +/// group" equivalent the test uses to drive the decommission +/// state machine end to end in a few hundred microseconds. 
+struct DirectProposer { + applier: Arc, + next_index: AtomicU64, + proposed: Mutex>, +} + +impl DirectProposer { + fn new(applier: Arc) -> Arc { + Arc::new(Self { + applier, + next_index: AtomicU64::new(1), + proposed: Mutex::new(Vec::new()), + }) + } +} + +#[async_trait] +impl MetadataProposer for DirectProposer { + async fn propose_and_wait(&self, entry: MetadataEntry) -> Result { + let idx = self.next_index.fetch_add(1, Ordering::SeqCst); + let bytes = encode_entry(&entry).expect("encode metadata entry"); + self.applier.apply(&[(idx, bytes)]); + self.proposed.lock().unwrap().push(entry); + Ok(idx) + } +} + +#[tokio::test] +async fn end_to_end_decommission_drains_node_and_signals_shutdown() { + // --- 3 active nodes, 4 groups, RF=3. Decommission node 3 + // while RF=2 is the surviving quorum target. + let mut topo = ClusterTopology::new(); + for (i, id) in [1u64, 2, 3].iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + topo.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + let topology = Arc::new(RwLock::new(topo)); + let mut rt = RoutingTable::uniform(4, &[1, 2, 3], 3); + // Make node 3 the leader of at least one group so the plan + // emits a LeadershipTransfer entry and the applier must handle + // it live. + rt.set_leader(0, 3); + rt.set_leader(1, 1); + rt.set_leader(2, 3); + rt.set_leader(3, 2); + let routing = Arc::new(RwLock::new(rt)); + + // --- Applier with live topology + routing cascading. + let cache = Arc::new(RwLock::new(MetadataCache::new())); + let applier = Arc::new( + CacheApplier::new(cache.clone()).with_live_state(topology.clone(), routing.clone()), + ); + let proposer = DirectProposer::new(applier.clone()); + + // --- Observer running on node 3 (the target). + let (observer, mut shutdown_rx) = + DecommissionObserver::new(topology.clone(), 3, Duration::from_millis(10)); + + // --- Build the plan from a snapshot of the live state. 
+ let plan = { + let t = topology.read().unwrap(); + let r = routing.read().unwrap(); + plan_full_decommission(3, &t, &r, 2).expect("plan") + }; + let plan_len = plan.entries.len(); + + // --- Drive the coordinator. + let coordinator = DecommissionCoordinator::new(plan, proposer.clone()); + let result: DecommissionRunResult = coordinator.run().await.expect("coordinator run"); + assert_eq!(result.node_id, 3); + assert_eq!(result.entries_committed, plan_len); + + // --- Assert live state now reflects the decommission outcome. + // + // Topology: node 3 is gone (final `Leave` entry removed it). + { + let t = topology.read().unwrap(); + assert!( + t.get_node(3).is_none(), + "node 3 should be removed from topology after Leave" + ); + // Node 1 and 2 still present and unchanged. + assert_eq!(t.get_node(1).unwrap().state, NodeState::Active); + assert_eq!(t.get_node(2).unwrap().state, NodeState::Active); + } + + // Routing: node 3 is no longer in any group's member set, and + // the groups it used to lead have had their leader hints + // updated via LeadershipTransfer. + { + let r = routing.read().unwrap(); + for (gid, info) in r.group_members() { + assert!( + !info.members.contains(&3), + "group {gid} still contains node 3 after decommission" + ); + assert!( + !info.learners.contains(&3), + "group {gid} still has node 3 as learner after decommission" + ); + } + // Group 0 was led by 3 → LeadershipTransfer emitted a new + // non-3 leader; group 2 likewise. + assert_ne!(r.group_info(0).unwrap().leader, 3); + assert_ne!(r.group_info(2).unwrap().leader, 3); + } + + // --- Observer must now fire its shutdown signal on the very + // next check — the topology change already landed. 
+    assert!(observer.check_once());
+    assert!(*shutdown_rx.borrow_and_update());
+}

From 794a7cab0e2ccdf99bbeabc6627427d0fe8f0191 Mon Sep 17 00:00:00 2001
From: Farhan Syah
Date: Thu, 16 Apr 2026 08:56:50 +0800
Subject: [PATCH 05/24] feat(cluster): add reachability-driven circuit-breaker
 recovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a `reachability` module that actively probes peers whose
circuit breakers are stuck in the Open state. Without periodic probes,
a breaker that opens due to a transient failure never transitions back
to HalfOpen because there is no outbound traffic to trigger a `check()`
call — the node stays blacklisted indefinitely.

New pieces:

- `reachability::prober` — `ReachabilityProber` trait with a
  `TransportProber` adapter (wraps the live `NexarTransport` and sends
  a `Ping` RPC) and a `NoopProber` for tests.
- `reachability::driver` — `ReachabilityDriver` runs a Tokio interval
  loop, calls `circuit_breaker.open_peers()` each tick, and fires one
  probe per stuck peer. A successful probe resets the breaker to
  Closed; a failed probe keeps it Open until the next interval.
  `ReachabilityDriverConfig` controls the probe interval.

Supporting changes:

- `circuit_breaker::open_peers()` — returns the ids of every peer
  currently in the Open state so the driver can discover them without
  coupling to breaker internals.
- `routing_liveness` — remove stale inline checklist from the module
  doc; the comment now describes what actually happens.
- `lib` — re-export public reachability surface.

Integration test `reachability_loop` opens peer 42's breaker via a
mock prober that starts out unhealthy, runs the driver, heals the
prober, and asserts the next sweep drives the breaker back to Closed
without any user traffic.
--- nodedb-cluster/src/circuit_breaker.rs | 20 ++ nodedb-cluster/src/reachability/driver.rs | 220 ++++++++++++++++++++++ nodedb-cluster/src/reachability/mod.rs | 22 +++ nodedb-cluster/src/reachability/prober.rs | 68 +++++++ nodedb-cluster/src/routing_liveness.rs | 13 +- nodedb-cluster/tests/reachability_loop.rs | 142 ++++++++++++++ 6 files changed, 476 insertions(+), 9 deletions(-) create mode 100644 nodedb-cluster/src/reachability/driver.rs create mode 100644 nodedb-cluster/src/reachability/mod.rs create mode 100644 nodedb-cluster/src/reachability/prober.rs create mode 100644 nodedb-cluster/tests/reachability_loop.rs diff --git a/nodedb-cluster/src/circuit_breaker.rs b/nodedb-cluster/src/circuit_breaker.rs index 3b02d4e5..3c5992f1 100644 --- a/nodedb-cluster/src/circuit_breaker.rs +++ b/nodedb-cluster/src/circuit_breaker.rs @@ -145,6 +145,26 @@ impl CircuitBreaker { .unwrap_or(CircuitState::Closed) } + /// Return the ids of every peer whose breaker is currently Open. + /// + /// Used by the reachability driver to find peers that need an + /// active probe — without a periodic poke these peers never + /// transition back to HalfOpen (no traffic → no `check()` call + /// → no cooldown re-evaluation). + pub fn open_peers(&self) -> Vec { + let peers = self.peers.read().unwrap_or_else(|p| p.into_inner()); + peers + .iter() + .filter_map(|(id, b)| { + if b.state == CircuitState::Open { + Some(*id) + } else { + None + } + }) + .collect() + } + /// Get consecutive failure count for a peer. pub fn failure_count(&self, peer: u64) -> u32 { let peers = self.peers.read().unwrap_or_else(|p| p.into_inner()); diff --git a/nodedb-cluster/src/reachability/driver.rs b/nodedb-cluster/src/reachability/driver.rs new file mode 100644 index 00000000..b677ba0c --- /dev/null +++ b/nodedb-cluster/src/reachability/driver.rs @@ -0,0 +1,220 @@ +//! [`ReachabilityDriver`] — periodic open-breaker probe loop. +//! +//! Every `interval`, the driver asks the shared [`CircuitBreaker`] +//! 
for its currently-Open peer set and fires a probe at each via the +//! injected [`ReachabilityProber`]. Probes run in parallel via +//! `tokio::spawn` so a slow peer never blocks the next one. Probe +//! results are intentionally ignored: the production `TransportProber` +//! routes through `NexarTransport::send_rpc`, which already walks the +//! circuit breaker's `check → record_success|record_failure` path, so +//! the driver does not need to bookkeep anything itself. +//! +//! Shutdown is cooperative via `tokio::sync::watch`. On `true` the +//! run loop breaks at the next tick or immediately if it is waiting. + +use std::sync::Arc; +use std::time::Duration; + +use tokio::sync::watch; +use tokio::time::{MissedTickBehavior, interval}; +use tracing::{debug, trace}; + +use crate::circuit_breaker::CircuitBreaker; + +use super::prober::ReachabilityProber; + +/// Configuration for the reachability driver. +#[derive(Debug, Clone)] +pub struct ReachabilityDriverConfig { + /// Period between open-peer sweeps. Defaults to 30 s in + /// production; tests override to milliseconds. + pub interval: Duration, +} + +impl Default for ReachabilityDriverConfig { + fn default() -> Self { + Self { + interval: Duration::from_secs(30), + } + } +} + +/// Drives periodic reachability probes against every Open-state peer. +pub struct ReachabilityDriver { + breaker: Arc, + prober: Arc, + cfg: ReachabilityDriverConfig, +} + +impl ReachabilityDriver { + pub fn new( + breaker: Arc, + prober: Arc, + cfg: ReachabilityDriverConfig, + ) -> Self { + Self { + breaker, + prober, + cfg, + } + } + + /// Run the driver until `shutdown` flips to `true`. + pub async fn run(self: Arc, mut shutdown: watch::Receiver) { + let mut tick = interval(self.cfg.interval); + // Skip the immediate first tick so the first probe fires one + // full interval after start. Otherwise every process restart + // would stampede every open breaker at once. 
+ tick.set_missed_tick_behavior(MissedTickBehavior::Delay); + tick.tick().await; + loop { + tokio::select! { + biased; + changed = shutdown.changed() => { + if changed.is_ok() && *shutdown.borrow() { + break; + } + } + _ = tick.tick() => { + self.sweep_once().await; + } + } + } + debug!("reachability driver shutting down"); + } + + /// Single sweep — exposed for tests that drive the loop manually. + pub async fn sweep_once(&self) { + let open = self.breaker.open_peers(); + if open.is_empty() { + return; + } + trace!(count = open.len(), "reachability sweep: probing open peers"); + for peer in open { + let prober = Arc::clone(&self.prober); + tokio::spawn(async move { + let _ = prober.probe(peer).await; + }); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::circuit_breaker::CircuitBreakerConfig; + use async_trait::async_trait; + use std::sync::Mutex; + + struct RecordingProber { + calls: Mutex>, + } + + impl RecordingProber { + fn new() -> Arc { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + }) + } + fn take(&self) -> Vec { + let mut g = self.calls.lock().unwrap(); + let out = g.clone(); + g.clear(); + out + } + } + + #[async_trait] + impl ReachabilityProber for RecordingProber { + async fn probe(&self, peer: u64) -> Result<(), crate::error::ClusterError> { + self.calls.lock().unwrap().push(peer); + Ok(()) + } + } + + fn open_breaker() -> Arc { + Arc::new(CircuitBreaker::new(CircuitBreakerConfig { + failure_threshold: 1, + cooldown: Duration::from_secs(60), + })) + } + + #[tokio::test] + async fn sweep_probes_every_open_peer() { + let breaker = open_breaker(); + breaker.record_failure(1); + breaker.record_failure(2); + breaker.record_failure(3); + + let prober = RecordingProber::new(); + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + prober.clone() as Arc, + ReachabilityDriverConfig { + interval: Duration::from_millis(50), + }, + )); + driver.sweep_once().await; + // Let spawned probe tasks run. 
+ for _ in 0..8 { + tokio::task::yield_now().await; + } + let mut calls = prober.take(); + calls.sort_unstable(); + assert_eq!(calls, vec![1, 2, 3]); + } + + #[tokio::test] + async fn sweep_skips_closed_peers() { + let breaker = open_breaker(); + breaker.record_success(1); // Registers 1 as Closed. + breaker.record_failure(2); // Opens 2. + let prober = RecordingProber::new(); + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + prober.clone() as Arc, + ReachabilityDriverConfig::default(), + )); + driver.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert_eq!(prober.take(), vec![2]); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_fires_sweeps_on_interval_and_shuts_down() { + let breaker = open_breaker(); + breaker.record_failure(7); + let prober = RecordingProber::new(); + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + prober.clone() as Arc, + ReachabilityDriverConfig { + interval: Duration::from_millis(100), + }, + )); + let (tx, rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&driver); + async move { d.run(rx).await } + }); + + // First tick is skipped, second delivers a sweep. + tokio::time::advance(Duration::from_millis(120)).await; + tokio::task::yield_now().await; + tokio::time::advance(Duration::from_millis(120)).await; + tokio::task::yield_now().await; + for _ in 0..16 { + tokio::task::yield_now().await; + } + + assert!( + !prober.take().is_empty(), + "driver never probed in run-loop mode" + ); + + let _ = tx.send(true); + let _ = tokio::time::timeout(Duration::from_millis(500), handle).await; + } +} diff --git a/nodedb-cluster/src/reachability/mod.rs b/nodedb-cluster/src/reachability/mod.rs new file mode 100644 index 00000000..6423b087 --- /dev/null +++ b/nodedb-cluster/src/reachability/mod.rs @@ -0,0 +1,22 @@ +//! Reachability driver — the active half of circuit-breaker recovery. +//! +//! 
`CircuitBreaker` transitions `Open → HalfOpen` only on the next +//! `check()` call. Without periodic traffic to an offline peer, that +//! check never happens and the breaker stays `Open` forever even after +//! the peer has recovered. This module closes that blind spot: +//! +//! - [`ReachabilityDriver`] periodically walks the breaker's open set +//! and sends a lightweight probe RPC to each peer via the existing +//! `send_rpc` path, which drives the normal HalfOpen → Closed / +//! HalfOpen → Open transitions. +//! - [`ReachabilityProber`] is the injection seam: production wraps +//! [`crate::transport::NexarTransport`], tests use a mock. +//! +//! The driver is shutdown-aware (watch channel) and bounded — one +//! probe per open peer per tick, fire-and-forget. + +pub mod driver; +pub mod prober; + +pub use driver::{ReachabilityDriver, ReachabilityDriverConfig}; +pub use prober::{NoopProber, ReachabilityProber, TransportProber}; diff --git a/nodedb-cluster/src/reachability/prober.rs b/nodedb-cluster/src/reachability/prober.rs new file mode 100644 index 00000000..47607e1d --- /dev/null +++ b/nodedb-cluster/src/reachability/prober.rs @@ -0,0 +1,68 @@ +//! [`ReachabilityProber`] — the injection seam for reachability probes. +//! +//! Implementations: +//! +//! - [`TransportProber`] wraps an `Arc` and sends a +//! `RaftRpc::Ping` to the peer. `send_rpc` already handles the +//! circuit-breaker check, the QUIC dial, retries, and +//! `record_success` / `record_failure` — the prober is a one-line +//! adapter. +//! - [`NoopProber`] always succeeds. Useful for tests that only want +//! to verify the loop's tick cadence and shutdown. +//! +//! Tests that want deterministic open→closed transitions construct +//! their own trait impls; see `tests/reachability_loop.rs`. 
+ +use std::sync::Arc; + +use async_trait::async_trait; + +use crate::error::Result; +use crate::rpc_codec::{PingRequest, RaftRpc}; +use crate::transport::NexarTransport; + +/// Abstract probe operation over a single peer. +#[async_trait] +pub trait ReachabilityProber: Send + Sync { + /// Send one probe to `peer`. Returns `Ok(())` iff the probe + /// completed successfully (implying the peer is reachable). + async fn probe(&self, peer: u64) -> Result<()>; +} + +/// Production prober: sends a `Ping` via the live transport. The +/// transport's internal circuit breaker records success/failure +/// automatically — the driver does not need to bookkeep anything. +pub struct TransportProber { + transport: Arc, + self_node_id: u64, +} + +impl TransportProber { + pub fn new(transport: Arc, self_node_id: u64) -> Self { + Self { + transport, + self_node_id, + } + } +} + +#[async_trait] +impl ReachabilityProber for TransportProber { + async fn probe(&self, peer: u64) -> Result<()> { + let rpc = RaftRpc::Ping(PingRequest { + sender_id: self.self_node_id, + topology_version: 0, + }); + self.transport.send_rpc(peer, rpc).await.map(|_| ()) + } +} + +/// Always-succeeds prober for cadence/shutdown tests. +pub struct NoopProber; + +#[async_trait] +impl ReachabilityProber for NoopProber { + async fn probe(&self, _peer: u64) -> Result<()> { + Ok(()) + } +} diff --git a/nodedb-cluster/src/routing_liveness.rs b/nodedb-cluster/src/routing_liveness.rs index 49cb22f1..3e98a3c6 100644 --- a/nodedb-cluster/src/routing_liveness.rs +++ b/nodedb-cluster/src/routing_liveness.rs @@ -1,4 +1,4 @@ -//! Liveness-driven routing invalidation (checklist item E.2). +//! Liveness-driven routing invalidation. //! //! [`RoutingLivenessHook`] is a [`MembershipSubscriber`] that clears //! the leader hint for every Raft group whose leaseholder has just @@ -6,14 +6,9 @@ //! detector. After the hook fires, the next query that consults the //! routing table observes `leader == 0` (the "no leader known" //! 
sentinel) and falls through to a fresh leader discovery via the
-//! existing NotLeader-triggered election path — which is exactly the
-//! behaviour the checklist requires:
-//!
-//! > 1. The routing cache invalidates all vShards whose leaseholder
-//! >    was that node.
-//! > 2. The next query against those vShards gets `NotLeader`,
-//! >    triggers a leader election, and updates the routing table.
-//! > 3. Clients see at most one retry.
+//! existing `NotLeader`-triggered election path. Clients see at most
+//! one retry: the stale hint, the failed dispatch, and a refreshed
+//! leader lookup.
 //!
 //! The hook is storage-agnostic: it holds `Arc>`
 //! and a resolver closure that maps the string-keyed SWIM `NodeId`
diff --git a/nodedb-cluster/tests/reachability_loop.rs b/nodedb-cluster/tests/reachability_loop.rs
new file mode 100644
index 00000000..9e0ab007
--- /dev/null
+++ b/nodedb-cluster/tests/reachability_loop.rs
@@ -0,0 +1,142 @@
+//! Reachability loop closes the circuit-breaker blind spot.
+//!
+//! Scenario: peer 42 starts out unreachable so its breaker opens.
+//! After a few failed sweeps the peer "recovers" (the mock prober
+//! flips from Err to Ok). The reachability driver must observe the
+//! next sweep as a success and drive the breaker back to `Closed`
+//! without any user traffic.
+
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::time::{Duration, Instant};
+
+use async_trait::async_trait;
+
+use nodedb_cluster::circuit_breaker::{CircuitBreaker, CircuitBreakerConfig, CircuitState};
+use nodedb_cluster::error::{ClusterError, Result};
+use nodedb_cluster::reachability::{
+    ReachabilityDriver, ReachabilityDriverConfig, ReachabilityProber,
+};
+use tokio::sync::watch;
+
+/// Mock prober whose success/failure can be flipped at runtime by
+/// the test. It starts out failing every probe; after the test
+/// calls `heal`, every subsequent probe succeeds.
+struct Flappy { + healthy: AtomicBool, +} + +impl Flappy { + fn new() -> Arc { + Arc::new(Self { + healthy: AtomicBool::new(false), + }) + } + fn heal(&self) { + self.healthy.store(true, Ordering::SeqCst); + } +} + +#[async_trait] +impl ReachabilityProber for Flappy { + async fn probe(&self, peer: u64) -> Result<()> { + if self.healthy.load(Ordering::SeqCst) { + Ok(()) + } else { + Err(ClusterError::Transport { + detail: format!("mock: peer {peer} unreachable"), + }) + } + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn reachability_loop_recovers_open_breaker_without_user_traffic() { + // --- Shared breaker, opened immediately for peer 42. --- + let breaker = Arc::new(CircuitBreaker::new(CircuitBreakerConfig { + failure_threshold: 1, + // Short cooldown so HalfOpen is eligible quickly — the + // driver still needs to drive the actual transition. + cooldown: Duration::from_millis(100), + })); + breaker.record_failure(42); + assert_eq!(breaker.state(42), CircuitState::Open); + + // --- Flappy prober starts "unhealthy". --- + let prober = Flappy::new(); + + // The driver's sweep_once calls probe() but does NOT itself + // drive record_success/record_failure — production relies on + // NexarTransport::send_rpc for that, and the mock has no such + // wrapper. So we install a relay closure that records the + // outcome against the breaker on the driver's behalf. This is + // the minimal glue needed to exercise the real loop end-to-end. + struct RelayProber { + inner: Arc, + breaker: Arc, + } + #[async_trait] + impl ReachabilityProber for RelayProber { + async fn probe(&self, peer: u64) -> Result<()> { + // Mirror send_rpc: check → probe → record outcome. 
+ if self.breaker.check(peer).is_err() { + return Err(ClusterError::CircuitOpen { + node_id: peer, + failures: self.breaker.failure_count(peer), + }); + } + match self.inner.probe(peer).await { + Ok(()) => { + self.breaker.record_success(peer); + Ok(()) + } + Err(e) => { + self.breaker.record_failure(peer); + Err(e) + } + } + } + } + let relay: Arc = Arc::new(RelayProber { + inner: prober.clone(), + breaker: Arc::clone(&breaker), + }); + + let driver = Arc::new(ReachabilityDriver::new( + Arc::clone(&breaker), + relay, + ReachabilityDriverConfig { + interval: Duration::from_millis(150), + }, + )); + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&driver); + async move { d.run(shutdown_rx).await } + }); + + // --- First few sweeps: probe keeps failing, breaker stays Open. --- + tokio::time::sleep(Duration::from_millis(500)).await; + assert_eq!( + breaker.state(42), + CircuitState::Open, + "breaker should stay open while peer is unhealthy" + ); + + // --- Heal the peer. Next sweep should drive Open → HalfOpen → Closed. --- + prober.heal(); + + let deadline = Instant::now() + Duration::from_secs(3); + loop { + if breaker.state(42) == CircuitState::Closed { + break; + } + if Instant::now() >= deadline { + panic!("breaker never recovered; state = {:?}", breaker.state(42)); + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + + let _ = shutdown_tx.send(true); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; +} From 256ebd3cd8e8b1f7109e971f91585f30cd1c3e7a Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 08:57:01 +0800 Subject: [PATCH 06/24] docs(cluster): remove stale phase-label references from swim comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SWIM implementation was built in incremental sub-batches whose labels (E-α, E-β, E-γ, …) were embedded in module docs, inline comments, and test file headers. 
Now that all layers are present the labels no longer help a reader navigate the code and are confusing when encountered out of context. Replace each reference with a plain description of what the module or code path actually does. --- nodedb-cluster/src/swim/detector/mod.rs | 4 ++-- nodedb-cluster/src/swim/detector/runner.rs | 6 +++--- nodedb-cluster/src/swim/member/record.rs | 2 +- nodedb-cluster/src/swim/membership/list.rs | 2 +- nodedb-cluster/src/swim/mod.rs | 21 ++++++++----------- nodedb-cluster/src/swim/wire/message.rs | 4 ++-- nodedb-cluster/src/swim/wire/probe.rs | 11 ++++------ nodedb-cluster/tests/common/mod.rs | 6 +++--- nodedb-cluster/tests/metadata_replication.rs | 6 +++--- .../tests/swim_routing_invalidation.rs | 2 +- 10 files changed, 29 insertions(+), 35 deletions(-) diff --git a/nodedb-cluster/src/swim/detector/mod.rs b/nodedb-cluster/src/swim/detector/mod.rs index 829d285a..5dccfc7b 100644 --- a/nodedb-cluster/src/swim/detector/mod.rs +++ b/nodedb-cluster/src/swim/detector/mod.rs @@ -4,8 +4,8 @@ //! probe scheduler, the suspicion timer, and the main `tokio::select!` //! loop. All actual networking is pushed behind the [`Transport`] trait //! so unit tests can run fully in-process against [`InMemoryTransport`] -//! and the real UDP transport in E-ε can slot in without touching the -//! detector logic. +//! while production uses [`UdpTransport`] — both slot into the same +//! detector without touching its logic. pub mod probe_round; pub mod runner; diff --git a/nodedb-cluster/src/swim/detector/runner.rs b/nodedb-cluster/src/swim/detector/runner.rs index df805320..ef16bb2b 100644 --- a/nodedb-cluster/src/swim/detector/runner.rs +++ b/nodedb-cluster/src/swim/detector/runner.rs @@ -319,9 +319,9 @@ impl FailureDetector { } /// Refute a self-suspect rumour by bumping local incarnation and - /// rebroadcasting `Alive`. 
E-γ exposes the handle so tests can - /// assert the behaviour; the dissemination queue in E-δ will call - /// this automatically from the piggyback ingestor. + /// rebroadcasting `Alive`. Exposed for tests that assert the + /// refutation machinery directly; the piggyback ingestor calls + /// the same underlying path automatically in production. #[cfg(test)] pub async fn bump_local_incarnation(&self, past: Incarnation) -> Incarnation { let mut guard = self.local_incarnation.lock().await; diff --git a/nodedb-cluster/src/swim/member/record.rs b/nodedb-cluster/src/swim/member/record.rs index 22bde368..8aa98653 100644 --- a/nodedb-cluster/src/swim/member/record.rs +++ b/nodedb-cluster/src/swim/member/record.rs @@ -55,7 +55,7 @@ impl Member { } /// Serializable subset of a `Member` — everything except the monotonic -/// instant. E-β will use this as the wire payload for membership deltas. +/// instant. Used as the wire payload for membership deltas. #[derive( Debug, Clone, diff --git a/nodedb-cluster/src/swim/membership/list.rs b/nodedb-cluster/src/swim/membership/list.rs index be2d975a..e2049625 100644 --- a/nodedb-cluster/src/swim/membership/list.rs +++ b/nodedb-cluster/src/swim/membership/list.rs @@ -117,7 +117,7 @@ impl MembershipList { } /// Apply a rumour to the table. Returns the merge outcome so the caller - /// can drive the dissemination queue (E-δ). On `SelfRefute`, the local + /// can drive the dissemination queue. On `SelfRefute`, the local /// record is updated in place to carry the bumped incarnation before /// returning, so the caller only needs to gossip the new record. pub fn apply(&self, update: &MemberUpdate) -> MergeOutcome { diff --git a/nodedb-cluster/src/swim/mod.rs b/nodedb-cluster/src/swim/mod.rs index 622e1fd9..0fa706e6 100644 --- a/nodedb-cluster/src/swim/mod.rs +++ b/nodedb-cluster/src/swim/mod.rs @@ -6,19 +6,16 @@ //! incarnation refutation, dedicated acks) used by modern systems such as //! Hashicorp memberlist and Cassandra's gossiper. 
//! -//! ## Layer map (Phase E) +//! ## Layer map //! -//! | Sub-batch | Contents | -//! |-----------|------------------------------------------------------------| -//! | **E-α** | Core types — `config`, `error`, `incarnation`, `member`, `membership` (this file's children) | -//! | E-β | Wire messages (`Ping`/`PingReq`/`Ack`/`Nack`) + zerompk codec | -//! | E-γ | Failure detector loop over an injected transport trait | -//! | E-δ | Piggyback dissemination queue + convergence tests | -//! | E-ε | Real UDP transport, bootstrap seeding, cluster integration | -//! -//! E-α is deliberately side-effect-free: no tasks, no I/O, no wire formats. -//! It exposes the pure data model — member states, incarnation numbers, and -//! the state-merge rule — that every later sub-batch builds on. +//! - `config`, `error`, `incarnation`, `member`, `membership` — pure +//! data model: states, incarnation numbers, and the merge rule. +//! - `wire` — `Ping` / `PingReq` / `Ack` / `Nack` datagrams + codec. +//! - `detector` — failure detector loop over a pluggable transport +//! trait, scheduler, suspicion timer, probe round machinery. +//! - `dissemination` — piggyback queue with `lambda * log(n)` fanout. +//! - `bootstrap` — one-stop `spawn` entry point. +//! - `subscriber` — hook trait fired on every membership transition. pub mod bootstrap; pub mod config; diff --git a/nodedb-cluster/src/swim/wire/message.rs b/nodedb-cluster/src/swim/wire/message.rs index da884b96..56d16636 100644 --- a/nodedb-cluster/src/swim/wire/message.rs +++ b/nodedb-cluster/src/swim/wire/message.rs @@ -31,7 +31,7 @@ pub enum SwimMessage { impl SwimMessage { /// Mutable borrow of the piggyback slot, independent of variant. - /// Used by the dissemination queue (E-δ) to stamp outgoing deltas + /// Used by the dissemination queue to stamp outgoing deltas /// without caring which message type it is stamping onto. 
pub fn piggyback_mut(&mut self) -> &mut Vec { match self { @@ -53,7 +53,7 @@ impl SwimMessage { } /// Drop piggyback entries beyond `max`. Used before encoding to keep - /// a datagram below the UDP MTU — the dissemination queue (E-δ) will + /// a datagram below the UDP MTU — the dissemination queue will /// decide which updates are highest-priority; this helper just /// enforces the upper bound. pub fn truncate_piggyback(&mut self, max: usize) { diff --git a/nodedb-cluster/src/swim/wire/probe.rs b/nodedb-cluster/src/swim/wire/probe.rs index 3a115019..d17b0373 100644 --- a/nodedb-cluster/src/swim/wire/probe.rs +++ b/nodedb-cluster/src/swim/wire/probe.rs @@ -1,9 +1,8 @@ //! SWIM probe message structs. //! -//! These are the four datagram types the failure detector exchanges over -//! the network once E-ε wires in a transport. They are pure data types -//! with `serde` derives — no I/O, no validation beyond what the type -//! system enforces. +//! These are the four datagram types the failure detector exchanges +//! over the network. They are pure data types with `serde` derives — +//! no I/O, no validation beyond what the type system enforces. //! //! ## Message flow (reference) //! @@ -21,9 +20,7 @@ //! ``` //! //! Every message carries a bounded `piggyback: Vec` slot -//! used for gossip-style dissemination of membership deltas (E-δ). The -//! wire format reserves the slot now so later sub-batches don't need a -//! compatibility break. +//! used for gossip-style dissemination of membership deltas. use nodedb_types::NodeId; use serde::{Deserialize, Serialize}; diff --git a/nodedb-cluster/tests/common/mod.rs b/nodedb-cluster/tests/common/mod.rs index b2c1d137..ea25a084 100644 --- a/nodedb-cluster/tests/common/mod.rs +++ b/nodedb-cluster/tests/common/mod.rs @@ -292,9 +292,9 @@ impl TestNode { } /// Number of committed `CatalogDdl` entries observed by this - /// node's cache applier. 
After batch 1e the cluster crate - /// treats catalog DDL payloads as opaque — this counter is - /// what tests assert on for replication correctness. + /// node's cache applier. The cluster crate treats catalog DDL + /// payloads as opaque — this counter is what tests assert on + /// for replication correctness. pub fn catalog_entries_applied(&self) -> u64 { self.metadata_cache .read() diff --git a/nodedb-cluster/tests/metadata_replication.rs b/nodedb-cluster/tests/metadata_replication.rs index 6ed9c58f..dd0133ee 100644 --- a/nodedb-cluster/tests/metadata_replication.rs +++ b/nodedb-cluster/tests/metadata_replication.rs @@ -1,8 +1,8 @@ //! Integration test: replicated metadata group commits + cache apply. //! -//! After batch 1e the `nodedb-cluster` crate no longer understands -//! per-DDL-object descriptor shapes — `CatalogDdl { payload }` is -//! opaque here. This test verifies the cluster-side plumbing +//! The `nodedb-cluster` crate does not understand per-DDL-object +//! descriptor shapes — `CatalogDdl { payload }` is opaque here. +//! This test verifies the cluster-side plumbing //! (raft commit + metadata applier dispatch + cache watermark) //! using synthetic opaque payloads. End-to-end cross-node DDL //! visibility (applier decoding + redb writeback + pgwire visibility) diff --git a/nodedb-cluster/tests/swim_routing_invalidation.rs b/nodedb-cluster/tests/swim_routing_invalidation.rs index f062d6f1..6dd67422 100644 --- a/nodedb-cluster/tests/swim_routing_invalidation.rs +++ b/nodedb-cluster/tests/swim_routing_invalidation.rs @@ -1,4 +1,4 @@ -//! E.2 — Liveness drives routing invalidation. +//! Liveness drives routing invalidation. //! //! Three UDP-backed SWIM nodes form a full mesh. A shared //! `RoutingTable` declares node B as the leader of group 0. 
A From e9472dcd0451c04eebf18b11c70bfdd959a18b6d Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 09:17:13 +0800 Subject: [PATCH 07/24] feat(cluster): add load-based rebalancer with metrics, planning, and driver loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces a new `rebalancer` module to the cluster crate that complements the existing overload-triggered scheduler with a storage-shape-driven rebalancing path (bytes + qps + vshard count). Key components: - `metrics`: `LoadMetrics`, `LoadMetricsProvider` trait, `LoadWeights`, and `normalized_score` for per-node pressure scoring - `plan`: `compute_load_based_plan` produces a bounded set of vshard moves from hottest to coldest nodes using configurable imbalance thresholds (`RebalancerPlanConfig`) - `driver`: `RebalancerLoop` orchestrates periodic plan evaluation and dispatches moves through the `MigrationDispatcher` trait; the `ElectionGate` trait lets callers gate sweeps on Raft leadership; `AlwaysReadyGate` is provided for tests All planning logic is pure and side-effect-free, making it fully unit-testable without spawning any Tokio tasks. An end-to-end integration test (`tests/rebalancer_loop.rs`) wires a `StaticProvider`, a `DirectDispatcher`, and `AlwaysReadyGate` to assert that a full plan → dispatch → routing-table-mutation cycle works correctly. 
--- nodedb-cluster/src/lib.rs | 6 + nodedb-cluster/src/rebalancer/driver.rs | 354 ++++++++++++++++++++++ nodedb-cluster/src/rebalancer/metrics.rs | 142 +++++++++ nodedb-cluster/src/rebalancer/mod.rs | 32 ++ nodedb-cluster/src/rebalancer/plan.rs | 365 +++++++++++++++++++++++ nodedb-cluster/tests/rebalancer_loop.rs | 174 +++++++++++ 6 files changed, 1073 insertions(+) create mode 100644 nodedb-cluster/src/rebalancer/driver.rs create mode 100644 nodedb-cluster/src/rebalancer/metrics.rs create mode 100644 nodedb-cluster/src/rebalancer/mod.rs create mode 100644 nodedb-cluster/src/rebalancer/plan.rs create mode 100644 nodedb-cluster/tests/rebalancer_loop.rs diff --git a/nodedb-cluster/src/lib.rs b/nodedb-cluster/src/lib.rs index e38e1591..83834e21 100644 --- a/nodedb-cluster/src/lib.rs +++ b/nodedb-cluster/src/lib.rs @@ -30,6 +30,7 @@ pub mod reachability; pub mod readiness; pub mod rebalance; pub mod rebalance_scheduler; +pub mod rebalancer; pub mod routing; pub mod routing_liveness; pub mod rpc_codec; @@ -65,6 +66,11 @@ pub use reachability::{ NoopProber, ReachabilityDriver, ReachabilityDriverConfig, ReachabilityProber, TransportProber, }; pub use rebalance::{RebalancePlan, compute_plan, plan_to_requests}; +pub use rebalancer::{ + AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, LoadWeights, + MigrationDispatcher, RebalancerLoop, RebalancerLoopConfig, RebalancerPlanConfig, + compute_load_based_plan, normalized_score, +}; pub use routing::RoutingTable; pub use routing_liveness::{NodeIdResolver, RoutingLivenessHook}; pub use rpc_codec::RaftRpc; diff --git a/nodedb-cluster/src/rebalancer/driver.rs b/nodedb-cluster/src/rebalancer/driver.rs new file mode 100644 index 00000000..6b5289bd --- /dev/null +++ b/nodedb-cluster/src/rebalancer/driver.rs @@ -0,0 +1,354 @@ +//! Rebalancer driver loop. +//! +//! [`RebalancerLoop`] is the active half of the load-based rebalancer. +//! Every `interval` it walks this sequence: +//! +//! 1. 
Ask the injected `ElectionGate` whether any raft group is +//! currently mid-election. If so, skip this tick entirely — +//! moves during an election race with the new leader's log and +//! are almost guaranteed to be wasted work. +//! 2. Ask the injected [`LoadMetricsProvider`] for a snapshot of +//! every node's current load metrics. +//! 3. Call [`compute_load_based_plan`] against the live routing + +//! topology with the configured plan config. If the plan is +//! empty (cluster within threshold, or no cold candidates), do +//! nothing. +//! 4. Dispatch each planned move through the injected +//! [`MigrationDispatcher`], fire-and-forget. The dispatcher is +//! where the bridge to the production `MigrationExecutor` lives +//! — tests use a mock that records the calls. +//! +//! The loop holds no state of its own; the dispatcher tracks +//! in-flight work and the breaker/scheduler state is on the +//! underlying subsystems. This keeps the driver trivially +//! restartable: crash mid-tick, respawn, resume. + +use std::sync::{Arc, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; +use tokio::sync::watch; +use tokio::time::{MissedTickBehavior, interval}; +use tracing::{debug, info, warn}; + +use crate::error::Result; +use crate::rebalance::PlannedMove; +use crate::routing::RoutingTable; +use crate::topology::ClusterTopology; + +use super::metrics::LoadMetricsProvider; +use super::plan::{RebalancerPlanConfig, compute_load_based_plan}; + +/// Injection seam: tells the driver whether it's safe to dispatch +/// moves. Production wraps a `MultiRaft` status probe; tests return +/// a constant boolean. +#[async_trait] +pub trait ElectionGate: Send + Sync { + /// Return `true` if **any** raft group is currently holding an + /// election (no stable leader). The driver skips its tick when + /// this is `true`. + async fn any_group_electing(&self) -> bool; +} + +/// Permissive gate that never blocks the driver. 
Useful in tests +/// and in single-node clusters where elections are instantaneous. +pub struct AlwaysReadyGate; + +#[async_trait] +impl ElectionGate for AlwaysReadyGate { + async fn any_group_electing(&self) -> bool { + false + } +} + +/// Injection seam: executes a single planned move. Production +/// wraps `MigrationExecutor::execute` and reports success/failure +/// via logging + the tracker; tests record the move. +#[async_trait] +pub trait MigrationDispatcher: Send + Sync { + async fn dispatch(&self, mv: PlannedMove) -> Result<()>; +} + +/// Configuration for [`RebalancerLoop`]. +#[derive(Debug, Clone)] +pub struct RebalancerLoopConfig { + /// Period between rebalance sweeps. Defaults to 30 s. + pub interval: Duration, + /// Plan computation config propagated to + /// [`compute_load_based_plan`] on every tick. + pub plan: RebalancerPlanConfig, +} + +impl Default for RebalancerLoopConfig { + fn default() -> Self { + Self { + interval: Duration::from_secs(30), + plan: RebalancerPlanConfig::default(), + } + } +} + +/// The driver itself. +pub struct RebalancerLoop { + cfg: RebalancerLoopConfig, + metrics: Arc, + dispatcher: Arc, + gate: Arc, + routing: Arc>, + topology: Arc>, +} + +impl RebalancerLoop { + pub fn new( + cfg: RebalancerLoopConfig, + metrics: Arc, + dispatcher: Arc, + gate: Arc, + routing: Arc>, + topology: Arc>, + ) -> Self { + Self { + cfg, + metrics, + dispatcher, + gate, + routing, + topology, + } + } + + /// Run the driver until `shutdown` flips to `true`. + pub async fn run(self: Arc, mut shutdown: watch::Receiver) { + let mut tick = interval(self.cfg.interval); + tick.set_missed_tick_behavior(MissedTickBehavior::Delay); + // Consume the immediate first tick so the first sweep fires + // a full interval after start. Prevents start-up stampedes + // when many nodes restart together. + tick.tick().await; + loop { + tokio::select! 
{ + biased; + changed = shutdown.changed() => { + if changed.is_ok() && *shutdown.borrow() { + break; + } + } + _ = tick.tick() => { + self.sweep_once().await; + } + } + } + debug!("rebalancer loop shutting down"); + } + + /// Run a single sweep. Exposed for tests that drive the loop + /// manually rather than through `run`. + pub async fn sweep_once(&self) { + if self.gate.any_group_electing().await { + debug!("rebalancer: raft election in progress, skipping tick"); + return; + } + let metrics = match self.metrics.snapshot().await { + Ok(m) => m, + Err(e) => { + warn!(error = %e, "rebalancer: failed to collect metrics"); + return; + } + }; + let plan = { + let routing = self.routing.read().unwrap_or_else(|p| p.into_inner()); + let topo = self.topology.read().unwrap_or_else(|p| p.into_inner()); + compute_load_based_plan(&metrics, &routing, &topo, &self.cfg.plan) + }; + if plan.is_empty() { + debug!("rebalancer: no moves needed this tick"); + return; + } + info!( + move_count = plan.len(), + "rebalancer: dispatching planned moves" + ); + for mv in plan { + let dispatcher = Arc::clone(&self.dispatcher); + tokio::spawn(async move { + if let Err(e) = dispatcher.dispatch(mv).await { + warn!(error = %e, "rebalancer: dispatch failed"); + } + }); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::rebalancer::metrics::LoadMetrics; + use crate::topology::{NodeInfo, NodeState}; + use std::net::SocketAddr; + use std::sync::Mutex; + + struct StaticMetrics(Vec); + + #[async_trait] + impl LoadMetricsProvider for StaticMetrics { + async fn snapshot(&self) -> Result> { + Ok(self.0.clone()) + } + } + + struct RecordingDispatcher { + calls: Mutex>, + } + + impl RecordingDispatcher { + fn new() -> Arc { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + }) + } + fn take(&self) -> Vec { + let mut g = self.calls.lock().unwrap(); + let out = g.clone(); + g.clear(); + out + } + } + + #[async_trait] + impl MigrationDispatcher for RecordingDispatcher { + async fn 
dispatch(&self, mv: PlannedMove) -> Result<()> { + self.calls.lock().unwrap().push(mv); + Ok(()) + } + } + + struct BlockingGate(bool); + + #[async_trait] + impl ElectionGate for BlockingGate { + async fn any_group_electing(&self) -> bool { + self.0 + } + } + + fn topo(nodes: &[u64]) -> Arc> { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + Arc::new(RwLock::new(t)) + } + + fn routing_hot_on(node: u64) -> Arc> { + let mut r = RoutingTable::uniform(6, &[1, 2, 3], 1); + for gid in 0..6 { + r.set_leader(gid, node); + } + Arc::new(RwLock::new(r)) + } + + fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + } + } + + fn hot_cluster_loop( + gate: Arc, + ) -> (Arc, Arc) { + let metrics: Arc = Arc::new(StaticMetrics(vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ])); + let dispatcher = RecordingDispatcher::new(); + let disp_dyn: Arc = dispatcher.clone(); + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_millis(50), + plan: RebalancerPlanConfig::default(), + }, + metrics, + disp_dyn, + gate, + routing_hot_on(1), + topo(&[1, 2, 3]), + )); + (rloop, dispatcher) + } + + #[tokio::test] + async fn sweep_dispatches_moves_when_imbalanced() { + let (rloop, dispatcher) = hot_cluster_loop(Arc::new(AlwaysReadyGate)); + rloop.sweep_once().await; + for _ in 0..16 { + tokio::task::yield_now().await; + } + let calls = dispatcher.take(); + assert!(!calls.is_empty()); + for c in &calls { + assert_eq!(c.source_node, 1); + } + } + + #[tokio::test] + async fn sweep_skipped_during_election() { + let (rloop, dispatcher) = hot_cluster_loop(Arc::new(BlockingGate(true))); + 
rloop.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert!(dispatcher.take().is_empty()); + } + + #[tokio::test] + async fn sweep_noop_on_balanced_cluster() { + let metrics: Arc = Arc::new(StaticMetrics(vec![ + lm(1, 50, 500, 100.0, 100.0), + lm(2, 50, 500, 100.0, 100.0), + lm(3, 50, 500, 100.0, 100.0), + ])); + let dispatcher = RecordingDispatcher::new(); + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig::default(), + metrics, + dispatcher.clone() as Arc, + Arc::new(AlwaysReadyGate), + routing_hot_on(1), + topo(&[1, 2, 3]), + )); + rloop.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert!(dispatcher.take().is_empty()); + } + + #[tokio::test(start_paused = true)] + async fn run_loop_fires_sweeps_and_shuts_down() { + let (rloop, dispatcher) = hot_cluster_loop(Arc::new(AlwaysReadyGate)); + let (tx, rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&rloop); + async move { d.run(rx).await } + }); + // First tick consumed immediately by run(); advance past a + // couple of real intervals with interleaved yields so the + // run-loop's select + spawned dispatch tasks all get to poll. + for _ in 0..4 { + tokio::time::advance(Duration::from_millis(80)).await; + for _ in 0..16 { + tokio::task::yield_now().await; + } + } + assert!(!dispatcher.take().is_empty()); + + let _ = tx.send(true); + let _ = tokio::time::timeout(Duration::from_millis(500), handle).await; + } +} diff --git a/nodedb-cluster/src/rebalancer/metrics.rs b/nodedb-cluster/src/rebalancer/metrics.rs new file mode 100644 index 00000000..0a03b894 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/metrics.rs @@ -0,0 +1,142 @@ +//! Per-node load metrics and scoring. +//! +//! `LoadMetrics` is the raw per-node observation the rebalancer loop +//! consumes. `normalized_score` folds a `LoadMetrics` plus a set of +//! `LoadWeights` into a single `f64` so different nodes can be +//! 
compared on one axis — the hotter the score, the more work the +//! node is doing relative to the cluster. +//! +//! Weights are configurable because different workloads care about +//! different dimensions: a write-heavy OLTP cluster wants high +//! `writes` weight, an analytical cluster wants high `bytes` +//! weight, and a very uniform vshard layout wants high `vshards` +//! weight. The defaults (1.0 each) are a balanced starting point. + +use async_trait::async_trait; + +use crate::error::Result; + +/// Raw load observation for a single node. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct LoadMetrics { + pub node_id: u64, + /// Count of vshards this node is currently leading. + pub vshards_led: u32, + /// Total bytes stored across all vshards on this node. + pub bytes_stored: u64, + /// Writes per second (rolling average, caller-defined window). + pub writes_per_sec: f64, + /// Reads per second (rolling average, caller-defined window). + pub reads_per_sec: f64, +} + +/// Relative weights for the four load dimensions. Scaled linearly; +/// the absolute values don't matter, only their ratios. +#[derive(Debug, Clone, Copy)] +pub struct LoadWeights { + pub vshards: f64, + pub bytes: f64, + pub writes: f64, + pub reads: f64, +} + +impl Default for LoadWeights { + fn default() -> Self { + Self { + vshards: 1.0, + bytes: 1.0, + writes: 1.0, + reads: 1.0, + } + } +} + +/// Collapse a `LoadMetrics` observation into a single scalar score +/// using `weights`. Higher = hotter. +/// +/// The implementation is a straightforward weighted sum — each field +/// is scaled by its weight and added. Bytes are divided by a +/// reasonable unit (1 MiB) so the float stays in a comparable range +/// to the per-second rates; otherwise a moderately-sized dataset +/// would swamp the qps signal entirely. 
+pub fn normalized_score(m: &LoadMetrics, weights: &LoadWeights) -> f64 { + const BYTES_UNIT: f64 = 1_048_576.0; // 1 MiB + weights.vshards * m.vshards_led as f64 + + weights.bytes * (m.bytes_stored as f64 / BYTES_UNIT) + + weights.writes * m.writes_per_sec + + weights.reads * m.reads_per_sec +} + +/// Injection seam for collecting load metrics from every node in the +/// cluster. Production impls talk to the metrics endpoint via the +/// transport; tests inject synthetic values. +#[async_trait] +pub trait LoadMetricsProvider: Send + Sync { + /// Return a snapshot of every known node's current load metrics. + /// The returned slice may be in any order — the rebalancer plan + /// sorts internally for determinism. + async fn snapshot(&self) -> Result>; +} + +#[cfg(test)] +mod tests { + use super::*; + + fn m(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + } + } + + #[test] + fn default_weights_are_uniform() { + let w = LoadWeights::default(); + assert_eq!(w.vshards, 1.0); + assert_eq!(w.bytes, 1.0); + assert_eq!(w.writes, 1.0); + assert_eq!(w.reads, 1.0); + } + + #[test] + fn zero_metrics_score_zero() { + let metrics = m(1, 0, 0, 0.0, 0.0); + assert_eq!(normalized_score(&metrics, &LoadWeights::default()), 0.0); + } + + #[test] + fn score_sums_all_dimensions_with_default_weights() { + // 4 vshards + 8 MiB + 2 wps + 3 rps = 17.0 + let metrics = m(1, 4, 8, 2.0, 3.0); + let score = normalized_score(&metrics, &LoadWeights::default()); + assert!((score - 17.0).abs() < 1e-9); + } + + #[test] + fn weights_scale_dimensions_independently() { + let metrics = m(1, 10, 0, 0.0, 0.0); + let w = LoadWeights { + vshards: 5.0, + ..Default::default() + }; + assert!((normalized_score(&metrics, &w) - 50.0).abs() < 1e-9); + } + + #[test] + fn hotter_node_has_higher_score() { + let cold = m(1, 1, 1, 1.0, 1.0); + let hot = m(2, 10, 100, 100.0, 
100.0);
+        let w = LoadWeights::default();
+        assert!(normalized_score(&hot, &w) > normalized_score(&cold, &w));
+    }
+
+    #[test]
+    fn bytes_scale_via_mib_unit() {
+        // 1 MiB with bytes weight = 1.0 contributes 1.0, not 1_048_576.
+        let metrics = m(1, 0, 1, 0.0, 0.0);
+        assert!((normalized_score(&metrics, &LoadWeights::default()) - 1.0).abs() < 1e-9);
+    }
+}
diff --git a/nodedb-cluster/src/rebalancer/mod.rs b/nodedb-cluster/src/rebalancer/mod.rs
new file mode 100644
index 00000000..4b687c69
--- /dev/null
+++ b/nodedb-cluster/src/rebalancer/mod.rs
@@ -0,0 +1,32 @@
+//! Load-based automatic rebalancer.
+//!
+//! This module is the *signal* side of the rebalancer: given a
+//! per-node snapshot of load metrics (vshards led, bytes stored,
+//! writes/sec, reads/sec) it computes whether the cluster is
+//! imbalanced enough to warrant moves, and emits a bounded plan of
+//! vshard migrations from the hottest nodes to the coldest ones.
+//!
+//! The driver loop (`driver.rs`) dispatches that plan through the
+//! `MigrationDispatcher` seam. The planning half (`metrics.rs`,
+//! `plan.rs`) is pure, side-effect-free, and fully deterministic
+//! so it can be unit-tested exhaustively before any tokio task is
+//! spawned against it.
+//!
+//! ## Why a new module
+//!
+//! The existing [`crate::rebalance_scheduler::RebalanceScheduler`]
+//! triggers on CPU utilization, SPSC queue pressure, and shard-count
+//! ratio. Those are fast-path overload signals and belong where they
+//! are. This module is a distinct, storage-shape-driven rebalancer
+//! (bytes + qps + vshard count) with bounded in-flight moves and a
+//! 30 s cadence, complementing the overload path. 
+ +pub mod driver; +pub mod metrics; +pub mod plan; + +pub use driver::{ + AlwaysReadyGate, ElectionGate, MigrationDispatcher, RebalancerLoop, RebalancerLoopConfig, +}; +pub use metrics::{LoadMetrics, LoadMetricsProvider, LoadWeights, normalized_score}; +pub use plan::{RebalancerPlanConfig, compute_load_based_plan}; diff --git a/nodedb-cluster/src/rebalancer/plan.rs b/nodedb-cluster/src/rebalancer/plan.rs new file mode 100644 index 00000000..5620e240 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/plan.rs @@ -0,0 +1,365 @@ +//! Load-imbalance plan computation. +//! +//! Given a snapshot of per-node `LoadMetrics` and the current routing +//! table, decide whether the cluster is imbalanced enough to justify +//! moves and, if so, emit a bounded list of `PlannedMove`s from the +//! hottest nodes to the coldest ones. +//! +//! ## Trigger +//! +//! The rebalancer fires when, after normalizing every node's score: +//! +//! > `max - min > threshold_pct / 100 * mean` +//! +//! ...i.e. the hottest node is more than `threshold_pct`% above the +//! cluster mean relative to the coldest one. This is intentionally +//! not a per-node check: single-hot-node scenarios below the +//! cluster mean delta are handled by the separate +//! `rebalance_scheduler` CPU/queue triggers. +//! +//! ## Move selection +//! +//! For each hot→cold pair, the planner walks the routing table in +//! stable (sorted by group_id, then vshard_id) order and picks +//! vshards the hot node is currently leading. It caps moves at +//! `max_moves_per_group` moves from any single group (so one +//! over-replicated group can't consume the entire in-flight budget) +//! and at `max_moves_total` across the whole plan (so the dispatcher +//! never has more than that many migrations in flight at once). +//! +//! Determinism: the plan is deterministic given the same inputs, +//! including tie-breaks. Two nodes computing the plan at the same +//! instant produce byte-identical outputs. 
+ +use std::collections::HashMap; + +use tracing::debug; + +use crate::rebalance::PlannedMove; +use crate::routing::RoutingTable; +use crate::topology::ClusterTopology; + +use super::metrics::{LoadMetrics, LoadWeights, normalized_score}; + +/// Configuration for [`compute_load_based_plan`]. +#[derive(Debug, Clone)] +pub struct RebalancerPlanConfig { + /// If `(max - min) > (threshold_pct / 100) * mean`, we plan moves. + /// Default: 20%. + pub imbalance_threshold_pct: u8, + /// Maximum moves from any single Raft group per plan. Default 1. + pub max_moves_per_group: usize, + /// Maximum moves in the entire plan. Default 10. + pub max_moves_total: usize, + /// Weights applied to the load dimensions when scoring. + pub weights: LoadWeights, +} + +impl Default for RebalancerPlanConfig { + fn default() -> Self { + Self { + imbalance_threshold_pct: 20, + max_moves_per_group: 1, + max_moves_total: 10, + weights: LoadWeights::default(), + } + } +} + +/// Compute a load-driven rebalance plan. Returns an empty vector if +/// the cluster is already within the imbalance threshold or if there +/// are fewer than two nodes to compare. +pub fn compute_load_based_plan( + metrics: &[LoadMetrics], + routing: &RoutingTable, + topology: &ClusterTopology, + cfg: &RebalancerPlanConfig, +) -> Vec { + if metrics.len() < 2 { + return Vec::new(); + } + + // Score every node, then sort ascending so the hot list and cold + // list are natural slices. `f64` isn't Ord, so use total_cmp for + // NaN-free deterministic ordering. + let mut scored: Vec<(u64, f64)> = metrics + .iter() + .map(|m| (m.node_id, normalized_score(m, &cfg.weights))) + .collect(); + scored.sort_by(|a, b| a.1.total_cmp(&b.1).then_with(|| a.0.cmp(&b.0))); + + let min = scored.first().map(|(_, s)| *s).unwrap_or(0.0); + let max = scored.last().map(|(_, s)| *s).unwrap_or(0.0); + let mean: f64 = scored.iter().map(|(_, s)| *s).sum::() / scored.len() as f64; + + // Imbalance gate. 
A zero-mean cluster (everything idle) is + // considered already balanced — nothing to move. + if mean <= 0.0 { + return Vec::new(); + } + let threshold = (cfg.imbalance_threshold_pct as f64 / 100.0) * mean; + if (max - min) <= threshold { + debug!( + max, + min, mean, threshold, "rebalancer: cluster within imbalance threshold" + ); + return Vec::new(); + } + + // Only Active nodes are valid migration targets. Cold candidates + // must be Active and must not already be the source for a move. + let active_set: std::collections::HashSet = + topology.active_nodes().iter().map(|n| n.node_id).collect(); + + // Hot = strictly above mean; cold = strictly below mean. Using + // the mean as the split point (rather than index-based halving) + // correctly handles asymmetric distributions where a single + // outlier pulls one node above an otherwise balanced cluster — + // the below-mean nodes stay in the cold set even if they tie + // with each other. + let hot_nodes: Vec = scored + .iter() + .rev() // hottest first + .filter(|(_, s)| *s > mean) + .map(|(id, _)| *id) + .collect(); + let cold_nodes: Vec = scored + .iter() + .filter(|(_, s)| *s < mean) + .filter(|(id, _)| active_set.contains(id)) + .map(|(id, _)| *id) + .collect(); + + if cold_nodes.is_empty() { + return Vec::new(); + } + + // Walk routing in stable order — group id ascending, then vshard + // id ascending — and pick moves until we hit the caps. 
+ let mut group_ids: Vec = routing.group_members().keys().copied().collect(); + group_ids.sort_unstable(); + + let mut moves: Vec = Vec::new(); + let mut per_group_count: HashMap = HashMap::new(); + let mut cold_cursor = 0usize; + + 'outer: for hot in &hot_nodes { + if !active_set.contains(hot) { + continue; + } + for &gid in &group_ids { + if moves.len() >= cfg.max_moves_total { + break 'outer; + } + let info = match routing.group_info(gid) { + Some(i) => i, + None => continue, + }; + if info.leader != *hot { + continue; + } + if *per_group_count.get(&gid).unwrap_or(&0) >= cfg.max_moves_per_group { + continue; + } + // Pick the group's lowest vshard id deterministically. + let mut vshards = routing.vshards_for_group(gid); + vshards.sort_unstable(); + let Some(&vshard_id) = vshards.first() else { + continue; + }; + let target = cold_nodes[cold_cursor % cold_nodes.len()]; + if target == *hot { + continue; + } + moves.push(PlannedMove { + vshard_id, + source_node: *hot, + target_node: target, + source_group: gid, + }); + *per_group_count.entry(gid).or_default() += 1; + cold_cursor += 1; + } + } + + moves +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::{NodeInfo, NodeState}; + use std::net::SocketAddr; + + fn topo(nodes: &[u64]) -> ClusterTopology { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let a: SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + t + } + + fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + } + } + + #[test] + fn empty_metrics_returns_empty_plan() { + let t = topo(&[1, 2]); + let r = RoutingTable::uniform(2, &[1, 2], 1); + let plan = compute_load_based_plan(&[], &r, &t, &RebalancerPlanConfig::default()); + assert!(plan.is_empty()); + } + + #[test] + fn 
single_node_returns_empty_plan() { + let t = topo(&[1]); + let r = RoutingTable::uniform(1, &[1], 1); + let plan = compute_load_based_plan( + &[lm(1, 100, 100, 100.0, 100.0)], + &r, + &t, + &RebalancerPlanConfig::default(), + ); + assert!(plan.is_empty()); + } + + #[test] + fn balanced_cluster_no_moves() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(3, &[1, 2, 3], 1); + let metrics = vec![ + lm(1, 10, 100, 50.0, 50.0), + lm(2, 10, 100, 50.0, 50.0), + lm(3, 10, 100, 50.0, 50.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + assert!(plan.is_empty()); + } + + #[test] + fn imbalance_above_threshold_triggers_moves() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(6, &[1, 2, 3], 1); + // Node 1 massively overloaded. + let metrics = vec![ + lm(1, 200, 1000, 500.0, 500.0), + lm(2, 10, 50, 25.0, 25.0), + lm(3, 10, 50, 25.0, 25.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + assert!(!plan.is_empty()); + // Every move must source from node 1. + for m in &plan { + assert_eq!(m.source_node, 1); + } + } + + #[test] + fn plan_respects_max_moves_total() { + let t = topo(&[1, 2]); + // 20 groups so node 1 can lead many. 
+ let mut r = RoutingTable::uniform(20, &[1, 2], 1); + for gid in 0..20 { + r.set_leader(gid, 1); + } + let metrics = vec![lm(1, 2000, 10_000, 5000.0, 5000.0), lm(2, 1, 1, 1.0, 1.0)]; + let cfg = RebalancerPlanConfig { + max_moves_total: 4, + max_moves_per_group: 1, + ..Default::default() + }; + let plan = compute_load_based_plan(&metrics, &r, &t, &cfg); + assert_eq!(plan.len(), 4); + } + + #[test] + fn plan_respects_max_moves_per_group() { + let t = topo(&[1, 2]); + let mut r = RoutingTable::uniform(3, &[1, 2], 1); + for gid in 0..3 { + r.set_leader(gid, 1); + } + let metrics = vec![lm(1, 2000, 10_000, 5000.0, 5000.0), lm(2, 1, 1, 1.0, 1.0)]; + let cfg = RebalancerPlanConfig { + max_moves_total: 99, + max_moves_per_group: 1, + ..Default::default() + }; + let plan = compute_load_based_plan(&metrics, &r, &t, &cfg); + // With max_moves_per_group=1 and 3 groups, at most 3 moves. + assert!(plan.len() <= 3); + let mut by_group: HashMap = HashMap::new(); + for m in &plan { + *by_group.entry(m.source_group).or_default() += 1; + } + for (_, count) in by_group { + assert!(count <= 1); + } + } + + #[test] + fn plan_is_deterministic() { + let t = topo(&[1, 2, 3]); + let mut r = RoutingTable::uniform(6, &[1, 2, 3], 1); + for gid in 0..6 { + r.set_leader(gid, 1); + } + let metrics = vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ]; + let cfg = RebalancerPlanConfig::default(); + let p1 = compute_load_based_plan(&metrics, &r, &t, &cfg); + let p2 = compute_load_based_plan(&metrics, &r, &t, &cfg); + let p1_tuples: Vec<_> = p1 + .iter() + .map(|m| (m.vshard_id, m.source_node, m.target_node, m.source_group)) + .collect(); + let p2_tuples: Vec<_> = p2 + .iter() + .map(|m| (m.vshard_id, m.source_node, m.target_node, m.source_group)) + .collect(); + assert_eq!(p1_tuples, p2_tuples); + } + + #[test] + fn idle_cluster_never_triggers() { + let t = topo(&[1, 2, 3]); + let r = RoutingTable::uniform(3, &[1, 2, 3], 1); + let metrics = vec![ + lm(1, 
0, 0, 0.0, 0.0), + lm(2, 0, 0, 0.0, 0.0), + lm(3, 0, 0, 0.0, 0.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + assert!(plan.is_empty()); + } + + #[test] + fn cold_node_must_be_active() { + // Node 3 is not Active (it's Draining) → cannot receive. + let mut t = topo(&[1, 2, 3]); + t.set_state(3, NodeState::Draining); + let mut r = RoutingTable::uniform(2, &[1, 2, 3], 1); + r.set_leader(0, 1); + r.set_leader(1, 1); + let metrics = vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 0, 0, 0.0, 0.0), + ]; + let plan = compute_load_based_plan(&metrics, &r, &t, &RebalancerPlanConfig::default()); + for m in &plan { + assert_ne!(m.target_node, 3, "Draining node must not receive moves"); + } + } +} diff --git a/nodedb-cluster/tests/rebalancer_loop.rs b/nodedb-cluster/tests/rebalancer_loop.rs new file mode 100644 index 00000000..5d7193a8 --- /dev/null +++ b/nodedb-cluster/tests/rebalancer_loop.rs @@ -0,0 +1,174 @@ +//! End-to-end rebalancer driver loop. +//! +//! Wires every piece of the rebalancer together without standing up +//! the real `MigrationExecutor`: +//! +//! - A shared `Arc>` + `Arc>`. +//! - A `StaticProvider` returning a canned set of `LoadMetrics` so +//! node 1 is massively hotter than nodes 2 and 3. +//! - A `DirectDispatcher` that simulates instantaneous migration +//! completion by reassigning the vshard's group leader in the +//! live routing table and recording the call for assertions. +//! - An `AlwaysReadyGate` — no election gating in this synthetic +//! scenario. +//! +//! The test spawns the loop, advances through one sweep, asserts +//! the dispatcher observed moves exclusively from node 1 as source, +//! and asserts the routing table was actually mutated — proving the +//! full plan → dispatch → apply chain. 
+ +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; +use tokio::sync::watch; + +use nodedb_cluster::error::Result; +use nodedb_cluster::rebalance::PlannedMove; +use nodedb_cluster::rebalancer::{ + AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, MigrationDispatcher, + RebalancerLoop, RebalancerLoopConfig, RebalancerPlanConfig, +}; +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::topology::{ClusterTopology, NodeInfo, NodeState}; + +struct StaticProvider(Vec<LoadMetrics>); + +#[async_trait] +impl LoadMetricsProvider for StaticProvider { + async fn snapshot(&self) -> Result<Vec<LoadMetrics>> { + Ok(self.0.clone()) + } +} + +struct DirectDispatcher { + routing: Arc<RwLock<RoutingTable>>, + calls: Mutex<Vec<PlannedMove>>, + fired: AtomicBool, +} + +impl DirectDispatcher { + fn new(routing: Arc<RwLock<RoutingTable>>) -> Arc<Self> { + Arc::new(Self { + routing, + calls: Mutex::new(Vec::new()), + fired: AtomicBool::new(false), + }) + } + fn calls(&self) -> Vec<PlannedMove> { + self.calls.lock().unwrap().clone() + } + fn fired(&self) -> bool { + self.fired.load(Ordering::SeqCst) + } +} + +#[async_trait] +impl MigrationDispatcher for DirectDispatcher { + async fn dispatch(&self, mv: PlannedMove) -> Result<()> { + // Simulate a completed migration by flipping the group + // leader to the target node. 
+ { + let mut rt = self.routing.write().unwrap_or_else(|p| p.into_inner()); + rt.set_leader(mv.source_group, mv.target_node); + } + self.calls.lock().unwrap().push(mv); + self.fired.store(true, Ordering::SeqCst); + Ok(()) + } +} + +fn topo(nodes: &[u64]) -> Arc> { + let mut t = ClusterTopology::new(); + for (i, id) in nodes.iter().enumerate() { + let a: std::net::SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + t.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + Arc::new(RwLock::new(t)) +} + +fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn rebalancer_loop_dispatches_and_mutates_routing() { + // --- 3 active nodes, 6 groups, node 1 leads all of them (hot). + let topology = topo(&[1, 2, 3]); + let mut r = RoutingTable::uniform(6, &[1, 2, 3], 1); + for gid in 0..6 { + r.set_leader(gid, 1); + } + let routing = Arc::new(RwLock::new(r)); + + // --- Hot node 1, cold 2 and 3. + let metrics: Arc = Arc::new(StaticProvider(vec![ + lm(1, 500, 5000, 200.0, 200.0), + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ])); + + let dispatcher = DirectDispatcher::new(routing.clone()); + let gate: Arc = Arc::new(AlwaysReadyGate); + + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_millis(50), + plan: RebalancerPlanConfig::default(), + }, + metrics, + dispatcher.clone() as Arc, + gate, + routing.clone(), + topology.clone(), + )); + + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&rloop); + async move { d.run(shutdown_rx).await } + }); + + // Wall-clock wait — the loop uses real time, so just give it a + // couple of intervals to sweep + spawn + dispatch. 
+ let deadline = std::time::Instant::now() + Duration::from_secs(3); + while std::time::Instant::now() < deadline { + if dispatcher.fired() { + break; + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + assert!( + dispatcher.fired(), + "rebalancer loop never dispatched a move" + ); + + // Every move must have node 1 as source. + let calls = dispatcher.calls(); + assert!(!calls.is_empty()); + for c in &calls { + assert_eq!(c.source_node, 1, "source must be the hot node"); + assert_ne!(c.target_node, 1, "target must differ from source"); + } + + // Routing mutation: at least one group previously led by 1 now + // has a non-1 leader. + let rt = routing.read().unwrap(); + let still_on_1 = (0..6) + .filter(|gid| rt.group_info(*gid).unwrap().leader == 1) + .count(); + assert!( + still_on_1 < 6, + "at least one group should have moved off node 1" + ); + + let _ = shutdown_tx.send(true); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; +} From 3d7f26a123fbc9d1194b0ebac7ee6d45fec918ce Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 09:17:34 +0800 Subject: [PATCH 08/24] fix(cluster): correct migration learner promotion and replicated Phase 3 cut-over Phase 1 previously added the target node directly as a voter via `ConfChangeType::AddNode`. This was incorrect: a node with an empty log cannot safely participate in elections or voting immediately. Phase 1 now adds the target as a learner (`AddLearner`) so Raft replication can bring it up to date without affecting quorum. Phase 2 detects when the learner's `match_index` reaches the leader's `commit_index` and promotes it to voter via `ConfChangeType::PromoteLearner` before returning. This ensures the Raft group has sufficient replicas for a safe cut-over before Phase 3 begins. Phase 3 cut-over is updated to route the routing change through the metadata Raft group when a `MetadataProposer` is attached. 
The `LeadershipTransfer` metadata entry is proposed and waited on so every node applies the routing update atomically at the same commit index. Without a proposer (test environments without a metadata group) the executor falls back to a local-only routing mutation via `set_leader`. A `with_metadata_proposer` builder method is added to `MigrationExecutor` for production wiring; tests continue to work without it. --- nodedb-cluster/src/migration_executor.rs | 99 +++++++++++++++--------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/nodedb-cluster/src/migration_executor.rs b/nodedb-cluster/src/migration_executor.rs index 9caeee80..18ea6b18 100644 --- a/nodedb-cluster/src/migration_executor.rs +++ b/nodedb-cluster/src/migration_executor.rs @@ -16,8 +16,10 @@ use std::time::Duration; use tracing::{debug, info}; use crate::conf_change::{ConfChange, ConfChangeType}; +use crate::decommission::MetadataProposer; use crate::error::{ClusterError, Result}; use crate::ghost::{GhostStub, GhostTable}; +use crate::metadata_group::{MetadataEntry, RoutingChange}; use crate::migration::{MigrationPhase, MigrationState}; use crate::multi_raft::MultiRaft; use crate::routing::RoutingTable; @@ -65,6 +67,13 @@ pub struct MigrationExecutor { topology: Arc>, transport: Arc, ghost_table: Arc>, + /// Optional metadata proposer for replicated routing updates. + /// When set, Phase 3 cut-over proposes a `RoutingChange` through + /// the metadata Raft group so every node applies the routing + /// update atomically on commit. When `None`, falls back to + /// local-only routing mutation (used by tests that don't stand + /// up a metadata group). + metadata_proposer: Option>, } impl MigrationExecutor { @@ -80,9 +89,17 @@ impl MigrationExecutor { topology, transport, ghost_table: Arc::new(Mutex::new(GhostTable::new())), + metadata_proposer: None, } } + /// Attach a metadata proposer for replicated Phase 3 cut-over. + /// Production wiring calls this; tests may omit it for simplicity. 
+ pub fn with_metadata_proposer(mut self, proposer: Arc) -> Self { + self.metadata_proposer = Some(proposer); + self + } + /// Access the ghost table (for scatter-gather resolution). pub fn ghost_table(&self) -> &Arc> { &self.ghost_table @@ -180,9 +197,11 @@ impl MigrationExecutor { "phase 1: adding target to raft group" ); - // Add target node as a voter to the Raft group via ConfChange. + // Add target node as a LEARNER so it can catch up via Raft + // replication without participating in elections or voting. + // Promotion to voter happens after Phase 2 confirms catch-up. let change = ConfChange { - change_type: ConfChangeType::AddNode, + change_type: ConfChangeType::AddLearner, node_id: req.target_node, }; @@ -202,12 +221,13 @@ impl MigrationExecutor { // The ConfChange will be replicated and applied. The target node // receives the full log through Raft's normal replication. - // Mark base copy as complete immediately — Raft handles the transfer. + // Mark base copy as complete — Raft replication is now in + // progress; the real progress signal is match_index in Phase 2. state.update_base_copy(committed); debug!( vshard = req.vshard_id, - "phase 1 complete: target added to raft group" + "phase 1 complete: target added as learner to raft group" ); Ok(()) @@ -313,9 +333,21 @@ impl MigrationExecutor { state.update_wal_catchup(leader_commit, target_match); if state.is_catchup_ready() { + // Learner has caught up — promote to voter so the + // group has enough replicas for a safe cut-over. 
+ let promote = ConfChange { + change_type: ConfChangeType::PromoteLearner, + node_id: req.target_node, + }; + { + let mut mr = self.multi_raft.lock().unwrap_or_else(|p| p.into_inner()); + mr.propose_conf_change(group_id, &promote)?; + } debug!( vshard = req.vshard_id, - leader_commit, target_match, "phase 2 complete: target caught up" + leader_commit, + target_match, + "phase 2 complete: target caught up and promoted to voter" ); return Ok(()); } @@ -331,15 +363,20 @@ impl MigrationExecutor { } } - /// Phase 3: Atomic routing table update via Raft. + /// Phase 3: Atomic routing table update. + /// + /// When a [`MetadataProposer`] is attached, the cut-over proposes + /// a `LeadershipTransfer` through the metadata Raft group so + /// every node applies the routing update atomically on commit. + /// Without a proposer (tests), falls back to a local-only + /// mutation. async fn phase3_cutover( &self, state: &mut MigrationState, group_id: u64, req: &MigrationRequest, ) -> Result<()> { - // Estimate pause (time to propose + commit the routing update). - let estimated_pause_us = 10_000; // ~10ms estimate for Raft round-trip. + let estimated_pause_us = 10_000; state.start_cutover(estimated_pause_us).map_err(|e| { state.fail(format!("cutover rejected: {e}")); @@ -353,28 +390,23 @@ impl MigrationExecutor { estimated_pause_us, "phase 3: atomic cut-over" ); - // Propose the routing update as a Raft entry so all nodes apply it - // atomically when committed. The entry is serialized as a ConfChange - // with a special routing marker that the applier interprets. - let routing_change = ConfChange { - change_type: ConfChangeType::AddNode, - node_id: req.target_node, - }; - { - let mut mr = self.multi_raft.lock().unwrap_or_else(|p| p.into_inner()); - mr.propose_conf_change(group_id, &routing_change)?; - } - - // Update the local routing table. Other nodes update theirs when they - // apply the committed entry through their own applier. - { + // Propose the routing change. 
With a metadata proposer the + // `CacheApplier::with_live_state` on every node handles the + // actual routing mutation when the entry commits; without a + // proposer we mutate locally for backward-compat. + if let Some(proposer) = &self.metadata_proposer { + let entry = MetadataEntry::RoutingChange(RoutingChange::LeadershipTransfer { + group_id, + new_leader_node_id: req.target_node, + }); + proposer.propose_and_wait(entry).await?; + } else { let mut routing = self.routing.write().unwrap_or_else(|p| p.into_inner()); - routing.reassign_vshard(req.vshard_id, group_id); + routing.set_leader(group_id, req.target_node); } - // Install ghost stub on source so scatter-gather queries that arrive - // before the client refreshes its routing table are transparently - // forwarded to the new owner. + // Ghost stub so in-flight scatter-gather queries that still + // target the old leader are transparently forwarded. { let mut ghosts = self.ghost_table.lock().unwrap_or_else(|p| p.into_inner()); ghosts.insert(GhostStub { @@ -387,18 +419,13 @@ impl MigrationExecutor { .as_millis() as u64, }); } - debug!( - vshard = req.vshard_id, - target = req.target_node, - "ghost stub registered for transparent forwarding" - ); let actual_pause_us = cutover_start.elapsed().as_micros() as u64; state.complete(actual_pause_us); debug!( vshard = req.vshard_id, - actual_pause_us, "phase 3 complete: routing updated via raft" + actual_pause_us, "phase 3 complete: routing updated" ); Ok(()) @@ -521,14 +548,14 @@ mod tests { write_pause_budget_us: 500_000, }; - // Phase 1 should succeed (adds node 2 to group 0). + // Phase 1 should succeed (adds node 2 as learner to group 0). executor .phase1_base_copy(&mut state, 0, &req) .await .unwrap(); - // Verify: the ConfChange was proposed (it's in the Raft log). - // The actual application happens when committed, which requires tick(). + // Verify: the ConfChange (AddLearner) was proposed in the Raft log. + // Application happens on next tick/commit cycle. 
} #[test] From cf92257e18b729b4702dc974225477b756700d33 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 09:29:52 +0800 Subject: [PATCH 09/24] feat(cluster): trigger rebalancer sweep on SWIM membership changes Introduce RebalancerKickHook, a MembershipSubscriber that wakes the rebalancer loop immediately when a node joins or leaves, replacing passive polling for topology changes. --- nodedb-cluster/src/lib.rs | 4 +- nodedb-cluster/src/rebalancer/driver.rs | 20 ++- nodedb-cluster/src/rebalancer/elastic.rs | 147 ++++++++++++++++++++ nodedb-cluster/src/rebalancer/mod.rs | 2 + nodedb-cluster/tests/elastic_scaling.rs | 168 +++++++++++++++++++++++ 5 files changed, 338 insertions(+), 3 deletions(-) create mode 100644 nodedb-cluster/src/rebalancer/elastic.rs create mode 100644 nodedb-cluster/tests/elastic_scaling.rs diff --git a/nodedb-cluster/src/lib.rs b/nodedb-cluster/src/lib.rs index 83834e21..440b3aee 100644 --- a/nodedb-cluster/src/lib.rs +++ b/nodedb-cluster/src/lib.rs @@ -68,8 +68,8 @@ pub use reachability::{ pub use rebalance::{RebalancePlan, compute_plan, plan_to_requests}; pub use rebalancer::{ AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, LoadWeights, - MigrationDispatcher, RebalancerLoop, RebalancerLoopConfig, RebalancerPlanConfig, - compute_load_based_plan, normalized_score, + MigrationDispatcher, RebalancerKickHook, RebalancerLoop, RebalancerLoopConfig, + RebalancerPlanConfig, compute_load_based_plan, normalized_score, }; pub use routing::RoutingTable; pub use routing_liveness::{NodeIdResolver, RoutingLivenessHook}; diff --git a/nodedb-cluster/src/rebalancer/driver.rs b/nodedb-cluster/src/rebalancer/driver.rs index 6b5289bd..52b89bd9 100644 --- a/nodedb-cluster/src/rebalancer/driver.rs +++ b/nodedb-cluster/src/rebalancer/driver.rs @@ -27,7 +27,7 @@ use std::sync::{Arc, RwLock}; use std::time::Duration; use async_trait::async_trait; -use tokio::sync::watch; +use tokio::sync::{Notify, watch}; use 
tokio::time::{MissedTickBehavior, interval}; use tracing::{debug, info, warn}; @@ -96,6 +96,12 @@ pub struct RebalancerLoop { gate: Arc<dyn ElectionGate>, routing: Arc<RwLock<RoutingTable>>, topology: Arc<RwLock<ClusterTopology>>, + /// Membership-change notification. When any caller (a SWIM + /// subscriber, a manual admin trigger, etc.) calls + /// [`notify`](Notify::notify_one) on this handle, the run loop + /// wakes up immediately and runs an extra sweep instead of + /// waiting for the next 30 s tick. + kick: Arc<Notify>, } impl RebalancerLoop { @@ -114,9 +120,17 @@ impl RebalancerLoop { gate, routing, topology, + kick: Arc::new(Notify::new()), } } + /// Return a handle that callers can use to trigger an immediate + /// sweep. Cloning the `Arc<Notify>` is cheap; every clone + /// shares the same waker. + pub fn kick_handle(&self) -> Arc<Notify> { + Arc::clone(&self.kick) + } + /// Run the driver until `shutdown` flips to `true`. pub async fn run(self: Arc<Self>, mut shutdown: watch::Receiver<bool>) { let mut tick = interval(self.cfg.interval); @@ -136,6 +150,10 @@ impl RebalancerLoop { _ = tick.tick() => { self.sweep_once().await; } + _ = self.kick.notified() => { + debug!("rebalancer: membership-change kick received"); + self.sweep_once().await; + } } } debug!("rebalancer loop shutting down"); diff --git a/nodedb-cluster/src/rebalancer/elastic.rs b/nodedb-cluster/src/rebalancer/elastic.rs new file mode 100644 index 00000000..36903741 --- /dev/null +++ b/nodedb-cluster/src/rebalancer/elastic.rs @@ -0,0 +1,147 @@ +//! Elastic scaling glue — ties SWIM membership transitions to the +//! rebalancer loop so new/departing nodes trigger an immediate sweep +//! instead of waiting for the next 30 s tick. +//! +//! ## Add-node path +//! +//! 1. Node joins via the existing bootstrap/join RPC path. +//! 2. `CacheApplier` with live state applies `TopologyChange::Join` +//! + `PromoteToVoter`, adding the node to the live topology. +//! 3. SWIM detects the new node as `Alive` through gossip. +//! 4. [`RebalancerKickHook`] (a [`MembershipSubscriber`]) fires +//! 
[`Notify::notify_one`] on the rebalancer loop's kick handle. +//! 5. The loop wakes, collects metrics (including the new node's +//! low load score), and dispatches moves to the new node. +//! +//! ## Remove-node path +//! +//! 1. Operator runs `cluster decommission N` (Phase E.4). +//! 2. The decommission flow strips the node from all groups and +//! removes it from topology. +//! 3. SWIM detects the node as `Dead` / `Left`. +//! 4. The same kick hook wakes the rebalancer so it re-evaluates +//! whether the remaining nodes are balanced. +//! +//! No new data types or traits — just a [`MembershipSubscriber`] +//! impl holding a shared `Arc<Notify>`. + +use std::sync::Arc; + +use nodedb_types::NodeId; +use tokio::sync::Notify; +use tracing::debug; + +use crate::swim::member::MemberState; +use crate::swim::subscriber::MembershipSubscriber; + +/// SWIM [`MembershipSubscriber`] that triggers an immediate +/// rebalancer sweep on membership-relevant transitions. +/// +/// Relevant transitions are: +/// - `None → Alive` (first time a new node is seen — add path) +/// - `_ → Dead` / `_ → Left` (node departure — remove path) +/// - `_ → Alive` after `Dead`/`Left` (node recovery) +/// +/// All other transitions (Alive → Suspect, Suspect → Alive) are +/// transient and do not change the set of Active nodes, so they +/// are ignored. +pub struct RebalancerKickHook { + kick: Arc<Notify>, +} + +impl RebalancerKickHook { + pub fn new(kick: Arc<Notify>) -> Self { + Self { kick } + } +} + +impl MembershipSubscriber for RebalancerKickHook { + fn on_state_change(&self, node_id: &NodeId, old: Option<MemberState>, new: MemberState) { + let relevant = match (old, new) { + // First-time insert as Alive (new node joined). + (None, MemberState::Alive) => true, + // Node died or left. + (_, MemberState::Dead) | (_, MemberState::Left) => true, + // Node recovered from Dead/Left back to Alive. 
+ (Some(MemberState::Dead), MemberState::Alive) + | (Some(MemberState::Left), MemberState::Alive) => true, + _ => false, + }; + if relevant { + debug!(?node_id, ?old, ?new, "rebalancer kick: membership change"); + self.kick.notify_one(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicU32, Ordering}; + + fn counting_notify() -> (Arc<Notify>, Arc<AtomicU32>, tokio::task::JoinHandle<()>) { + let notify = Arc::new(Notify::new()); + let counter = Arc::new(AtomicU32::new(0)); + let n = notify.clone(); + let c = counter.clone(); + let handle = tokio::spawn(async move { + loop { + n.notified().await; + c.fetch_add(1, Ordering::SeqCst); + } + }); + (notify, counter, handle) + } + + #[tokio::test] + async fn kick_fires_on_new_node_alive() { + let (notify, counter, handle) = counting_notify(); + let hook = RebalancerKickHook::new(notify); + hook.on_state_change(&NodeId::new("new"), None, MemberState::Alive); + tokio::task::yield_now().await; + tokio::task::yield_now().await; + assert!(counter.load(Ordering::SeqCst) >= 1); + handle.abort(); + } + + #[tokio::test] + async fn kick_fires_on_dead() { + let (notify, counter, handle) = counting_notify(); + let hook = RebalancerKickHook::new(notify); + hook.on_state_change( + &NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Dead, + ); + tokio::task::yield_now().await; + tokio::task::yield_now().await; + assert!(counter.load(Ordering::SeqCst) >= 1); + handle.abort(); + } + + #[tokio::test] + async fn kick_fires_on_left() { + let (notify, counter, handle) = counting_notify(); + let hook = RebalancerKickHook::new(notify); + hook.on_state_change( + &NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Left, + ); + tokio::task::yield_now().await; + tokio::task::yield_now().await; + assert!(counter.load(Ordering::SeqCst) >= 1); + handle.abort(); + } + + #[test] + fn kick_does_not_fire_on_suspect() { + let notify = Arc::new(Notify::new()); + let hook = RebalancerKickHook::new(notify); + 
hook.on_state_change( + &NodeId::new("x"), + Some(MemberState::Alive), + MemberState::Suspect, + ); + } +} diff --git a/nodedb-cluster/src/rebalancer/mod.rs b/nodedb-cluster/src/rebalancer/mod.rs index 4b687c69..3374ca06 100644 --- a/nodedb-cluster/src/rebalancer/mod.rs +++ b/nodedb-cluster/src/rebalancer/mod.rs @@ -22,11 +22,13 @@ //! 30 s cadence, complementing the overload path. pub mod driver; +pub mod elastic; pub mod metrics; pub mod plan; pub use driver::{ AlwaysReadyGate, ElectionGate, MigrationDispatcher, RebalancerLoop, RebalancerLoopConfig, }; +pub use elastic::RebalancerKickHook; pub use metrics::{LoadMetrics, LoadMetricsProvider, LoadWeights, normalized_score}; pub use plan::{RebalancerPlanConfig, compute_load_based_plan}; diff --git a/nodedb-cluster/tests/elastic_scaling.rs b/nodedb-cluster/tests/elastic_scaling.rs new file mode 100644 index 00000000..f51fafd8 --- /dev/null +++ b/nodedb-cluster/tests/elastic_scaling.rs @@ -0,0 +1,168 @@ +//! Elastic add/remove — proves the end-to-end path from membership +//! change to rebalancer dispatch. +//! +//! - **Add-node**: 3 balanced nodes, 4th node joins with zero load → +//! kick fires → sweep dispatches moves to the new node. +//! - **Remove-node**: covered by `decommission_flow.rs` — the +//! decommission plan strips the node from all groups, and the +//! rebalancer loop naturally re-evaluates on its next tick. 
+ +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex, RwLock}; +use std::time::Duration; + +use async_trait::async_trait; + +use nodedb_cluster::error::Result; +use nodedb_cluster::rebalance::PlannedMove; +use nodedb_cluster::rebalancer::{ + AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, MigrationDispatcher, + RebalancerKickHook, RebalancerLoop, RebalancerLoopConfig, RebalancerPlanConfig, +}; +use nodedb_cluster::routing::RoutingTable; +use nodedb_cluster::swim::MemberState; +use nodedb_cluster::swim::subscriber::MembershipSubscriber; +use nodedb_cluster::topology::{ClusterTopology, NodeInfo, NodeState}; +use nodedb_types::NodeId; + +struct DynamicProvider { + metrics: Mutex<Vec<LoadMetrics>>, +} + +impl DynamicProvider { + fn new(initial: Vec<LoadMetrics>) -> Arc<Self> { + Arc::new(Self { + metrics: Mutex::new(initial), + }) + } + fn push(&self, m: LoadMetrics) { + self.metrics.lock().unwrap().push(m); + } +} + +#[async_trait] +impl LoadMetricsProvider for DynamicProvider { + async fn snapshot(&self) -> Result<Vec<LoadMetrics>> { + Ok(self.metrics.lock().unwrap().clone()) + } +} + +struct RecordingDispatcher { + calls: Mutex<Vec<PlannedMove>>, + fired: AtomicBool, +} + +impl RecordingDispatcher { + fn new() -> Arc<Self> { + Arc::new(Self { + calls: Mutex::new(Vec::new()), + fired: AtomicBool::new(false), + }) + } +} + +#[async_trait] +impl MigrationDispatcher for RecordingDispatcher { + async fn dispatch(&self, mv: PlannedMove) -> Result<()> { + self.calls.lock().unwrap().push(mv); + self.fired.store(true, Ordering::SeqCst); + Ok(()) + } +} + +fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { + LoadMetrics { + node_id: id, + vshards_led: v, + bytes_stored: bytes_mib * 1_048_576, + writes_per_sec: w, + reads_per_sec: r, + } +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn add_node_triggers_rebalance_via_kick() { + // --- Initial state: 3 balanced nodes, 6 groups. 
+ let mut topo = ClusterTopology::new(); + for (i, id) in [1u64, 2, 3].iter().enumerate() { + let a: std::net::SocketAddr = format!("127.0.0.1:{}", 9000 + i).parse().unwrap(); + topo.add_node(NodeInfo::new(*id, a, NodeState::Active)); + } + let topology = Arc::new(RwLock::new(topo)); + let mut rt = RoutingTable::uniform(6, &[1, 2, 3], 1); + // Node 1 leads all 6 groups → hot. + for gid in 0..6 { + rt.set_leader(gid, 1); + } + let routing = Arc::new(RwLock::new(rt)); + + // Metrics: node 1 hot, 2 and 3 moderate. + let provider = DynamicProvider::new(vec![ + lm(1, 200, 2000, 200.0, 200.0), + lm(2, 50, 500, 50.0, 50.0), + lm(3, 50, 500, 50.0, 50.0), + ]); + + let dispatcher = RecordingDispatcher::new(); + let gate: Arc = Arc::new(AlwaysReadyGate); + + // Use a long interval so the normal tick doesn't fire before the + // kick does — the kick is the signal we're testing. + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_secs(300), + plan: RebalancerPlanConfig::default(), + }, + provider.clone() as Arc, + dispatcher.clone() as Arc, + gate, + routing.clone(), + topology.clone(), + )); + + // Wire the kick hook. + let kick_hook = RebalancerKickHook::new(rloop.kick_handle()); + + let (shutdown_tx, shutdown_rx) = tokio::sync::watch::channel(false); + let handle = tokio::spawn({ + let d = Arc::clone(&rloop); + async move { d.run(shutdown_rx).await } + }); + + // --- Simulate node 4 joining. + { + let mut t = topology.write().unwrap(); + let a: std::net::SocketAddr = "127.0.0.1:9003".parse().unwrap(); + t.add_node(NodeInfo::new(4, a, NodeState::Active)); + } + // Add node 4's zero-load metrics so the planner sees it as cold. + provider.push(lm(4, 0, 0, 0.0, 0.0)); + + // Fire the SWIM membership hook — this should kick the loop. + kick_hook.on_state_change(&NodeId::new("node-4"), None, MemberState::Alive); + + // Wait for the dispatcher to fire. 
+ let deadline = std::time::Instant::now() + Duration::from_secs(3); + while std::time::Instant::now() < deadline { + if dispatcher.fired.load(Ordering::SeqCst) { + break; + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + assert!( + dispatcher.fired.load(Ordering::SeqCst), + "kick did not trigger a rebalancer dispatch" + ); + + // At least one move should target node 4 (the cold newcomer). + let calls = dispatcher.calls.lock().unwrap().clone(); + assert!(!calls.is_empty()); + let to_4 = calls.iter().filter(|m| m.target_node == 4).count(); + assert!( + to_4 > 0, + "expected at least one move targeting node 4, got {to_4}" + ); + + let _ = shutdown_tx.send(true); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; +} From 239f65735fd5812c674033e17ddb79e471034966 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 09:59:44 +0800 Subject: [PATCH 10/24] feat(cluster): add SHOW RANGES, SHOW ROUTING, and SHOW SCHEMA VERSION commands Implement three new cluster introspection statements: - SHOW RANGES: lists vshard distribution with leaseholder and replica info - SHOW ROUTING : resolves a routing hint to the owning vshard - SHOW SCHEMA VERSION: reports the current schema version across the cluster Wire dispatch through the pgwire DDL router and native SQL dispatcher, and exclude all three from the standard DML fast-path guard. 
--- .../src/control/server/native/dispatch/sql.rs | 3 + .../control/server/pgwire/ddl/cluster/mod.rs | 6 ++ .../server/pgwire/ddl/cluster/ranges.rs | 75 +++++++++++++++++++ .../server/pgwire/ddl/cluster/routing_hint.rs | 72 ++++++++++++++++++ .../pgwire/ddl/cluster/schema_version.rs | 57 ++++++++++++++ .../control/server/pgwire/ddl/router/admin.rs | 9 +++ .../control/server/pgwire/handler/sql_exec.rs | 3 + 7 files changed, 225 insertions(+) create mode 100644 nodedb/src/control/server/pgwire/ddl/cluster/ranges.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/cluster/routing_hint.rs create mode 100644 nodedb/src/control/server/pgwire/ddl/cluster/schema_version.rs diff --git a/nodedb/src/control/server/native/dispatch/sql.rs b/nodedb/src/control/server/native/dispatch/sql.rs index 570b3c21..7831d3cb 100644 --- a/nodedb/src/control/server/native/dispatch/sql.rs +++ b/nodedb/src/control/server/native/dispatch/sql.rs @@ -282,6 +282,9 @@ fn is_session_show(upper: &str) -> bool { && !upper.starts_with("SHOW PEER") && !upper.starts_with("SHOW NODES") && !upper.starts_with("SHOW NODE ") + && !upper.starts_with("SHOW RANGES") + && !upper.starts_with("SHOW ROUTING") + && !upper.starts_with("SHOW SCHEMA VERSION") && !upper.starts_with("SHOW COLLECTIONS") && !upper.starts_with("SHOW AUDIT") && !upper.starts_with("SHOW PERMISSIONS") diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs b/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs index 58509162..fbd34c9f 100644 --- a/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs +++ b/nodedb/src/control/server/pgwire/ddl/cluster/mod.rs @@ -1,11 +1,17 @@ pub mod health; pub mod migration; pub mod raft; +pub mod ranges; pub mod rebalance_cmd; +pub mod routing_hint; +pub mod schema_version; pub mod topology; pub use health::show_peer_health; pub use migration::show_migrations; pub use raft::{alter_raft_group, show_raft_group, show_raft_groups}; +pub use ranges::show_ranges; pub use rebalance_cmd::rebalance; +pub 
use routing_hint::show_routing; +pub use schema_version::show_schema_version; pub use topology::{remove_node, show_cluster, show_node, show_nodes}; diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/ranges.rs b/nodedb/src/control/server/pgwire/ddl/cluster/ranges.rs new file mode 100644 index 00000000..8c82e2a1 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/cluster/ranges.rs @@ -0,0 +1,75 @@ +//! `SHOW RANGES` — vshard distribution across the cluster. + +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::super::super::types::{int8_field, sqlstate_error, text_field}; + +/// SHOW RANGES — list vshards with leaseholder and replica info. +/// +/// Columns: vshard_id, group_id, leaseholder, replicas. +/// Superuser only. +pub fn show_ranges( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + if !identity.is_superuser { + return Err(sqlstate_error( + "42501", + "permission denied: only superuser can view ranges", + )); + } + + let routing = match &state.cluster_routing { + Some(r) => r, + None => { + return Err(sqlstate_error( + "55000", + "cluster mode not enabled (single-node instance)", + )); + } + }; + + let schema = Arc::new(vec![ + int8_field("vshard_id"), + int8_field("group_id"), + int8_field("leaseholder"), + text_field("replicas"), + ]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let rt = routing.read().unwrap_or_else(|p| p.into_inner()); + for vshard_id in 0..nodedb_cluster::routing::VSHARD_COUNT { + let group_id = rt.group_for_vshard(vshard_id).unwrap_or(0); + let (leader, replicas_str) = match rt.group_info(group_id) { + Some(info) => { + let replicas: String = info + .members + .iter() + .map(|m| m.to_string()) + .collect::>() + .join(", "); + 
(info.leader as i64, replicas) + } + None => (0i64, String::new()), + }; + encoder.encode_field(&(vshard_id as i64))?; + encoder.encode_field(&(group_id as i64))?; + encoder.encode_field(&leader)?; + encoder.encode_field(&replicas_str)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/routing_hint.rs b/nodedb/src/control/server/pgwire/ddl/cluster/routing_hint.rs new file mode 100644 index 00000000..4bdb57a5 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/cluster/routing_hint.rs @@ -0,0 +1,72 @@ +//! `SHOW ROUTING` — expose the vshard → leaseholder → node address +//! mapping so smart clients can cache it and route writes directly +//! to the leaseholder, skipping the gateway hop. +//! +//! Result columns: `vshard_id`, `group_id`, `leaseholder_node_id`, +//! `leaseholder_addr`. + +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::super::super::types::{int8_field, sqlstate_error, text_field}; + +/// SHOW ROUTING — full vshard → leaseholder → address table. +/// +/// Any authenticated user may call this (smart-client libs need it). 
+pub fn show_routing( + state: &SharedState, + _identity: &AuthenticatedIdentity, +) -> PgWireResult<Vec<Response>> { + let routing = match &state.cluster_routing { + Some(r) => r, + None => { + return Err(sqlstate_error( + "55000", + "cluster mode not enabled (single-node instance)", + )); + } + }; + + let schema = Arc::new(vec![ + int8_field("vshard_id"), + int8_field("group_id"), + int8_field("leaseholder_node_id"), + text_field("leaseholder_addr"), + ]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let rt = routing.read().unwrap_or_else(|p| p.into_inner()); + let topo_guard = state + .cluster_topology + .as_ref() + .map(|t| t.read().unwrap_or_else(|p| p.into_inner())); + + for vshard_id in 0..nodedb_cluster::routing::VSHARD_COUNT { + let group_id = rt.group_for_vshard(vshard_id).unwrap_or(0); + let leader = rt.group_info(group_id).map(|info| info.leader).unwrap_or(0); + let addr = topo_guard + .as_ref() + .and_then(|topo| topo.get_node(leader)) + .map(|n| n.addr.clone()) + .unwrap_or_default(); + + encoder.encode_field(&(vshard_id as i64))?; + encoder.encode_field(&(group_id as i64))?; + encoder.encode_field(&(leader as i64))?; + encoder.encode_field(&addr)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} diff --git a/nodedb/src/control/server/pgwire/ddl/cluster/schema_version.rs new file mode 100644 index 00000000..9c0e9d94 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/cluster/schema_version.rs @@ -0,0 +1,57 @@ +//! `SHOW SCHEMA VERSION` — current descriptor version visible on +//! this node. 
+ +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::super::super::types::{sqlstate_error, text_field}; + +/// SHOW SCHEMA VERSION — report the current descriptor version +/// counter and per-collection metadata if available. +pub fn show_schema_version( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + if !identity.is_superuser { + return Err(sqlstate_error( + "42501", + "permission denied: only superuser can view schema version", + )); + } + + let schema = Arc::new(vec![text_field("property"), text_field("value")]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let version = state.schema_version.current(); + encoder.encode_field(&"schema_version")?; + encoder.encode_field(&version.to_string())?; + rows.push(Ok(encoder.take_row())); + + let applied_index = { + let cache = state + .metadata_cache + .read() + .unwrap_or_else(|p| p.into_inner()); + cache.applied_index + }; + encoder.encode_field(&"metadata_applied_index")?; + encoder.encode_field(&applied_index.to_string())?; + rows.push(Ok(encoder.take_row())); + + encoder.encode_field(&"node_id")?; + encoder.encode_field(&state.node_id.to_string())?; + rows.push(Ok(encoder.take_row())); + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} diff --git a/nodedb/src/control/server/pgwire/ddl/router/admin.rs b/nodedb/src/control/server/pgwire/ddl/router/admin.rs index c85cdc9a..49ab2031 100644 --- a/nodedb/src/control/server/pgwire/ddl/router/admin.rs +++ b/nodedb/src/control/server/pgwire/ddl/router/admin.rs @@ -441,6 +441,15 @@ pub(super) async fn dispatch( if upper.starts_with("REMOVE NODE ") { return Some(super::super::cluster::remove_node(state, identity, parts)); } + if 
upper.starts_with("SHOW RANGES") { + return Some(super::super::cluster::show_ranges(state, identity)); + } + if upper.starts_with("SHOW ROUTING") { + return Some(super::super::cluster::show_routing(state, identity)); + } + if upper.starts_with("SHOW SCHEMA VERSION") { + return Some(super::super::cluster::show_schema_version(state, identity)); + } // Introspection. if upper.starts_with("SHOW USERS") { diff --git a/nodedb/src/control/server/pgwire/handler/sql_exec.rs b/nodedb/src/control/server/pgwire/handler/sql_exec.rs index ef469e04..0c2245dd 100644 --- a/nodedb/src/control/server/pgwire/handler/sql_exec.rs +++ b/nodedb/src/control/server/pgwire/handler/sql_exec.rs @@ -217,6 +217,9 @@ impl NodeDbPgHandler { && !upper.starts_with("SHOW PEER") && !upper.starts_with("SHOW NODES") && !upper.starts_with("SHOW NODE ") + && !upper.starts_with("SHOW RANGES") + && !upper.starts_with("SHOW ROUTING") + && !upper.starts_with("SHOW SCHEMA VERSION") && !upper.starts_with("SHOW COLLECTIONS") && !upper.starts_with("SHOW AUDIT") && !upper.starts_with("SHOW PERMISSIONS") From ff6c817ac58df0098a9a7f3079a7131866b6d38e Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 09:59:54 +0800 Subject: [PATCH 11/24] feat(http): add /health/live and /health/drain endpoints Add two new health endpoints alongside the existing /healthz and /health: - GET /health/live: unconditional liveness probe (always 200); suitable as a k8s liveness probe since no internal state is checked - POST /health/drain: signals the shutdown watch to begin graceful drain, causing /healthz to return 503 so the service mesh stops routing new connections; designed as a k8s preStop hook Update /healthz to return 503 when the cluster observer reports the node is draining or decommissioned, and fix the startup gate middleware comment to reflect all /health* paths being exempt from the startup gate. 
--- .../src/control/server/http/routes/health.rs | 69 +++++++++++++++++-- nodedb/src/control/server/http/server.rs | 8 ++- 2 files changed, 70 insertions(+), 7 deletions(-) diff --git a/nodedb/src/control/server/http/routes/health.rs b/nodedb/src/control/server/http/routes/health.rs index a97e02af..6a717d57 100644 --- a/nodedb/src/control/server/http/routes/health.rs +++ b/nodedb/src/control/server/http/routes/health.rs @@ -1,4 +1,12 @@ //! Health check endpoints. +//! +//! | Endpoint | Method | Purpose | k8s probe | +//! |-------------------|--------|-----------------------------|---------------| +//! | `/health/live` | GET | Process alive (always 200) | liveness | +//! | `/healthz` | GET | Ready to serve traffic | readiness | +//! | `/health` | GET | Liveness with cluster info | — | +//! | `/health/ready` | GET | WAL recovered | readiness alt | +//! | `/health/drain` | POST | Trigger graceful drain | preStop hook | use axum::extract::State; use axum::http::StatusCode; @@ -7,13 +15,36 @@ use serde_json::json; use super::super::auth::AppState; -/// GET /healthz — k8s-style readiness/liveness probe. +/// GET /health/live — unconditional liveness probe. /// -/// Returns `200 OK` when the node has reached `GatewayEnable` and is -/// serving traffic. Returns `503 Service Unavailable` during startup or if -/// startup has failed. This endpoint bypasses the startup gate middleware -/// and is always reachable, making it suitable as a k8s readiness probe. +/// Always returns 200. If this endpoint fails to respond, the +/// process is dead and should be restarted. No internal state is +/// checked — the mere ability to respond proves the event loop and +/// HTTP listener are alive. +pub async fn live() -> impl IntoResponse { + (StatusCode::OK, axum::Json(json!({ "status": "alive" }))) +} + +/// GET /healthz — k8s-style readiness probe. +/// +/// Returns `200 OK` when the node has reached `GatewayEnable`, is +/// serving traffic, and is NOT draining/decommissioned. 
Returns +/// `503 Service Unavailable` during startup, after startup failure, +/// or when the node is being decommissioned. pub async fn healthz(State(state): State) -> impl IntoResponse { + // Check decommission state via the cluster observer (if present). + if let Some(obs) = state.shared.cluster_observer.get() { + let snap = obs.snapshot(); + let label = snap.lifecycle_label(); + if label == "draining" || label == "decommissioned" || label == "failed" { + let body = json!({ + "status": "draining", + "lifecycle": label, + "node_id": state.shared.node_id, + }); + return (StatusCode::SERVICE_UNAVAILABLE, axum::Json(body)); + } + } let health = crate::control::startup::health::observe(&state.shared.startup); let (status, body) = crate::control::startup::health::to_http_response(&health); (status, axum::Json(body)) @@ -60,3 +91,31 @@ pub async fn ready(State(state): State) -> impl IntoResponse { }); (status, axum::Json(body)) } + +/// POST /health/drain — trigger graceful connection drain. +/// +/// Signals the canonical `ShutdownWatch` so every background loop +/// begins its cooperative exit. Subsequent `/healthz` calls return +/// 503, which causes the k8s readiness probe to fail and the +/// service mesh to stop routing new connections to this node. +/// +/// Designed for use as a k8s `preStop` hook: +/// +/// ```yaml +/// lifecycle: +/// preStop: +/// httpGet: +/// path: /health/drain +/// port: http +/// ``` +pub async fn drain(State(state): State) -> impl IntoResponse { + tracing::info!(node_id = state.shared.node_id, "drain requested via HTTP"); + state.shared.shutdown.signal(); + ( + StatusCode::OK, + axum::Json(json!({ + "status": "draining", + "node_id": state.shared.node_id, + })), + ) +} diff --git a/nodedb/src/control/server/http/server.rs b/nodedb/src/control/server/http/server.rs index 1a7e8d28..33d7a11c 100644 --- a/nodedb/src/control/server/http/server.rs +++ b/nodedb/src/control/server/http/server.rs @@ -3,7 +3,9 @@ //! Endpoints: //! 
- GET /healthz — k8s readiness/liveness (always reachable; 503 until GatewayEnable) //! - GET /health — liveness +//! - GET /health/live — unconditional liveness probe //! - GET /health/ready — readiness (WAL recovered) +//! - POST /health/drain — trigger graceful drain //! - GET /metrics — Prometheus-format metrics (requires monitor role) //! - POST /query — execute DDL via HTTP (requires auth) @@ -29,7 +31,9 @@ fn build_router(state: AppState) -> Router { // /healthz is always reachable — returns 503 during startup, 200 after. .route("/healthz", get(routes::health::healthz)) .route("/health", get(routes::health::health)) + .route("/health/live", get(routes::health::live)) .route("/health/ready", get(routes::health::ready)) + .route("/health/drain", post(routes::health::drain)) .route("/metrics", get(routes::metrics::metrics)) .route("/query", post(routes::query::query)) .route("/status", get(routes::status::status)) @@ -98,8 +102,8 @@ fn build_router(state: AppState) -> Router { /// Axum middleware that gates non-health routes on [`StartupPhase::GatewayEnable`]. /// -/// `/healthz`, `/health`, and `/health/ready` are always let through so k8s -/// readiness probes can observe startup progress. All other routes receive a +/// All `/health*` paths (liveness, readiness, drain) are always let through so +/// k8s probes can observe startup progress. All other routes receive a /// `503 Service Unavailable` until the node reaches `GatewayEnable`. 
async fn startup_gate_middleware( State(app_state): State, From b8f6618aa7142389beb3b890b820e8d939614391 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 10:21:54 +0800 Subject: [PATCH 12/24] fix(pgwire): wire pg_catalog dispatch into query handler and fix rebalancer test lock scope --- nodedb-cluster/tests/rebalancer_loop.rs | 18 ++++++++++-------- .../control/server/pgwire/handler/sql_exec.rs | 7 +++++++ nodedb/src/control/server/pgwire/mod.rs | 1 + 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/nodedb-cluster/tests/rebalancer_loop.rs b/nodedb-cluster/tests/rebalancer_loop.rs index 5d7193a8..33ba6bcb 100644 --- a/nodedb-cluster/tests/rebalancer_loop.rs +++ b/nodedb-cluster/tests/rebalancer_loop.rs @@ -160,14 +160,16 @@ async fn rebalancer_loop_dispatches_and_mutates_routing() { // Routing mutation: at least one group previously led by 1 now // has a non-1 leader. - let rt = routing.read().unwrap(); - let still_on_1 = (0..6) - .filter(|gid| rt.group_info(*gid).unwrap().leader == 1) - .count(); - assert!( - still_on_1 < 6, - "at least one group should have moved off node 1" - ); + { + let rt = routing.read().unwrap(); + let still_on_1 = (0..6) + .filter(|gid| rt.group_info(*gid).unwrap().leader == 1) + .count(); + assert!( + still_on_1 < 6, + "at least one group should have moved off node 1" + ); + } let _ = shutdown_tx.send(true); let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; diff --git a/nodedb/src/control/server/pgwire/handler/sql_exec.rs b/nodedb/src/control/server/pgwire/handler/sql_exec.rs index 0c2245dd..69936c8e 100644 --- a/nodedb/src/control/server/pgwire/handler/sql_exec.rs +++ b/nodedb/src/control/server/pgwire/handler/sql_exec.rs @@ -286,6 +286,13 @@ impl NodeDbPgHandler { ); } + // pg_catalog virtual tables — intercept before the normal planner. 
+ if let Some(result) = + super::super::pg_catalog::try_pg_catalog(&self.state, identity, &upper) + { + return result; + } + if let Some(result) = super::super::ddl::dispatch(&self.state, identity, sql_trimmed).await { return result; diff --git a/nodedb/src/control/server/pgwire/mod.rs b/nodedb/src/control/server/pgwire/mod.rs index 21cb8ac2..c2c90ed1 100644 --- a/nodedb/src/control/server/pgwire/mod.rs +++ b/nodedb/src/control/server/pgwire/mod.rs @@ -2,5 +2,6 @@ pub mod ddl; pub mod factory; pub mod handler; pub mod listener; +pub mod pg_catalog; pub mod session; pub mod types; From a2e87e5157163f1bac02a559a0af19d6af981503 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 11:47:07 +0800 Subject: [PATCH 13/24] feat(rebalancer): add CPU backpressure gate to pause sweeps under load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `backpressure_cpu_threshold` field to `RebalancerLoopConfig` (default 0.80). Before executing a rebalance sweep, the driver checks all node CPU utilization snapshots; if any node exceeds the threshold the sweep is skipped and a STATUS-level log is emitted. This prevents vShard migrations from amplifying cluster load when nodes are already stressed. 
- `LoadMetrics` gains a `cpu_utilization: f64` field (0.0–1.0) - Integration tests updated to populate the new field - Unit test `sweep_skipped_under_cpu_backpressure` verifies no migration is dispatched when a node reports 95% CPU --- nodedb-cluster/src/rebalancer/driver.rs | 57 +++++++++++++++++++++++- nodedb-cluster/src/rebalancer/metrics.rs | 5 +++ nodedb-cluster/src/rebalancer/plan.rs | 1 + nodedb-cluster/tests/elastic_scaling.rs | 5 ++- nodedb-cluster/tests/rebalancer_loop.rs | 5 ++- 5 files changed, 68 insertions(+), 5 deletions(-) diff --git a/nodedb-cluster/src/rebalancer/driver.rs b/nodedb-cluster/src/rebalancer/driver.rs index 52b89bd9..2150a474 100644 --- a/nodedb-cluster/src/rebalancer/driver.rs +++ b/nodedb-cluster/src/rebalancer/driver.rs @@ -77,6 +77,11 @@ pub struct RebalancerLoopConfig { /// Plan computation config propagated to /// [`compute_load_based_plan`] on every tick. pub plan: RebalancerPlanConfig, + /// CPU utilization threshold (0.0–1.0) above which the + /// rebalancer pauses to avoid amplifying load. If ANY node in + /// the metrics snapshot exceeds this value, the sweep is skipped + /// and a STATUS event is logged. Default 0.80 (80%). 
+ pub backpressure_cpu_threshold: f64, } impl Default for RebalancerLoopConfig { @@ -84,6 +89,7 @@ impl Default for RebalancerLoopConfig { Self { interval: Duration::from_secs(30), plan: RebalancerPlanConfig::default(), + backpressure_cpu_threshold: 0.80, } } } @@ -173,6 +179,18 @@ impl RebalancerLoop { return; } }; + if let Some(hot) = metrics + .iter() + .find(|m| m.cpu_utilization > self.cfg.backpressure_cpu_threshold) + { + info!( + node_id = hot.node_id, + cpu = format!("{:.0}%", hot.cpu_utilization * 100.0), + threshold = format!("{:.0}%", self.cfg.backpressure_cpu_threshold * 100.0), + "rebalancer: back-pressure — cluster under load, skipping sweep" + ); + return; + } let plan = { let routing = self.routing.read().unwrap_or_else(|p| p.into_inner()); let topo = self.topology.read().unwrap_or_else(|p| p.into_inner()); @@ -273,6 +291,7 @@ mod tests { bytes_stored: bytes_mib * 1_048_576, writes_per_sec: w, reads_per_sec: r, + cpu_utilization: 0.0, } } @@ -289,7 +308,7 @@ mod tests { let rloop = Arc::new(RebalancerLoop::new( RebalancerLoopConfig { interval: Duration::from_millis(50), - plan: RebalancerPlanConfig::default(), + ..Default::default() }, metrics, disp_dyn, @@ -369,4 +388,40 @@ mod tests { let _ = tx.send(true); let _ = tokio::time::timeout(Duration::from_millis(500), handle).await; } + + #[tokio::test] + async fn sweep_skipped_under_cpu_backpressure() { + let metrics: Arc = Arc::new(StaticMetrics(vec![ + LoadMetrics { + node_id: 1, + vshards_led: 500, + bytes_stored: 5000 * 1_048_576, + writes_per_sec: 200.0, + reads_per_sec: 200.0, + cpu_utilization: 0.95, // above 80% threshold + }, + lm(2, 5, 5, 5.0, 5.0), + lm(3, 5, 5, 5.0, 5.0), + ])); + let dispatcher = RecordingDispatcher::new(); + let rloop = Arc::new(RebalancerLoop::new( + RebalancerLoopConfig { + interval: Duration::from_millis(50), + ..Default::default() + }, + metrics, + dispatcher.clone() as Arc, + Arc::new(AlwaysReadyGate), + routing_hot_on(1), + topo(&[1, 2, 3]), + )); + 
rloop.sweep_once().await; + for _ in 0..8 { + tokio::task::yield_now().await; + } + assert!( + dispatcher.take().is_empty(), + "dispatcher should not fire when cluster is under CPU backpressure" + ); + } } diff --git a/nodedb-cluster/src/rebalancer/metrics.rs b/nodedb-cluster/src/rebalancer/metrics.rs index 0a03b894..b9c9f5b5 100644 --- a/nodedb-cluster/src/rebalancer/metrics.rs +++ b/nodedb-cluster/src/rebalancer/metrics.rs @@ -28,6 +28,10 @@ pub struct LoadMetrics { pub writes_per_sec: f64, /// Reads per second (rolling average, caller-defined window). pub reads_per_sec: f64, + /// Per-core CPU utilization (0.0–1.0). Used by the + /// back-pressure gate to pause the rebalancer when the cluster + /// is already stressed. + pub cpu_utilization: f64, } /// Relative weights for the four load dimensions. Scaled linearly; @@ -89,6 +93,7 @@ mod tests { bytes_stored: bytes_mib * 1_048_576, writes_per_sec: w, reads_per_sec: r, + cpu_utilization: 0.0, } } diff --git a/nodedb-cluster/src/rebalancer/plan.rs b/nodedb-cluster/src/rebalancer/plan.rs index 5620e240..ae68712a 100644 --- a/nodedb-cluster/src/rebalancer/plan.rs +++ b/nodedb-cluster/src/rebalancer/plan.rs @@ -207,6 +207,7 @@ mod tests { bytes_stored: bytes_mib * 1_048_576, writes_per_sec: w, reads_per_sec: r, + cpu_utilization: 0.0, } } diff --git a/nodedb-cluster/tests/elastic_scaling.rs b/nodedb-cluster/tests/elastic_scaling.rs index f51fafd8..4574c721 100644 --- a/nodedb-cluster/tests/elastic_scaling.rs +++ b/nodedb-cluster/tests/elastic_scaling.rs @@ -17,7 +17,7 @@ use nodedb_cluster::error::Result; use nodedb_cluster::rebalance::PlannedMove; use nodedb_cluster::rebalancer::{ AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, MigrationDispatcher, - RebalancerKickHook, RebalancerLoop, RebalancerLoopConfig, RebalancerPlanConfig, + RebalancerKickHook, RebalancerLoop, RebalancerLoopConfig, }; use nodedb_cluster::routing::RoutingTable; use nodedb_cluster::swim::MemberState; @@ -77,6 +77,7 @@ fn lm(id: 
u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { bytes_stored: bytes_mib * 1_048_576, writes_per_sec: w, reads_per_sec: r, + cpu_utilization: 0.0, } } @@ -111,7 +112,7 @@ async fn add_node_triggers_rebalance_via_kick() { let rloop = Arc::new(RebalancerLoop::new( RebalancerLoopConfig { interval: Duration::from_secs(300), - plan: RebalancerPlanConfig::default(), + ..Default::default() }, provider.clone() as Arc, dispatcher.clone() as Arc, diff --git a/nodedb-cluster/tests/rebalancer_loop.rs b/nodedb-cluster/tests/rebalancer_loop.rs index 33ba6bcb..b32cf2b6 100644 --- a/nodedb-cluster/tests/rebalancer_loop.rs +++ b/nodedb-cluster/tests/rebalancer_loop.rs @@ -28,7 +28,7 @@ use nodedb_cluster::error::Result; use nodedb_cluster::rebalance::PlannedMove; use nodedb_cluster::rebalancer::{ AlwaysReadyGate, ElectionGate, LoadMetrics, LoadMetricsProvider, MigrationDispatcher, - RebalancerLoop, RebalancerLoopConfig, RebalancerPlanConfig, + RebalancerLoop, RebalancerLoopConfig, }; use nodedb_cluster::routing::RoutingTable; use nodedb_cluster::topology::{ClusterTopology, NodeInfo, NodeState}; @@ -95,6 +95,7 @@ fn lm(id: u64, v: u32, bytes_mib: u64, w: f64, r: f64) -> LoadMetrics { bytes_stored: bytes_mib * 1_048_576, writes_per_sec: w, reads_per_sec: r, + cpu_utilization: 0.0, } } @@ -121,7 +122,7 @@ async fn rebalancer_loop_dispatches_and_mutates_routing() { let rloop = Arc::new(RebalancerLoop::new( RebalancerLoopConfig { interval: Duration::from_millis(50), - plan: RebalancerPlanConfig::default(), + ..Default::default() }, metrics, dispatcher.clone() as Arc, From a01f73c2c81e6dee6192d2d82c2da1c9f9d19873 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 11:47:45 +0800 Subject: [PATCH 14/24] feat(pgwire): add nodedb.read_consistency session parameter Introduces `session/read_consistency.rs` with parsing and validation for the `nodedb.read_consistency` SET parameter. Accepted values are `strong`, `bounded_staleness:`, and `eventual`. 
Invalid values are rejected at SET time with SQLSTATE 22023 and a descriptive error message rather than silently ignored. --- .../server/pgwire/handler/session_cmds.rs | 13 ++ .../src/control/server/pgwire/session/mod.rs | 1 + .../server/pgwire/session/read_consistency.rs | 155 ++++++++++++++++++ 3 files changed, 169 insertions(+) create mode 100644 nodedb/src/control/server/pgwire/session/read_consistency.rs diff --git a/nodedb/src/control/server/pgwire/handler/session_cmds.rs b/nodedb/src/control/server/pgwire/handler/session_cmds.rs index e3848f2e..ac7baa64 100644 --- a/nodedb/src/control/server/pgwire/handler/session_cmds.rs +++ b/nodedb/src/control/server/pgwire/handler/session_cmds.rs @@ -67,6 +67,19 @@ impl NodeDbPgHandler { } } + if key == super::super::session::read_consistency::PARAM_KEY + && super::super::session::read_consistency::parse_value(&value).is_none() + { + return Err(PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), + "22023".to_owned(), + format!( + "invalid value for {}: '{value}'. Valid: strong, bounded_staleness:, eventual", + super::super::session::read_consistency::PARAM_KEY + ), + )))); + } + if key == "nodedb.tenant_id" && value.parse::().is_err() { return Err(PgWireError::UserError(Box::new(ErrorInfo::new( "ERROR".to_owned(), diff --git a/nodedb/src/control/server/pgwire/session/mod.rs b/nodedb/src/control/server/pgwire/session/mod.rs index 7ea726b5..91014cc8 100644 --- a/nodedb/src/control/server/pgwire/session/mod.rs +++ b/nodedb/src/control/server/pgwire/session/mod.rs @@ -2,6 +2,7 @@ mod cursor; pub mod cursor_spill; mod live; mod params; +pub mod read_consistency; mod state; mod store; pub mod temp_tables; diff --git a/nodedb/src/control/server/pgwire/session/read_consistency.rs b/nodedb/src/control/server/pgwire/session/read_consistency.rs new file mode 100644 index 00000000..526034aa --- /dev/null +++ b/nodedb/src/control/server/pgwire/session/read_consistency.rs @@ -0,0 +1,155 @@ +//! 
Session-level `ReadConsistency` — wire `SET` / `SHOW` for the +//! `default_read_consistency` session parameter. +//! +//! Accepted values: +//! +//! - `'strong'` +//! - `'bounded_staleness:<secs>'` or `'bounded_staleness:<secs>s'` +//! - `'eventual'` +//! +//! The value is stored as a plain string in the session parameter +//! map. This module provides the typed parse + accessor. + +use std::net::SocketAddr; +use std::time::Duration; + +use crate::types::ReadConsistency; + +use super::store::SessionStore; + +/// Session parameter key. +pub const PARAM_KEY: &str = "default_read_consistency"; + +/// Parse a user-supplied string into a `ReadConsistency`. Returns +/// `None` on unrecognised input so the caller can return a helpful +/// error message. +pub fn parse_value(value: &str) -> Option<ReadConsistency> { + let lower = value.trim().to_lowercase(); + match lower.as_str() { + "strong" => Some(ReadConsistency::Strong), + "eventual" => Some(ReadConsistency::Eventual), + _ => { + let stripped = lower.strip_prefix("bounded_staleness:")?; + let secs_str = stripped.trim_end_matches('s').trim(); + let secs: f64 = secs_str.parse().ok()?; + if secs <= 0.0 { + return None; + } + Some(ReadConsistency::BoundedStaleness(Duration::from_secs_f64( + secs, + ))) + } + } +} + +/// Format a `ReadConsistency` back into the canonical string form +/// so `SHOW default_read_consistency` returns something parseable. +pub fn format_value(rc: &ReadConsistency) -> String { + match rc { + ReadConsistency::Strong => "strong".to_string(), + ReadConsistency::Eventual => "eventual".to_string(), + ReadConsistency::BoundedStaleness(d) => { + format!("bounded_staleness:{}s", d.as_secs_f64()) + } + } +} + +impl SessionStore { + /// Resolve the effective `ReadConsistency` for a session. Falls + /// back to `Strong` if the parameter is unset or unparseable. 
+ pub fn read_consistency(&self, addr: &SocketAddr) -> ReadConsistency { + self.get_parameter(addr, PARAM_KEY) + .and_then(|v| parse_value(&v)) + .unwrap_or_default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_strong() { + assert_eq!(parse_value("strong"), Some(ReadConsistency::Strong)); + assert_eq!(parse_value("STRONG"), Some(ReadConsistency::Strong)); + } + + #[test] + fn parse_eventual() { + assert_eq!(parse_value("eventual"), Some(ReadConsistency::Eventual)); + } + + #[test] + fn parse_bounded_staleness_seconds() { + let rc = parse_value("bounded_staleness:5").unwrap(); + assert_eq!( + rc, + ReadConsistency::BoundedStaleness(Duration::from_secs(5)) + ); + } + + #[test] + fn parse_bounded_staleness_with_s_suffix() { + let rc = parse_value("bounded_staleness:5s").unwrap(); + assert_eq!( + rc, + ReadConsistency::BoundedStaleness(Duration::from_secs(5)) + ); + } + + #[test] + fn parse_bounded_staleness_fractional() { + let rc = parse_value("bounded_staleness:0.5s").unwrap(); + assert_eq!( + rc, + ReadConsistency::BoundedStaleness(Duration::from_millis(500)) + ); + } + + #[test] + fn parse_rejects_zero_staleness() { + assert!(parse_value("bounded_staleness:0").is_none()); + } + + #[test] + fn parse_rejects_garbage() { + assert!(parse_value("foobar").is_none()); + assert!(parse_value("").is_none()); + } + + #[test] + fn format_roundtrip_strong() { + let s = format_value(&ReadConsistency::Strong); + assert_eq!(parse_value(&s), Some(ReadConsistency::Strong)); + } + + #[test] + fn format_roundtrip_bounded() { + let rc = ReadConsistency::BoundedStaleness(Duration::from_secs(10)); + let s = format_value(&rc); + assert_eq!(parse_value(&s), Some(rc)); + } + + #[test] + fn format_roundtrip_eventual() { + let s = format_value(&ReadConsistency::Eventual); + assert_eq!(parse_value(&s), Some(ReadConsistency::Eventual)); + } + + #[test] + fn session_store_defaults_to_strong() { + let store = SessionStore::new(); + let addr: SocketAddr = 
"127.0.0.1:5432".parse().unwrap(); + store.ensure_session(addr); + assert_eq!(store.read_consistency(&addr), ReadConsistency::Strong); + } + + #[test] + fn session_store_reads_set_value() { + let store = SessionStore::new(); + let addr: SocketAddr = "127.0.0.1:5432".parse().unwrap(); + store.ensure_session(addr); + store.set_parameter(&addr, PARAM_KEY.to_string(), "eventual".to_string()); + assert_eq!(store.read_consistency(&addr), ReadConsistency::Eventual); + } +} From 9af5664051277c78aae97b2d55a764867d5ff550 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 11:47:56 +0800 Subject: [PATCH 15/24] feat(pgwire): implement pg_catalog virtual table handler Adds `pgwire/pg_catalog/` with a dispatcher and virtual table definitions that intercept queries against `pg_catalog` relations. This allows PostgreSQL-compatible clients and drivers that inspect catalog tables during connection setup to receive well-formed responses without requiring a full system catalog implementation. --- .../server/pgwire/pg_catalog/dispatch.rs | 95 ++++++ .../control/server/pgwire/pg_catalog/mod.rs | 19 ++ .../server/pgwire/pg_catalog/tables.rs | 270 ++++++++++++++++++ 3 files changed, 384 insertions(+) create mode 100644 nodedb/src/control/server/pgwire/pg_catalog/dispatch.rs create mode 100644 nodedb/src/control/server/pgwire/pg_catalog/mod.rs create mode 100644 nodedb/src/control/server/pgwire/pg_catalog/tables.rs diff --git a/nodedb/src/control/server/pgwire/pg_catalog/dispatch.rs b/nodedb/src/control/server/pgwire/pg_catalog/dispatch.rs new file mode 100644 index 00000000..86d481a1 --- /dev/null +++ b/nodedb/src/control/server/pgwire/pg_catalog/dispatch.rs @@ -0,0 +1,95 @@ +//! pg_catalog query interception and dispatch. 
+ +use pgwire::api::results::Response; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +use super::tables; + +/// Try to handle a SQL query as a pg_catalog virtual-table lookup. +/// +/// Returns `Some(Ok(response))` if the query targets a known +/// pg_catalog table, `None` if the query should fall through to the +/// normal planner. The `upper` argument is the uppercased SQL. +pub fn try_pg_catalog( + state: &SharedState, + identity: &AuthenticatedIdentity, + upper: &str, +) -> Option>> { + let table = extract_pg_catalog_table(upper)?; + let result = match table { + "pg_database" => tables::pg_database(), + "pg_namespace" => tables::pg_namespace(), + "pg_type" => tables::pg_type(), + "pg_class" => tables::pg_class(state, identity), + "pg_attribute" => tables::pg_attribute(state, identity), + "pg_index" => tables::pg_index(), + "pg_authid" => tables::pg_authid(state, identity), + _ => return None, + }; + Some(result) +} + +/// Extract the first `pg_catalog.` or bare `pg_
` +/// reference from a FROM clause. Returns the lowercase table name +/// if found. +fn extract_pg_catalog_table(upper: &str) -> Option<&'static str> { + let known = [ + "pg_database", + "pg_namespace", + "pg_type", + "pg_class", + "pg_attribute", + "pg_index", + "pg_authid", + ]; + for table in &known { + let qualified = format!("PG_CATALOG.{}", table.to_uppercase()); + let bare = table.to_uppercase(); + if upper.contains(&qualified) || upper.contains(&bare) { + return Some(table); + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extracts_qualified_table() { + let sql = "SELECT * FROM pg_catalog.pg_class WHERE relkind = 'r'"; + assert_eq!( + extract_pg_catalog_table(&sql.to_uppercase()), + Some("pg_class") + ); + } + + #[test] + fn extracts_bare_table() { + let sql = "SELECT oid, typname FROM pg_type"; + assert_eq!( + extract_pg_catalog_table(&sql.to_uppercase()), + Some("pg_type") + ); + } + + #[test] + fn no_match_for_regular_query() { + let sql = "SELECT * FROM users WHERE id = 1"; + assert_eq!(extract_pg_catalog_table(&sql.to_uppercase()), None); + } + + #[test] + fn handles_join_with_pg_catalog() { + let sql = + "SELECT c.oid FROM pg_class c JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid"; + assert_eq!( + extract_pg_catalog_table(&sql.to_uppercase()), + Some("pg_namespace") + ); + } +} diff --git a/nodedb/src/control/server/pgwire/pg_catalog/mod.rs b/nodedb/src/control/server/pgwire/pg_catalog/mod.rs new file mode 100644 index 00000000..01e74e69 --- /dev/null +++ b/nodedb/src/control/server/pgwire/pg_catalog/mod.rs @@ -0,0 +1,19 @@ +//! Minimal `pg_catalog` virtual-table emulation. +//! +//! Generic Postgres clients (DBeaver, pgAdmin, SQLAlchemy, psql's +//! `\dt`) issue `SELECT` queries against `pg_catalog.*` tables to +//! discover schemas, types, and tables. Without a response they +//! either error out or show an empty catalog. This module intercepts +//! 
those queries and returns rows synthesised from NodeDB's own +//! `SystemCatalog` and credential store. +//! +//! The interception is pattern-based: we extract the first +//! `pg_catalog.
` (or bare `pg_
`) reference from the +//! `FROM` clause and delegate to the matching virtual table handler. +//! The result always returns ALL rows with a fixed column schema — +//! clients that send `WHERE` clauses filter client-side. + +pub mod dispatch; +pub mod tables; + +pub use dispatch::try_pg_catalog; diff --git a/nodedb/src/control/server/pgwire/pg_catalog/tables.rs b/nodedb/src/control/server/pgwire/pg_catalog/tables.rs new file mode 100644 index 00000000..6e46e63e --- /dev/null +++ b/nodedb/src/control/server/pgwire/pg_catalog/tables.rs @@ -0,0 +1,270 @@ +//! Virtual table row generators for each pg_catalog table. + +use std::sync::Arc; + +use futures::stream; +use pgwire::api::results::{DataRowEncoder, QueryResponse, Response}; +use pgwire::error::PgWireResult; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::server::pgwire::types::{bool_field, int4_field, int8_field, text_field}; +use crate::control::state::SharedState; + +/// `pg_database` — one row: the current database. +pub fn pg_database() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("datname"), + text_field("datdba"), + text_field("encoding"), + ]); + let mut encoder = DataRowEncoder::new(schema.clone()); + encoder.encode_field(&1i64)?; + encoder.encode_field(&"nodedb")?; + encoder.encode_field(&"nodedb")?; + encoder.encode_field(&"UTF8")?; + let rows = vec![Ok(encoder.take_row())]; + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_namespace` — schemas: `public` + `pg_catalog`. 
+pub fn pg_namespace() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("nspname"), + int8_field("nspowner"), + ]); + let mut encoder = DataRowEncoder::new(schema.clone()); + let mut rows = Vec::new(); + + encoder.encode_field(&11i64)?; + encoder.encode_field(&"pg_catalog")?; + encoder.encode_field(&10i64)?; + rows.push(Ok(encoder.take_row())); + + encoder.encode_field(&2200i64)?; + encoder.encode_field(&"public")?; + encoder.encode_field(&10i64)?; + rows.push(Ok(encoder.take_row())); + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_type` — common Postgres type OIDs that client drivers need. +pub fn pg_type() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("typname"), + int8_field("typnamespace"), + int4_field("typlen"), + text_field("typtype"), + ]); + + let types: &[(i64, &str, i32, &str)] = &[ + (16, "bool", 1, "b"), + (20, "int8", 8, "b"), + (21, "int2", 2, "b"), + (23, "int4", 4, "b"), + (25, "text", -1, "b"), + (114, "json", -1, "b"), + (700, "float4", 4, "b"), + (701, "float8", 8, "b"), + (1043, "varchar", -1, "b"), + (1082, "date", 4, "b"), + (1114, "timestamp", 8, "b"), + (1184, "timestamptz", 8, "b"), + (2950, "uuid", 16, "b"), + (3802, "jsonb", -1, "b"), + ]; + + let mut rows = Vec::with_capacity(types.len()); + let mut encoder = DataRowEncoder::new(schema.clone()); + + for &(oid, name, len, typtype) in types { + encoder.encode_field(&oid)?; + encoder.encode_field(&name)?; + encoder.encode_field(&11i64)?; + encoder.encode_field(&len)?; + encoder.encode_field(&typtype)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_class` — one row per active collection (mapped as relation). 
+pub fn pg_class( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("relname"), + int8_field("relnamespace"), + text_field("relkind"), + int8_field("relowner"), + ]); + + let collections = load_collections(state, identity); + + let mut rows = Vec::with_capacity(collections.len()); + let mut encoder = DataRowEncoder::new(schema.clone()); + + for (i, coll) in collections.iter().enumerate() { + let oid = 16384i64 + i as i64; + encoder.encode_field(&oid)?; + encoder.encode_field(&coll.name.as_str())?; + encoder.encode_field(&2200i64)?; + encoder.encode_field(&"r")?; + encoder.encode_field(&10i64)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_attribute` — one row per field in strict-schema collections. +pub fn pg_attribute( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("attrelid"), + text_field("attname"), + int8_field("atttypid"), + int4_field("attnum"), + int4_field("attlen"), + bool_field("attnotnull"), + ]); + + let collections = load_collections(state, identity); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + for (i, coll) in collections.iter().enumerate() { + let rel_oid = 16384i64 + i as i64; + for (col_num, (field_name, field_type)) in coll.fields.iter().enumerate() { + let type_oid = field_type_to_oid(field_type); + encoder.encode_field(&rel_oid)?; + encoder.encode_field(&field_name.as_str())?; + encoder.encode_field(&type_oid)?; + encoder.encode_field(&((col_num + 1) as i32))?; + encoder.encode_field(&(-1i32))?; + encoder.encode_field(&false)?; + rows.push(Ok(encoder.take_row())); + } + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_index` — secondary indexes. 
+/// +/// Returns an empty result set with the correct schema. Structured +/// index metadata is not yet surfaced through `StoredCollection`; +/// once it is, this function will take `(state, identity)` and +/// populate rows from the catalog. +pub fn pg_index() -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("indexrelid"), + int8_field("indrelid"), + bool_field("indisunique"), + bool_field("indisprimary"), + ]); + + let rows: Vec> = Vec::new(); + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +/// `pg_authid` — users / roles. +pub fn pg_authid( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> PgWireResult> { + let schema = Arc::new(vec![ + int8_field("oid"), + text_field("rolname"), + bool_field("rolsuper"), + bool_field("rolcanlogin"), + ]); + + let mut rows = Vec::new(); + let mut encoder = DataRowEncoder::new(schema.clone()); + + let users = state.credentials.list_users(); + for (i, user) in users.iter().enumerate() { + let oid = 10i64 + i as i64; + let is_super = identity.is_superuser && user == &identity.username; + encoder.encode_field(&oid)?; + encoder.encode_field(&user.as_str())?; + encoder.encode_field(&is_super)?; + encoder.encode_field(&true)?; + rows.push(Ok(encoder.take_row())); + } + + Ok(vec![Response::Query(QueryResponse::new( + schema, + stream::iter(rows), + ))]) +} + +fn load_collections( + state: &SharedState, + identity: &AuthenticatedIdentity, +) -> Vec { + let Some(catalog) = state.credentials.catalog() else { + return Vec::new(); + }; + if identity.is_superuser { + catalog + .load_all_collections() + .unwrap_or_default() + .into_iter() + .filter(|c| c.is_active) + .collect() + } else { + catalog + .load_collections_for_tenant(identity.tenant_id.as_u32()) + .unwrap_or_default() + } +} + +fn field_type_to_oid(field_type: &str) -> i64 { + match field_type.to_lowercase().as_str() { + "bool" | "boolean" => 16, + "int" | "integer" | "int4" => 23, + "bigint" | "int8" => 20, 
+ "smallint" | "int2" => 21, + "float" | "float4" | "real" => 700, + "double" | "float8" => 701, + "text" | "string" => 25, + "varchar" => 1043, + "json" => 114, + "jsonb" => 3802, + "uuid" => 2950, + "date" => 1082, + "timestamp" => 1114, + "timestamptz" => 1184, + _ => 25, + } +} From 7097421b7245a95b19a8a04bcdd4220666a8d078 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 12:19:44 +0800 Subject: [PATCH 16/24] feat(cluster): add closed timestamp tracker and follower read gate Introduce ClosedTimestampTracker for advancing the closed timestamp used by bounded-staleness reads, and FollowerReadGate with ReadLevel to decide whether a follower can serve a read at a given LSN without forwarding to the leader. --- nodedb-cluster/src/closed_timestamp.rs | 123 +++++++++++++++++++++++ nodedb-cluster/src/follower_read.rs | 130 +++++++++++++++++++++++++ nodedb-cluster/src/lib.rs | 4 + 3 files changed, 257 insertions(+) create mode 100644 nodedb-cluster/src/closed_timestamp.rs create mode 100644 nodedb-cluster/src/follower_read.rs diff --git a/nodedb-cluster/src/closed_timestamp.rs b/nodedb-cluster/src/closed_timestamp.rs new file mode 100644 index 00000000..550fb9f2 --- /dev/null +++ b/nodedb-cluster/src/closed_timestamp.rs @@ -0,0 +1,123 @@ +//! Per-group closed-timestamp tracker. +//! +//! Every time a Raft group applies a committed entry, the applier +//! records the wall-clock instant as that group's "closed timestamp". +//! A follower whose closed timestamp for a group is within the +//! caller's staleness bound can serve reads locally — no gateway hop +//! to the leader. +//! +//! The tracker is intentionally simple: one `Instant` per group, +//! updated monotonically. There is no HLC or cross-node coordination +//! here — the closed timestamp is local to this node. Safety comes +//! from the fact that a follower's applied index can only advance +//! (Raft guarantees), so a read served at a given closed timestamp +//! 
sees a consistent prefix of the log. + +use std::collections::HashMap; +use std::sync::RwLock; +use std::time::{Duration, Instant}; + +/// Tracks the most recent apply instant per Raft group. +pub struct ClosedTimestampTracker { + groups: RwLock>, +} + +impl ClosedTimestampTracker { + pub fn new() -> Self { + Self { + groups: RwLock::new(HashMap::new()), + } + } + + /// Record that `group_id` just applied one or more entries. + /// Called by the raft-loop applier after each apply batch. + pub fn mark_applied(&self, group_id: u64) { + let mut g = self.groups.write().unwrap_or_else(|p| p.into_inner()); + g.insert(group_id, Instant::now()); + } + + /// Record that `group_id` just applied, using a caller-supplied + /// instant. Exposed for deterministic testing with paused time. + pub fn mark_applied_at(&self, group_id: u64, at: Instant) { + let mut g = self.groups.write().unwrap_or_else(|p| p.into_inner()); + g.insert(group_id, at); + } + + /// Check whether this node's replica of `group_id` has applied + /// recently enough that a read with `max_staleness` can be + /// served locally. + /// + /// Returns `false` if the group has never applied on this node + /// (no closed timestamp recorded). + pub fn is_fresh_enough(&self, group_id: u64, max_staleness: Duration) -> bool { + let g = self.groups.read().unwrap_or_else(|p| p.into_inner()); + match g.get(&group_id) { + Some(last) => last.elapsed() <= max_staleness, + None => false, + } + } + + /// Return the age of the closed timestamp for a group, or `None` + /// if the group has never applied on this node. Useful for + /// observability (metrics, SHOW commands). 
+ pub fn staleness(&self, group_id: u64) -> Option { + let g = self.groups.read().unwrap_or_else(|p| p.into_inner()); + g.get(&group_id).map(|last| last.elapsed()) + } +} + +impl Default for ClosedTimestampTracker { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn unknown_group_is_not_fresh() { + let tracker = ClosedTimestampTracker::new(); + assert!(!tracker.is_fresh_enough(99, Duration::from_secs(10))); + } + + #[test] + fn recently_applied_is_fresh() { + let tracker = ClosedTimestampTracker::new(); + tracker.mark_applied(1); + assert!(tracker.is_fresh_enough(1, Duration::from_secs(5))); + } + + #[test] + fn stale_group_is_not_fresh() { + let tracker = ClosedTimestampTracker::new(); + let old = Instant::now() - Duration::from_secs(30); + tracker.mark_applied_at(1, old); + assert!(!tracker.is_fresh_enough(1, Duration::from_secs(5))); + } + + #[test] + fn staleness_returns_none_for_unknown() { + let tracker = ClosedTimestampTracker::new(); + assert!(tracker.staleness(42).is_none()); + } + + #[test] + fn staleness_returns_age_for_known() { + let tracker = ClosedTimestampTracker::new(); + tracker.mark_applied(1); + let s = tracker.staleness(1).unwrap(); + assert!(s < Duration::from_millis(100)); + } + + #[test] + fn mark_applied_updates_monotonically() { + let tracker = ClosedTimestampTracker::new(); + let old = Instant::now() - Duration::from_secs(10); + tracker.mark_applied_at(1, old); + assert!(!tracker.is_fresh_enough(1, Duration::from_secs(5))); + tracker.mark_applied(1); + assert!(tracker.is_fresh_enough(1, Duration::from_secs(5))); + } +} diff --git a/nodedb-cluster/src/follower_read.rs b/nodedb-cluster/src/follower_read.rs new file mode 100644 index 00000000..16d0886e --- /dev/null +++ b/nodedb-cluster/src/follower_read.rs @@ -0,0 +1,130 @@ +//! Follower-read decision gate. +//! +//! [`FollowerReadGate`] answers a single question: "given the +//! 
session's `ReadConsistency` and the local node's role + closed +//! timestamp for the target Raft group, can this read be served +//! locally without forwarding to the leader?" +//! +//! ## Decision table +//! +//! | Consistency | Local role | Closed TS fresh? | Serve locally? | +//! |-----------------------|-------------|------------------|----------------| +//! | Strong | * | * | Only if leader | +//! | BoundedStaleness(d) | Follower | ≤ d | Yes | +//! | BoundedStaleness(d) | Follower | > d | No → forward | +//! | BoundedStaleness(d) | Leader | * | Yes | +//! | Eventual | * | * | Yes | +//! +//! The gate is stateless — it reads from shared handles to the +//! closed-timestamp tracker and the raft-status provider. + +use std::sync::Arc; +use std::time::Duration; + +use crate::closed_timestamp::ClosedTimestampTracker; + +/// Consistency level for a single read — mirrors the `ReadConsistency` +/// enum in the `nodedb` crate without coupling `nodedb-cluster` to it. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ReadLevel { + Strong, + BoundedStaleness(Duration), + Eventual, +} + +/// Answers "can this read be served locally?" +pub struct FollowerReadGate { + closed_ts: Arc, + /// Type-erased function that returns true if this node is the + /// leader for the given group. Injection seam — production wraps + /// `MultiRaft::group_statuses`, tests supply a closure. + is_leader_fn: Box bool + Send + Sync>, +} + +impl FollowerReadGate { + pub fn new( + closed_ts: Arc, + is_leader_fn: Box bool + Send + Sync>, + ) -> Self { + Self { + closed_ts, + is_leader_fn, + } + } + + /// Returns `true` if the read can be served from this node's + /// local replica without forwarding to the leader. 
+ pub fn can_serve_locally(&self, group_id: u64, level: ReadLevel) -> bool { + match level { + ReadLevel::Strong => (self.is_leader_fn)(group_id), + ReadLevel::Eventual => true, + ReadLevel::BoundedStaleness(max) => { + if (self.is_leader_fn)(group_id) { + return true; + } + self.closed_ts.is_fresh_enough(group_id, max) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn gate(leader_groups: &'static [u64]) -> FollowerReadGate { + FollowerReadGate::new( + Arc::new(ClosedTimestampTracker::new()), + Box::new(move |gid| leader_groups.contains(&gid)), + ) + } + + fn gate_with_tracker( + leader_groups: &'static [u64], + tracker: Arc, + ) -> FollowerReadGate { + FollowerReadGate::new(tracker, Box::new(move |gid| leader_groups.contains(&gid))) + } + + #[test] + fn strong_requires_leader() { + let g = gate(&[1]); + assert!(g.can_serve_locally(1, ReadLevel::Strong)); + assert!(!g.can_serve_locally(2, ReadLevel::Strong)); + } + + #[test] + fn eventual_always_local() { + let g = gate(&[]); + assert!(g.can_serve_locally(99, ReadLevel::Eventual)); + } + + #[test] + fn bounded_staleness_leader_always_local() { + let g = gate(&[1]); + assert!(g.can_serve_locally(1, ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } + + #[test] + fn bounded_staleness_follower_fresh_enough() { + let tracker = Arc::new(ClosedTimestampTracker::new()); + tracker.mark_applied(2); + let g = gate_with_tracker(&[], tracker); + assert!(g.can_serve_locally(2, ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } + + #[test] + fn bounded_staleness_follower_too_stale() { + let tracker = Arc::new(ClosedTimestampTracker::new()); + let old = std::time::Instant::now() - Duration::from_secs(30); + tracker.mark_applied_at(2, old); + let g = gate_with_tracker(&[], tracker); + assert!(!g.can_serve_locally(2, ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } + + #[test] + fn bounded_staleness_unknown_group_not_local() { + let g = gate(&[]); + assert!(!g.can_serve_locally(99, 
ReadLevel::BoundedStaleness(Duration::from_secs(5)))); + } +} diff --git a/nodedb-cluster/src/lib.rs b/nodedb-cluster/src/lib.rs index 440b3aee..8909a79d 100644 --- a/nodedb-cluster/src/lib.rs +++ b/nodedb-cluster/src/lib.rs @@ -1,6 +1,7 @@ pub mod bootstrap; pub mod catalog; pub mod circuit_breaker; +pub mod closed_timestamp; pub mod cluster_info; pub mod conf_change; pub mod cross_shard_txn; @@ -12,6 +13,7 @@ pub mod distributed_spatial; pub mod distributed_timeseries; pub mod distributed_vector; pub mod error; +pub mod follower_read; pub mod forward; pub mod ghost; pub mod ghost_sweeper; @@ -43,6 +45,7 @@ pub mod wire; pub use bootstrap::{ClusterConfig, ClusterState, JoinRetryPolicy, start_cluster}; pub use catalog::ClusterCatalog; +pub use closed_timestamp::ClosedTimestampTracker; pub use cluster_info::{ ClusterInfoSnapshot, ClusterObserver, GroupSnapshot, GroupStatusProvider, PeerSnapshot, }; @@ -52,6 +55,7 @@ pub use decommission::{ DecommissionSafetyError, MetadataProposer, check_can_decommission, plan_full_decommission, }; pub use error::{ClusterError, Result}; +pub use follower_read::{FollowerReadGate, ReadLevel}; pub use forward::{NoopPlanExecutor, PlanExecutor}; pub use ghost::{GhostStub, GhostTable}; pub use health::{HealthConfig, HealthMonitor}; From 6da002ffd132866786b6f0f04ab15695da9ffd27 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 12:19:56 +0800 Subject: [PATCH 17/24] fix(pgwire): guard against out-of-bounds access in DDL command parsers Replace direct index access with bounds-checked alternatives in the auth user DDL handler, and add early-return validation in GRANT/REVOKE ROLE handlers to return a well-formed syntax error instead of panicking when the command has too few parts. 
--- .../src/control/server/pgwire/ddl/auth_user_ddl.rs | 2 +- nodedb/src/control/server/pgwire/ddl/grant/role.rs | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs b/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs index 0bcc576a..7228ada9 100644 --- a/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs +++ b/nodedb/src/control/server/pgwire/ddl/auth_user_ddl.rs @@ -32,7 +32,7 @@ pub fn handle_auth_user( )); } - let upper0 = parts[0].to_uppercase(); + let upper0 = parts.first().map(|s| s.to_uppercase()).unwrap_or_default(); match upper0.as_str() { "DEACTIVATE" => deactivate_auth_user(state, identity, parts), "ALTER" => alter_auth_user_status(state, identity, parts), diff --git a/nodedb/src/control/server/pgwire/ddl/grant/role.rs b/nodedb/src/control/server/pgwire/ddl/grant/role.rs index cc902b36..2eef069a 100644 --- a/nodedb/src/control/server/pgwire/ddl/grant/role.rs +++ b/nodedb/src/control/server/pgwire/ddl/grant/role.rs @@ -57,6 +57,13 @@ pub fn grant_role( ) -> PgWireResult> { require_admin(identity, "grant roles")?; + if parts.len() < 5 { + return Err(sqlstate_error( + "42601", + "syntax: GRANT ROLE TO ", + )); + } + let role = parse_role(parts[2]); if matches!(role, Role::Superuser) && !identity.is_superuser { @@ -94,6 +101,13 @@ pub fn revoke_role( ) -> PgWireResult> { require_admin(identity, "revoke roles")?; + if parts.len() < 5 { + return Err(sqlstate_error( + "42601", + "syntax: REVOKE ROLE FROM ", + )); + } + let role = parse_role(parts[2]); if !parts[3].eq_ignore_ascii_case("FROM") { From 87926fc09dd70c74d0bcb50ffdd4078176c67682 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 13:47:25 +0800 Subject: [PATCH 18/24] feat(nodedb-sql): add typed DDL AST module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce `nodedb_sql::ddl_ast` — a typed representation of every NodeDB DDL statement as 
`NodedbStatement` enum variants, with a whitespace-token parser that converts raw SQL into the typed form. The AST gives DDL dispatch a compiler-checked match instead of string-prefix branching, so adding a new DDL command forces handling in every affected site at compile time rather than at runtime. --- nodedb-sql/src/ddl_ast/mod.rs | 16 + nodedb-sql/src/ddl_ast/parse.rs | 605 ++++++++++++++++++++++++++++ nodedb-sql/src/ddl_ast/statement.rs | 278 +++++++++++++ nodedb-sql/src/lib.rs | 1 + 4 files changed, 900 insertions(+) create mode 100644 nodedb-sql/src/ddl_ast/mod.rs create mode 100644 nodedb-sql/src/ddl_ast/parse.rs create mode 100644 nodedb-sql/src/ddl_ast/statement.rs diff --git a/nodedb-sql/src/ddl_ast/mod.rs b/nodedb-sql/src/ddl_ast/mod.rs new file mode 100644 index 00000000..7e115e38 --- /dev/null +++ b/nodedb-sql/src/ddl_ast/mod.rs @@ -0,0 +1,16 @@ +//! Typed AST for NodeDB-specific DDL statements. +//! +//! Every DDL command the system supports is represented as a variant +//! of [`NodedbStatement`]. The DDL router matches on this enum +//! instead of string prefixes, so the compiler catches missing +//! handlers when a new DDL is added. +//! +//! The parser ([`parse`]) converts raw SQL into a `NodedbStatement` +//! using whitespace-split token matching — the same technique the +//! old string-prefix router used, but producing a typed output. + +pub mod parse; +pub mod statement; + +pub use parse::parse; +pub use statement::NodedbStatement; diff --git a/nodedb-sql/src/ddl_ast/parse.rs b/nodedb-sql/src/ddl_ast/parse.rs new file mode 100644 index 00000000..78fb6635 --- /dev/null +++ b/nodedb-sql/src/ddl_ast/parse.rs @@ -0,0 +1,605 @@ +//! Parse raw SQL into a [`NodedbStatement`]. + +use super::statement::NodedbStatement; + +/// Try to parse a DDL statement from raw SQL. Returns `None` for +/// non-DDL queries (SELECT, INSERT, etc.) that should flow through +/// the normal planner. 
+pub fn parse(sql: &str) -> Option { + let trimmed = sql.trim(); + if trimmed.is_empty() { + return None; + } + let upper = trimmed.to_uppercase(); + let parts: Vec<&str> = trimmed.split_whitespace().collect(); + if parts.is_empty() { + return None; + } + + // ── Collection lifecycle ───────────────────────────────────── + if upper.starts_with("CREATE COLLECTION ") || upper.starts_with("CREATE TABLE ") { + let if_not_exists = upper.contains("IF NOT EXISTS"); + let name = extract_name_after_keyword(&parts, "COLLECTION") + .or_else(|| extract_name_after_keyword(&parts, "TABLE"))?; + return Some(NodedbStatement::CreateCollection { + name, + if_not_exists, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP COLLECTION ") || upper.starts_with("DROP TABLE ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "COLLECTION") + .or_else(|| extract_name_after_if_exists(&parts, "TABLE"))?; + return Some(NodedbStatement::DropCollection { name, if_exists }); + } + if upper.starts_with("ALTER COLLECTION ") || upper.starts_with("ALTER TABLE ") { + let name = extract_name_after_keyword(&parts, "COLLECTION") + .or_else(|| extract_name_after_keyword(&parts, "TABLE"))?; + return Some(NodedbStatement::AlterCollection { + name, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DESCRIBE ") && !upper.starts_with("DESCRIBE SEQUENCE") { + let name = parts.get(1)?.to_string(); + return Some(NodedbStatement::DescribeCollection { name }); + } + if upper == "\\D" || upper == "SHOW COLLECTIONS" || upper.starts_with("SHOW COLLECTIONS") { + return Some(NodedbStatement::ShowCollections); + } + + // ── Index ──────────────────────────────────────────────────── + if upper.starts_with("CREATE UNIQUE INDEX ") || upper.starts_with("CREATE UNIQUE IND") { + return Some(NodedbStatement::CreateIndex { + unique: true, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("CREATE INDEX ") { + return 
Some(NodedbStatement::CreateIndex { + unique: false, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP INDEX ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "INDEX")?; + return Some(NodedbStatement::DropIndex { + name, + collection: None, + if_exists, + }); + } + if upper.starts_with("SHOW INDEX") { + let collection = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowIndexes { collection }); + } + if upper.starts_with("REINDEX ") { + let collection = parts.get(1)?.to_string(); + return Some(NodedbStatement::Reindex { collection }); + } + + // ── Trigger ────────────────────────────────────────────────── + if upper.starts_with("CREATE ") && upper.contains("TRIGGER ") { + let or_replace = upper.contains("OR REPLACE"); + let deferred = upper.contains("DEFERRED"); + let sync = upper.contains("SYNC"); + return Some(NodedbStatement::CreateTrigger { + or_replace, + deferred, + sync, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP TRIGGER ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "TRIGGER")?; + let collection = extract_after_keyword(&parts, "ON").unwrap_or_default(); + return Some(NodedbStatement::DropTrigger { + name, + collection, + if_exists, + }); + } + if upper.starts_with("ALTER TRIGGER ") { + return Some(NodedbStatement::AlterTrigger { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW TRIGGERS") { + let collection = if upper.starts_with("SHOW TRIGGERS ON ") { + parts.get(3).map(|s| s.to_string()) + } else { + None + }; + return Some(NodedbStatement::ShowTriggers { collection }); + } + + // ── Schedule ───────────────────────────────────────────────── + if upper.starts_with("CREATE SCHEDULE ") { + return Some(NodedbStatement::CreateSchedule { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP SCHEDULE ") { + let if_exists = upper.contains("IF 
EXISTS"); + let name = extract_name_after_if_exists(&parts, "SCHEDULE")?; + return Some(NodedbStatement::DropSchedule { name, if_exists }); + } + if upper.starts_with("ALTER SCHEDULE ") { + return Some(NodedbStatement::AlterSchedule { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW SCHEDULE HISTORY ") { + let name = parts.get(3)?.to_string(); + return Some(NodedbStatement::ShowScheduleHistory { name }); + } + if upper == "SHOW SCHEDULES" || upper.starts_with("SHOW SCHEDULES") { + return Some(NodedbStatement::ShowSchedules); + } + + // ── Sequence ───────────────────────────────────────────────── + if upper.starts_with("CREATE SEQUENCE ") { + let if_not_exists = upper.contains("IF NOT EXISTS"); + let name = extract_name_after_if_exists(&parts, "SEQUENCE")?; + return Some(NodedbStatement::CreateSequence { + name, + if_not_exists, + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP SEQUENCE ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "SEQUENCE")?; + return Some(NodedbStatement::DropSequence { name, if_exists }); + } + if upper.starts_with("ALTER SEQUENCE ") { + return Some(NodedbStatement::AlterSequence { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DESCRIBE SEQUENCE ") { + let name = parts.get(2)?.to_string(); + return Some(NodedbStatement::DescribeSequence { name }); + } + if upper == "SHOW SEQUENCES" || upper.starts_with("SHOW SEQUENCES") { + return Some(NodedbStatement::ShowSequences); + } + + // ── Alert ──────────────────────────────────────────────────── + if upper.starts_with("CREATE ALERT ") { + return Some(NodedbStatement::CreateAlert { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP ALERT ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "ALERT")?; + return Some(NodedbStatement::DropAlert { name, if_exists }); + } + if upper.starts_with("ALTER ALERT ") { + return 
Some(NodedbStatement::AlterAlert { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW ALERT STATUS ") { + let name = parts.get(3)?.to_string(); + return Some(NodedbStatement::ShowAlertStatus { name }); + } + if upper.starts_with("SHOW ALERT") && !upper.starts_with("SHOW ALERT STATUS") { + return Some(NodedbStatement::ShowAlerts); + } + + // ── Retention policy ───────────────────────────────────────── + if upper.starts_with("CREATE RETENTION POLICY ") { + return Some(NodedbStatement::CreateRetentionPolicy { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP RETENTION POLICY ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "POLICY")?; + return Some(NodedbStatement::DropRetentionPolicy { name, if_exists }); + } + if upper.starts_with("ALTER RETENTION POLICY ") { + return Some(NodedbStatement::AlterRetentionPolicy { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW RETENTION POLIC") { + return Some(NodedbStatement::ShowRetentionPolicies); + } + + // ── Cluster admin ──────────────────────────────────────────── + if upper.starts_with("SHOW CLUSTER") { + return Some(NodedbStatement::ShowCluster); + } + if upper.starts_with("SHOW MIGRATIONS") { + return Some(NodedbStatement::ShowMigrations); + } + if upper.starts_with("SHOW RANGES") { + return Some(NodedbStatement::ShowRanges); + } + if upper.starts_with("SHOW ROUTING") { + return Some(NodedbStatement::ShowRouting); + } + if upper.starts_with("SHOW SCHEMA VERSION") { + return Some(NodedbStatement::ShowSchemaVersion); + } + if upper.starts_with("SHOW PEER HEALTH") { + return Some(NodedbStatement::ShowPeerHealth); + } + if upper.starts_with("REBALANCE") { + return Some(NodedbStatement::Rebalance); + } + if upper.starts_with("SHOW RAFT GROUP ") { + let id = parts.get(3)?.to_string(); + return Some(NodedbStatement::ShowRaftGroup { group_id: id }); + } + if upper.starts_with("SHOW RAFT GROUPS") || 
upper.starts_with("SHOW RAFT") { + return Some(NodedbStatement::ShowRaftGroups); + } + if upper.starts_with("ALTER RAFT GROUP ") { + return Some(NodedbStatement::AlterRaftGroup { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("REMOVE NODE ") { + let id = parts.get(2)?.to_string(); + return Some(NodedbStatement::RemoveNode { node_id: id }); + } + if upper.starts_with("SHOW NODE ") { + let id = parts.get(2)?.to_string(); + return Some(NodedbStatement::ShowNode { node_id: id }); + } + if upper.starts_with("SHOW NODES") { + return Some(NodedbStatement::ShowNodes); + } + + // ── Maintenance ────────────────────────────────────────────── + if upper.starts_with("ANALYZE") { + let collection = parts.get(1).map(|s| s.to_string()); + return Some(NodedbStatement::Analyze { collection }); + } + if upper.starts_with("COMPACT ") { + let collection = parts.get(1)?.to_string(); + return Some(NodedbStatement::Compact { collection }); + } + if upper.starts_with("SHOW COMPACTION ST") { + return Some(NodedbStatement::ShowCompactionStatus); + } + if upper.starts_with("SHOW STORAGE") { + let collection = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowStorage { collection }); + } + + // ── Backup / restore ───────────────────────────────────────── + if upper.starts_with("BACKUP TENANT ") { + return Some(NodedbStatement::BackupTenant { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("RESTORE TENANT ") { + let dry_run = upper.ends_with(" DRY RUN") || upper.ends_with(" DRYRUN"); + return Some(NodedbStatement::RestoreTenant { + dry_run, + raw_sql: trimmed.to_string(), + }); + } + + // ── User / auth ────────────────────────────────────────────── + if upper.starts_with("CREATE USER ") { + return Some(NodedbStatement::CreateUser { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP USER ") { + let username = parts.get(2)?.to_string(); + return Some(NodedbStatement::DropUser { username }); + } + if 
upper.starts_with("ALTER USER ") { + return Some(NodedbStatement::AlterUser { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW USERS") { + return Some(NodedbStatement::ShowUsers); + } + if upper.starts_with("GRANT ROLE ") { + return Some(NodedbStatement::GrantRole { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("REVOKE ROLE ") { + return Some(NodedbStatement::RevokeRole { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("GRANT ") { + return Some(NodedbStatement::GrantPermission { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("REVOKE ") { + return Some(NodedbStatement::RevokePermission { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("SHOW PERMISSIONS") { + let collection = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowPermissions { collection }); + } + if upper.starts_with("SHOW GRANTS") { + let username = parts.get(2).map(|s| s.to_string()); + return Some(NodedbStatement::ShowGrants { username }); + } + if upper.starts_with("SHOW TENANTS") { + return Some(NodedbStatement::ShowTenants); + } + if upper.starts_with("SHOW AUDIT") { + return Some(NodedbStatement::ShowAuditLog); + } + if upper.starts_with("SHOW CONSTRAINTS ") { + let collection = parts.get(2)?.to_string(); + return Some(NodedbStatement::ShowConstraints { collection }); + } + if upper.starts_with("SHOW TYPEGUARD") { + let collection = parts.get(2)?.to_string(); + return Some(NodedbStatement::ShowTypeGuards { collection }); + } + + // ── Change stream ──────────────────────────────────────────── + if upper.starts_with("CREATE CHANGE STREAM ") { + return Some(NodedbStatement::CreateChangeStream { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP CHANGE STREAM ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "STREAM")?; + return Some(NodedbStatement::DropChangeStream { name, if_exists }); + } + + // ── RLS 
────────────────────────────────────────────────────── + if upper.starts_with("CREATE RLS POLICY ") { + return Some(NodedbStatement::CreateRlsPolicy { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP RLS POLICY ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "POLICY")?; + let collection = extract_after_keyword(&parts, "ON").unwrap_or_default(); + return Some(NodedbStatement::DropRlsPolicy { + name, + collection, + if_exists, + }); + } + if upper.starts_with("SHOW RLS POLI") { + let collection = parts.get(3).map(|s| s.to_string()); + return Some(NodedbStatement::ShowRlsPolicies { collection }); + } + + // ── Materialized view ──────────────────────────────────────── + if upper.starts_with("CREATE MATERIALIZED VIEW ") { + return Some(NodedbStatement::CreateMaterializedView { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP MATERIALIZED VIEW ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "VIEW")?; + return Some(NodedbStatement::DropMaterializedView { name, if_exists }); + } + + // ── Continuous aggregate ───────────────────────────────────── + if upper.starts_with("CREATE CONTINUOUS AGGREGATE ") { + return Some(NodedbStatement::CreateContinuousAggregate { + raw_sql: trimmed.to_string(), + }); + } + if upper.starts_with("DROP CONTINUOUS AGGREGATE ") { + let if_exists = upper.contains("IF EXISTS"); + let name = extract_name_after_if_exists(&parts, "AGGREGATE")?; + return Some(NodedbStatement::DropContinuousAggregate { name, if_exists }); + } + + None +} + +/// Extract the object name that follows a keyword (e.g. "COLLECTION" +/// in "CREATE COLLECTION users ..."). Handles IF NOT EXISTS by +/// skipping those tokens. 
+fn extract_name_after_keyword(parts: &[&str], keyword: &str) -> Option { + let kw_upper = keyword.to_uppercase(); + let pos = parts.iter().position(|p| p.to_uppercase() == kw_upper)?; + let mut idx = pos + 1; + // Skip IF NOT EXISTS tokens. + if parts.get(idx).map(|s| s.to_uppercase()) == Some("IF".to_string()) { + idx += 1; // NOT + if parts.get(idx).map(|s| s.to_uppercase()) == Some("NOT".to_string()) { + idx += 1; // EXISTS + } + if parts.get(idx).map(|s| s.to_uppercase()) == Some("EXISTS".to_string()) { + idx += 1; + } + } + parts.get(idx).map(|s| s.to_string()) +} + +/// Extract the object name for DROP-style commands where IF EXISTS +/// may appear between the keyword and the name. +fn extract_name_after_if_exists(parts: &[&str], keyword: &str) -> Option { + extract_name_after_keyword(parts, keyword) +} + +/// Extract the token after a keyword like "ON" or "TO". +fn extract_after_keyword(parts: &[&str], keyword: &str) -> Option { + let kw_upper = keyword.to_uppercase(); + let pos = parts.iter().position(|p| p.to_uppercase() == kw_upper)?; + parts.get(pos + 1).map(|s| s.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_create_collection() { + let stmt = parse("CREATE COLLECTION users (id INT, name TEXT)").unwrap(); + match stmt { + NodedbStatement::CreateCollection { + name, + if_not_exists, + .. + } => { + assert_eq!(name, "users"); + assert!(!if_not_exists); + } + other => panic!("expected CreateCollection, got {other:?}"), + } + } + + #[test] + fn parse_create_collection_if_not_exists() { + let stmt = parse("CREATE COLLECTION IF NOT EXISTS users").unwrap(); + match stmt { + NodedbStatement::CreateCollection { + name, + if_not_exists, + .. 
+ } => { + assert_eq!(name, "users"); + assert!(if_not_exists); + } + other => panic!("expected CreateCollection, got {other:?}"), + } + } + + #[test] + fn parse_drop_collection() { + let stmt = parse("DROP COLLECTION users").unwrap(); + assert_eq!( + stmt, + NodedbStatement::DropCollection { + name: "users".into(), + if_exists: false, + } + ); + } + + #[test] + fn parse_drop_collection_if_exists() { + let stmt = parse("DROP COLLECTION IF EXISTS users").unwrap(); + assert_eq!( + stmt, + NodedbStatement::DropCollection { + name: "users".into(), + if_exists: true, + } + ); + } + + #[test] + fn parse_show_nodes() { + assert_eq!(parse("SHOW NODES"), Some(NodedbStatement::ShowNodes)); + } + + #[test] + fn parse_show_cluster() { + assert_eq!(parse("SHOW CLUSTER"), Some(NodedbStatement::ShowCluster)); + } + + #[test] + fn parse_create_trigger() { + let stmt = parse("CREATE OR REPLACE SYNC TRIGGER on_insert ...").unwrap(); + match stmt { + NodedbStatement::CreateTrigger { + or_replace, + sync, + deferred, + .. + } => { + assert!(or_replace); + assert!(sync); + assert!(!deferred); + } + other => panic!("expected CreateTrigger, got {other:?}"), + } + } + + #[test] + fn parse_drop_index_if_exists() { + let stmt = parse("DROP INDEX IF EXISTS idx_name").unwrap(); + match stmt { + NodedbStatement::DropIndex { + name, if_exists, .. 
+ } => { + assert_eq!(name, "idx_name"); + assert!(if_exists); + } + other => panic!("expected DropIndex, got {other:?}"), + } + } + + #[test] + fn parse_analyze() { + assert_eq!( + parse("ANALYZE users"), + Some(NodedbStatement::Analyze { + collection: Some("users".into()), + }) + ); + assert_eq!( + parse("ANALYZE"), + Some(NodedbStatement::Analyze { collection: None }) + ); + } + + #[test] + fn non_ddl_returns_none() { + assert!(parse("SELECT * FROM users").is_none()); + assert!(parse("INSERT INTO users VALUES (1)").is_none()); + } + + #[test] + fn parse_grant_role() { + let stmt = parse("GRANT ROLE admin TO alice").unwrap(); + match stmt { + NodedbStatement::GrantRole { raw_sql } => { + assert!(raw_sql.contains("admin")); + } + other => panic!("expected GrantRole, got {other:?}"), + } + } + + #[test] + fn parse_create_sequence_if_not_exists() { + let stmt = parse("CREATE SEQUENCE IF NOT EXISTS my_seq START 1").unwrap(); + match stmt { + NodedbStatement::CreateSequence { + name, + if_not_exists, + .. + } => { + assert_eq!(name, "my_seq"); + assert!(if_not_exists); + } + other => panic!("expected CreateSequence, got {other:?}"), + } + } + + #[test] + fn parse_restore_dry_run() { + let stmt = parse("RESTORE TENANT 1 FROM '/tmp/backup' DRY RUN").unwrap(); + match stmt { + NodedbStatement::RestoreTenant { dry_run, .. } => { + assert!(dry_run); + } + other => panic!("expected RestoreTenant, got {other:?}"), + } + } +} diff --git a/nodedb-sql/src/ddl_ast/statement.rs b/nodedb-sql/src/ddl_ast/statement.rs new file mode 100644 index 00000000..30ee3db7 --- /dev/null +++ b/nodedb-sql/src/ddl_ast/statement.rs @@ -0,0 +1,278 @@ +//! The [`NodedbStatement`] enum — one variant per DDL command. + +/// Typed representation of every NodeDB DDL statement. +/// +/// Handlers receive a fully-parsed variant instead of raw `&[&str]` +/// parts, eliminating array-index panics and enabling exhaustive +/// match coverage for new DDL commands. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum NodedbStatement { + // ── Collection lifecycle ───────────────────────────────────── + CreateCollection { + name: String, + if_not_exists: bool, + raw_sql: String, + }, + DropCollection { + name: String, + if_exists: bool, + }, + AlterCollection { + name: String, + raw_sql: String, + }, + DescribeCollection { + name: String, + }, + ShowCollections, + + // ── Index ──────────────────────────────────────────────────── + CreateIndex { + unique: bool, + raw_sql: String, + }, + DropIndex { + name: String, + collection: Option, + if_exists: bool, + }, + ShowIndexes { + collection: Option, + }, + Reindex { + collection: String, + }, + + // ── Trigger ────────────────────────────────────────────────── + CreateTrigger { + or_replace: bool, + deferred: bool, + sync: bool, + raw_sql: String, + }, + DropTrigger { + name: String, + collection: String, + if_exists: bool, + }, + AlterTrigger { + raw_sql: String, + }, + ShowTriggers { + collection: Option, + }, + + // ── Schedule ───────────────────────────────────────────────── + CreateSchedule { + raw_sql: String, + }, + DropSchedule { + name: String, + if_exists: bool, + }, + AlterSchedule { + raw_sql: String, + }, + ShowSchedules, + ShowScheduleHistory { + name: String, + }, + + // ── Sequence ───────────────────────────────────────────────── + CreateSequence { + name: String, + if_not_exists: bool, + raw_sql: String, + }, + DropSequence { + name: String, + if_exists: bool, + }, + AlterSequence { + raw_sql: String, + }, + DescribeSequence { + name: String, + }, + ShowSequences, + + // ── Alert ──────────────────────────────────────────────────── + CreateAlert { + raw_sql: String, + }, + DropAlert { + name: String, + if_exists: bool, + }, + AlterAlert { + raw_sql: String, + }, + ShowAlerts, + ShowAlertStatus { + name: String, + }, + + // ── Retention policy ───────────────────────────────────────── + CreateRetentionPolicy { + raw_sql: String, + }, + DropRetentionPolicy { + name: 
String, + if_exists: bool, + }, + AlterRetentionPolicy { + raw_sql: String, + }, + ShowRetentionPolicies, + + // ── Change stream ──────────────────────────────────────────── + CreateChangeStream { + raw_sql: String, + }, + DropChangeStream { + name: String, + if_exists: bool, + }, + AlterChangeStream { + raw_sql: String, + }, + ShowChangeStreams, + + // ── Consumer group ─────────────────────────────────────────── + CreateConsumerGroup { + raw_sql: String, + }, + DropConsumerGroup { + name: String, + stream: String, + if_exists: bool, + }, + ShowConsumerGroups { + stream: Option, + }, + + // ── RLS policy ─────────────────────────────────────────────── + CreateRlsPolicy { + raw_sql: String, + }, + DropRlsPolicy { + name: String, + collection: String, + if_exists: bool, + }, + ShowRlsPolicies { + collection: Option, + }, + + // ── Materialized view ──────────────────────────────────────── + CreateMaterializedView { + raw_sql: String, + }, + DropMaterializedView { + name: String, + if_exists: bool, + }, + ShowMaterializedViews, + + // ── Continuous aggregate ───────────────────────────────────── + CreateContinuousAggregate { + raw_sql: String, + }, + DropContinuousAggregate { + name: String, + if_exists: bool, + }, + ShowContinuousAggregates, + + // ── Backup / restore ───────────────────────────────────────── + BackupTenant { + raw_sql: String, + }, + RestoreTenant { + dry_run: bool, + raw_sql: String, + }, + + // ── Cluster admin ──────────────────────────────────────────── + ShowNodes, + ShowNode { + node_id: String, + }, + RemoveNode { + node_id: String, + }, + ShowCluster, + ShowMigrations, + ShowRanges, + ShowRouting, + ShowSchemaVersion, + ShowPeerHealth, + Rebalance, + ShowRaftGroups, + ShowRaftGroup { + group_id: String, + }, + AlterRaftGroup { + raw_sql: String, + }, + + // ── Maintenance ────────────────────────────────────────────── + Analyze { + collection: Option, + }, + Compact { + collection: String, + }, + ShowStorage { + collection: Option, + }, + 
ShowCompactionStatus, + + // ── User / auth / grant ────────────────────────────────────── + CreateUser { + raw_sql: String, + }, + DropUser { + username: String, + }, + AlterUser { + raw_sql: String, + }, + ShowUsers, + GrantRole { + raw_sql: String, + }, + RevokeRole { + raw_sql: String, + }, + GrantPermission { + raw_sql: String, + }, + RevokePermission { + raw_sql: String, + }, + ShowPermissions { + collection: Option, + }, + ShowGrants { + username: Option, + }, + + // ── Miscellaneous ──────────────────────────────────────────── + ShowTenants, + ShowAuditLog, + ShowConstraints { + collection: String, + }, + ShowTypeGuards { + collection: String, + }, + + /// Catch-all for DDL-like commands not yet promoted to their + /// own variant. Preserves the raw SQL for the legacy dispatch + /// path so new variants can be added incrementally without + /// breaking existing handlers. + Other { + raw_sql: String, + }, +} diff --git a/nodedb-sql/src/lib.rs b/nodedb-sql/src/lib.rs index 269d8ea4..4b614fe2 100644 --- a/nodedb-sql/src/lib.rs +++ b/nodedb-sql/src/lib.rs @@ -9,6 +9,7 @@ //! ``` pub mod catalog; +pub mod ddl_ast; pub mod engine_rules; pub mod error; pub mod functions; From 72eaecb006eccdab1f9363a9339b2caef2d8184a Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 13:47:34 +0800 Subject: [PATCH 19/24] feat(nodedb-cluster): add MetadataEntry::Batch for atomic DDL replication Add a `Batch` variant to `MetadataEntry` that wraps a sequence of sub-entries under a single Raft log index. The applier recurses into batch entries via `cascade_live_state`, and the cache applies each sub-entry in order. This ensures that a transactional DDL block (`BEGIN; CREATE ...; COMMIT;`) either commits fully or not at all across the cluster. 
--- nodedb-cluster/src/metadata_group/applier.rs | 22 ++++++++++++++------ nodedb-cluster/src/metadata_group/cache.rs | 5 +++++ nodedb-cluster/src/metadata_group/entry.rs | 8 +++++++ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/nodedb-cluster/src/metadata_group/applier.rs b/nodedb-cluster/src/metadata_group/applier.rs index 8f5db1db..46d59549 100644 --- a/nodedb-cluster/src/metadata_group/applier.rs +++ b/nodedb-cluster/src/metadata_group/applier.rs @@ -108,6 +108,21 @@ impl CacheApplier { } } + /// Cascade live-state mutations for a committed entry. Handles + /// `Batch` by recursing into each sub-entry. + fn cascade_live_state(&self, entry: &MetadataEntry) { + match entry { + MetadataEntry::TopologyChange(change) => self.apply_topology_change(change), + MetadataEntry::RoutingChange(change) => self.apply_routing_change(change), + MetadataEntry::Batch { entries } => { + for sub in entries { + self.cascade_live_state(sub); + } + } + _ => {} + } + } + /// Mutate the live routing handle (if attached) in response to /// a committed `RoutingChange`. fn apply_routing_change(&self, change: &RoutingChange) { @@ -152,12 +167,7 @@ impl MetadataApplier for CacheApplier { match decode_entry(data) { Ok(entry) => { guard.apply(*index, &entry); - // Cascade to live state (if attached). - match &entry { - MetadataEntry::TopologyChange(change) => self.apply_topology_change(change), - MetadataEntry::RoutingChange(change) => self.apply_routing_change(change), - _ => {} - } + self.cascade_live_state(&entry); } Err(e) => warn!(index = *index, error = %e, "metadata decode failed"), } diff --git a/nodedb-cluster/src/metadata_group/cache.rs b/nodedb-cluster/src/metadata_group/cache.rs index 24f7a4ba..23ae959c 100644 --- a/nodedb-cluster/src/metadata_group/cache.rs +++ b/nodedb-cluster/src/metadata_group/cache.rs @@ -106,6 +106,11 @@ impl MetadataCache { } } MetadataEntry::DescriptorDrainEnd { .. 
} => {} + MetadataEntry::Batch { entries } => { + for sub in entries { + self.apply(index, sub); + } + } } } } diff --git a/nodedb-cluster/src/metadata_group/entry.rs b/nodedb-cluster/src/metadata_group/entry.rs index c751796e..8c8c18a2 100644 --- a/nodedb-cluster/src/metadata_group/entry.rs +++ b/nodedb-cluster/src/metadata_group/entry.rs @@ -39,6 +39,14 @@ pub enum MetadataEntry { payload: Vec, }, + /// Atomic batch of metadata entries proposed by a transactional + /// DDL session (`BEGIN; CREATE ...; CREATE ...; COMMIT;`). The + /// applier unpacks and applies each sub-entry in order at a + /// single raft log index, so either all commit or none do. + Batch { + entries: Vec, + }, + // ── Topology / routing ───────────────────────────────────────────── TopologyChange(TopologyChange), RoutingChange(RoutingChange), From a451bc5587bd1e87fdeb1157b660c4a69d0e4bb2 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 13:47:50 +0800 Subject: [PATCH 20/24] feat(pgwire): implement transactional DDL with AST-based dispatch Wire transactional DDL semantics through the pgwire session layer: - `ddl_buffer`: thread-local buffer that intercepts `propose_catalog_entry` calls while a BEGIN block is active, accumulating encoded payloads instead of proposing them immediately. - `transaction_cmds`: COMMIT flushes the buffer as a single `MetadataEntry::Batch` proposal; ROLLBACK discards without proposing. - `metadata_proposer`: checks the thread-local buffer before proposing, returning early when a transaction is active. - `ast` + router fast path: parse DDL once into `NodedbStatement`, handle `IF [NOT] EXISTS` at dispatch before falling through to legacy string-prefix handlers. 
--- nodedb/src/control/metadata_proposer.rs | 9 + .../control/server/pgwire/ddl/router/ast.rs | 231 ++++++++++++++++++ .../control/server/pgwire/ddl/router/mod.rs | 13 + .../server/pgwire/handler/transaction_cmds.rs | 31 +++ .../server/pgwire/session/ddl_buffer.rs | 119 +++++++++ .../src/control/server/pgwire/session/mod.rs | 1 + 6 files changed, 404 insertions(+) create mode 100644 nodedb/src/control/server/pgwire/ddl/router/ast.rs create mode 100644 nodedb/src/control/server/pgwire/session/ddl_buffer.rs diff --git a/nodedb/src/control/metadata_proposer.rs b/nodedb/src/control/metadata_proposer.rs index 8a8314d5..6077a92f 100644 --- a/nodedb/src/control/metadata_proposer.rs +++ b/nodedb/src/control/metadata_proposer.rs @@ -176,6 +176,15 @@ pub fn propose_catalog_entry_with_timeout( } let payload = catalog_entry::encode(entry)?; + + // DDL transaction buffer: if a transactional DDL session is + // active on this thread (BEGIN ... COMMIT), buffer the payload + // instead of proposing immediately. The buffered entries will + // be proposed as a single MetadataEntry::Batch at COMMIT time. + if crate::control::server::pgwire::session::ddl_buffer::try_buffer(payload.clone()) { + return Ok(0); + } + let metadata_entry = MetadataEntry::CatalogDdl { payload }; let raw = encode_entry(&metadata_entry).map_err(|e| Error::Config { detail: format!("metadata entry encode: {e}"), diff --git a/nodedb/src/control/server/pgwire/ddl/router/ast.rs b/nodedb/src/control/server/pgwire/ddl/router/ast.rs new file mode 100644 index 00000000..8793d904 --- /dev/null +++ b/nodedb/src/control/server/pgwire/ddl/router/ast.rs @@ -0,0 +1,231 @@ +//! AST-based DDL dispatch — typed fast path. +//! +//! Runs before the legacy string-prefix routers. Handles +//! `IF [NOT] EXISTS` at the dispatch level so individual handlers +//! don't need to check. Falls through to legacy dispatch for +//! `Other` variants and for statements where the typed path +//! delegates to the existing handler (via `raw_sql`). 
+ +use pgwire::api::results::{Response, Tag}; +use pgwire::error::PgWireResult; + +use nodedb_sql::ddl_ast::NodedbStatement; + +use crate::control::security::identity::AuthenticatedIdentity; +use crate::control::state::SharedState; + +/// Try to dispatch a parsed `NodedbStatement`. Returns `Some` if +/// fully handled, `None` if the statement should fall through to +/// the legacy dispatch. +pub(super) fn try_dispatch( + state: &SharedState, + identity: &AuthenticatedIdentity, + stmt: &NodedbStatement, +) -> Option>> { + match stmt { + // ── IF NOT EXISTS: swallow duplicate-creation errors ────── + NodedbStatement::CreateCollection { + name, + if_not_exists: true, + .. + } => { + if collection_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("CREATE COLLECTION"))])); + } + None // fall through to legacy CREATE handler + } + + NodedbStatement::CreateSequence { + name, + if_not_exists: true, + .. + } => { + if sequence_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("CREATE SEQUENCE"))])); + } + None + } + + // ── IF EXISTS: swallow not-found errors on DROP ────────── + NodedbStatement::DropCollection { + name, + if_exists: true, + } => { + if !collection_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP COLLECTION"))])); + } + None + } + + NodedbStatement::DropIndex { + if_exists: true, .. + } => None, // legacy handler has its own check + + NodedbStatement::DropTrigger { + name, + if_exists: true, + .. 
+ } => { + if !trigger_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP TRIGGER"))])); + } + None + } + + NodedbStatement::DropSchedule { + name, + if_exists: true, + } => { + if !schedule_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP SCHEDULE"))])); + } + None + } + + NodedbStatement::DropSequence { + name, + if_exists: true, + } => { + if !sequence_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP SEQUENCE"))])); + } + None + } + + NodedbStatement::DropAlert { + name, + if_exists: true, + } => { + if !alert_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new("DROP ALERT"))])); + } + None + } + + NodedbStatement::DropRetentionPolicy { + name, + if_exists: true, + } => { + if !retention_policy_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP RETENTION POLICY", + ))])); + } + None + } + + NodedbStatement::DropChangeStream { + name, + if_exists: true, + } => { + if !change_stream_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP CHANGE STREAM", + ))])); + } + None + } + + NodedbStatement::DropMaterializedView { + name, + if_exists: true, + } => { + if !materialized_view_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP MATERIALIZED VIEW", + ))])); + } + None + } + + NodedbStatement::DropContinuousAggregate { + name, + if_exists: true, + } => { + if !continuous_aggregate_exists(state, identity, name) { + return Some(Ok(vec![Response::Execution(Tag::new( + "DROP CONTINUOUS AGGREGATE", + ))])); + } + None + } + + NodedbStatement::DropRlsPolicy { + if_exists: true, .. + } => { + // RLS policy existence check would need collection context; + // fall through to legacy handler which already handles this. + None + } + + NodedbStatement::DropConsumerGroup { + if_exists: true, .. 
+ } => None, // legacy handler + + // All other variants fall through to legacy dispatch. + _ => None, + } +} + +fn collection_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let Some(catalog) = state.credentials.catalog() else { + return false; + }; + let tid = identity.tenant_id.as_u32(); + matches!(catalog.get_collection(tid, name), Ok(Some(_))) +} + +fn trigger_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let Some(catalog) = state.credentials.catalog() else { + return false; + }; + let tid = identity.tenant_id.as_u32(); + matches!(catalog.get_trigger(tid, name), Ok(Some(_))) +} + +fn schedule_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.schedule_registry.get(tid, name).is_some() +} + +fn sequence_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.sequence_registry.exists(tid, name) +} + +fn alert_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.alert_registry.get(tid, name).is_some() +} + +fn retention_policy_exists( + state: &SharedState, + identity: &AuthenticatedIdentity, + name: &str, +) -> bool { + let tid = identity.tenant_id.as_u32(); + state.retention_policy_registry.get(tid, name).is_some() +} + +fn change_stream_exists(state: &SharedState, identity: &AuthenticatedIdentity, name: &str) -> bool { + let tid = identity.tenant_id.as_u32(); + state.stream_registry.get(tid, name).is_some() +} + +fn materialized_view_exists( + state: &SharedState, + identity: &AuthenticatedIdentity, + name: &str, +) -> bool { + let tid = identity.tenant_id.as_u32(); + state.mv_registry.get_def(tid, name).is_some() +} + +fn continuous_aggregate_exists( + state: &SharedState, + identity: &AuthenticatedIdentity, + name: &str, +) -> bool { + let tid = 
identity.tenant_id.as_u32(); + state.mv_registry.get_def(tid, name).is_some() +} diff --git a/nodedb/src/control/server/pgwire/ddl/router/mod.rs b/nodedb/src/control/server/pgwire/ddl/router/mod.rs index 73de64f0..905bb53f 100644 --- a/nodedb/src/control/server/pgwire/ddl/router/mod.rs +++ b/nodedb/src/control/server/pgwire/ddl/router/mod.rs @@ -1,4 +1,5 @@ mod admin; +mod ast; mod auth; mod collaborative; mod dsl; @@ -26,6 +27,18 @@ pub async fn dispatch( identity: &AuthenticatedIdentity, sql: &str, ) -> Option>> { + // AST-typed fast path: parse once, handle IF [NOT] EXISTS at the + // dispatch level, then fall through to legacy handlers for the + // actual execution. This is the incremental migration path — + // once every legacy handler has been ported to accept a typed + // NodedbStatement, the string-prefix routers below can be + // removed entirely. + if let Some(stmt) = nodedb_sql::ddl_ast::parse(sql) + && let Some(r) = ast::try_dispatch(state, identity, &stmt) + { + return Some(r); + } + let upper = sql.to_uppercase(); let parts: Vec<&str> = sql.split_whitespace().collect(); diff --git a/nodedb/src/control/server/pgwire/handler/transaction_cmds.rs b/nodedb/src/control/server/pgwire/handler/transaction_cmds.rs index 63ea8026..0da593bb 100644 --- a/nodedb/src/control/server/pgwire/handler/transaction_cmds.rs +++ b/nodedb/src/control/server/pgwire/handler/transaction_cmds.rs @@ -18,6 +18,7 @@ impl NodeDbPgHandler { let next = self.state.wal.next_lsn(); crate::types::Lsn::new(next.as_u64().saturating_sub(1)) }; + crate::control::server::pgwire::session::ddl_buffer::activate(); self.sessions.begin(addr, snapshot_lsn).map_err(|msg| { PgWireError::UserError(Box::new(ErrorInfo::new( "ERROR".to_owned(), @@ -171,6 +172,35 @@ impl NodeDbPgHandler { } } + // Flush any buffered DDL entries as a single atomic batch. 
+ if let Some(payloads) = crate::control::server::pgwire::session::ddl_buffer::take() + && !payloads.is_empty() + { + use nodedb_cluster::{MetadataEntry, encode_entry}; + let sub_entries: Vec = payloads + .into_iter() + .map(|p| MetadataEntry::CatalogDdl { payload: p }) + .collect(); + let batch = MetadataEntry::Batch { + entries: sub_entries, + }; + if let Some(handle) = self.state.metadata_raft.get() { + let raw = encode_entry(&batch).map_err(|e| { + PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), + "XX000".to_owned(), + format!("DDL batch encode: {e}"), + ))) + })?; + handle.propose(raw).map_err(|e| { + PgWireError::UserError(Box::new(ErrorInfo::new( + "ERROR".to_owned(), + "XX000".to_owned(), + format!("DDL batch propose: {e}"), + ))) + })?; + } + } // Close non-WITH-HOLD cursors on transaction end. self.sessions.close_non_hold_cursors(addr); Ok(vec![Response::Execution(Tag::new("COMMIT"))]) @@ -182,6 +212,7 @@ impl NodeDbPgHandler { identity: &AuthenticatedIdentity, addr: &std::net::SocketAddr, ) -> PgWireResult> { + crate::control::server::pgwire::session::ddl_buffer::discard(); let reservations = self.sessions.rollback(addr).unwrap_or_default(); for handle in &reservations { let key = &handle.sequence_key; diff --git a/nodedb/src/control/server/pgwire/session/ddl_buffer.rs b/nodedb/src/control/server/pgwire/session/ddl_buffer.rs new file mode 100644 index 00000000..59d8ae26 --- /dev/null +++ b/nodedb/src/control/server/pgwire/session/ddl_buffer.rs @@ -0,0 +1,119 @@ +//! Per-session DDL transaction buffer. +//! +//! When a pgwire session is inside a `BEGIN` block and executes DDL +//! statements (CREATE, DROP, ALTER), the `propose_catalog_entry` +//! path checks this buffer. If the buffer is active (non-None), the +//! entry is pushed into it instead of being proposed immediately. +//! +//! On `COMMIT`, the buffer is flushed as a single +//! `MetadataEntry::Batch`, so either all DDL in the transaction +//! 
commits atomically or none does. +//! +//! On `ROLLBACK`, the buffer is cleared without proposing. + +use std::cell::RefCell; + +/// Encoded DDL payloads buffered during a transaction. Each entry +/// is a serialized `CatalogEntry` ready for +/// `MetadataEntry::CatalogDdl { payload }`. +pub type DdlBuffer = Vec>; + +thread_local! { + /// Thread-local flag: when `Some`, `propose_catalog_entry` pushes + /// into this buffer instead of proposing through raft. Set by + /// `activate` before DDL dispatch, cleared by `take`. + /// + /// Thread-local is safe here because pgwire DDL handlers run + /// synchronously via `block_in_place` — the buffer is set and + /// read on the same OS thread within a single handler call. + static ACTIVE_BUFFER: RefCell> = const { RefCell::new(None) }; +} + +/// Activate the DDL buffer for the current thread. Any subsequent +/// call to `try_buffer` will push into this buffer instead of +/// returning `None`. +pub fn activate() { + ACTIVE_BUFFER.with(|b| { + let mut guard = b.borrow_mut(); + if guard.is_none() { + *guard = Some(Vec::new()); + } + }); +} + +/// Try to buffer a DDL payload. Returns `true` if the buffer is +/// active and the payload was pushed. Returns `false` if no buffer +/// is active (caller should propose normally). +pub fn try_buffer(payload: Vec) -> bool { + ACTIVE_BUFFER.with(|b| { + let mut guard = b.borrow_mut(); + if let Some(buf) = guard.as_mut() { + buf.push(payload); + true + } else { + false + } + }) +} + +/// Take the accumulated buffer contents and deactivate. Returns +/// `None` if the buffer was never activated. +pub fn take() -> Option { + ACTIVE_BUFFER.with(|b| b.borrow_mut().take()) +} + +/// Deactivate and discard the buffer without returning its contents. +pub fn discard() { + ACTIVE_BUFFER.with(|b| { + let _ = b.borrow_mut().take(); + }); +} + +/// Returns `true` if a DDL buffer is currently active on this thread. 
+pub fn is_active() -> bool { + ACTIVE_BUFFER.with(|b| b.borrow().is_some()) +} + +/// Number of DDL statements buffered in the current thread's +/// active transaction. Returns 0 if no buffer is active. +pub fn buffer_len() -> usize { + ACTIVE_BUFFER.with(|b| b.borrow().as_ref().map(|v| v.len()).unwrap_or(0)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn inactive_buffer_does_not_capture() { + discard(); // ensure clean state + assert!(!try_buffer(vec![1, 2, 3])); + assert!(!is_active()); + } + + #[test] + fn active_buffer_captures() { + activate(); + assert!(is_active()); + assert!(try_buffer(vec![1])); + assert!(try_buffer(vec![2])); + let buf = take().unwrap(); + assert_eq!(buf.len(), 2); + assert!(!is_active()); + } + + #[test] + fn discard_clears_buffer() { + activate(); + try_buffer(vec![1]); + discard(); + assert!(!is_active()); + assert!(take().is_none()); + } + + #[test] + fn take_on_inactive_returns_none() { + discard(); + assert!(take().is_none()); + } +} diff --git a/nodedb/src/control/server/pgwire/session/mod.rs b/nodedb/src/control/server/pgwire/session/mod.rs index 91014cc8..8fad3e7a 100644 --- a/nodedb/src/control/server/pgwire/session/mod.rs +++ b/nodedb/src/control/server/pgwire/session/mod.rs @@ -1,5 +1,6 @@ mod cursor; pub mod cursor_spill; +pub mod ddl_buffer; mod live; mod params; pub mod read_consistency; From 89071f5c6846c9454c7b8fc0d78650ebc91b6b83 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 13:47:58 +0800 Subject: [PATCH 21/24] test(nodedb): add DDL replication correctness tests Cover the full create/drop lifecycle for collections, sequences, triggers, and schedules across a 3-node test cluster. Each test verifies that DDL executed on the leader becomes visible on all followers within a bounded window, and that IF [NOT] EXISTS branches complete without error. 
--- nodedb/tests/sql_ddl_cluster.rs | 265 ++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 nodedb/tests/sql_ddl_cluster.rs diff --git a/nodedb/tests/sql_ddl_cluster.rs b/nodedb/tests/sql_ddl_cluster.rs new file mode 100644 index 00000000..640b6ae3 --- /dev/null +++ b/nodedb/tests/sql_ddl_cluster.rs @@ -0,0 +1,265 @@ +//! DDL replication correctness matrix. +//! +//! For every DDL variant that flows through the replicated metadata +//! path, this file tests: +//! +//! 1. Execute DDL on the leader → visible on every follower. +//! 2. Execute the inverse DDL → removal visible on every node. +//! 3. `IF NOT EXISTS` / `IF EXISTS` branches handled without error. +//! +//! Uses the 3-node `TestCluster` harness from `common/cluster_harness`. + +mod common; + +use std::time::Duration; + +use common::cluster_harness::{TestCluster, wait_for}; + +// ── Collection ─────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_collection_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE COLLECTION ddl_test_coll") + .await + .expect("create"); + wait_for( + "collection visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP COLLECTION ddl_test_coll") + .await + .expect("drop"); + wait_for( + "collection removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() == 0) + }, + ) + .await; + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_collection_if_not_exists() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + 
.exec_ddl_on_any_leader("CREATE COLLECTION ine_coll") + .await + .expect("first create"); + wait_for( + "collection visible", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; + // Second CREATE IF NOT EXISTS must succeed without error. + cluster + .exec_ddl_on_any_leader("CREATE COLLECTION IF NOT EXISTS ine_coll") + .await + .expect("if not exists must not error"); + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_drop_collection_if_exists_missing() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + // DROP IF EXISTS on a nonexistent collection must succeed. + cluster + .exec_ddl_on_any_leader("DROP COLLECTION IF EXISTS no_such_coll") + .await + .expect("if exists on missing must not error"); + cluster.shutdown().await; +} + +// ── Sequence ───────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_sequence_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE SEQUENCE ddl_test_seq START 1") + .await + .expect("create seq"); + wait_for( + "sequence visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.has_sequence(1, "ddl_test_seq")) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP SEQUENCE ddl_test_seq") + .await + .expect("drop seq"); + wait_for( + "sequence removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || cluster.nodes.iter().all(|n| n.sequence_count(1) == 0), + ) + .await; + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_sequence_if_not_exists() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + 
.exec_ddl_on_any_leader("CREATE SEQUENCE ine_seq START 1") + .await + .expect("first create"); + wait_for( + "seq visible", + Duration::from_secs(10), + Duration::from_millis(50), + || cluster.nodes.iter().all(|n| n.has_sequence(1, "ine_seq")), + ) + .await; + cluster + .exec_ddl_on_any_leader("CREATE SEQUENCE IF NOT EXISTS ine_seq START 1") + .await + .expect("if not exists must not error"); + cluster.shutdown().await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_drop_sequence_if_exists_missing() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("DROP SEQUENCE IF EXISTS no_such_seq") + .await + .expect("if exists on missing must not error"); + cluster.shutdown().await; +} + +// ── Trigger ────────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_trigger_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader("CREATE COLLECTION trig_coll") + .await + .expect("create coll for trigger"); + wait_for( + "coll visible", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader( + "CREATE TRIGGER ddl_test_trig AFTER INSERT ON trig_coll FOR EACH ROW BEGIN RETURN 1; END", + ) + .await + .expect("create trigger"); + wait_for( + "trigger visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.has_trigger(1, "ddl_test_trig")) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP TRIGGER ddl_test_trig ON trig_coll") + .await + .expect("drop trigger"); + wait_for( + "trigger removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| !n.has_trigger(1, 
"ddl_test_trig")) + }, + ) + .await; + cluster.shutdown().await; +} + +// ── Schedule ───────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 6)] +async fn ddl_create_drop_schedule_replicates() { + let cluster = TestCluster::spawn_three().await.expect("cluster"); + cluster + .exec_ddl_on_any_leader( + "CREATE SCHEDULE ddl_test_sched CRON '0 0 * * *' AS BEGIN RETURN 1; END", + ) + .await + .expect("create schedule"); + wait_for( + "schedule visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.has_schedule(1, "ddl_test_sched")) + }, + ) + .await; + + cluster + .exec_ddl_on_any_leader("DROP SCHEDULE ddl_test_sched") + .await + .expect("drop schedule"); + wait_for( + "schedule removed on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| !n.has_schedule(1, "ddl_test_sched")) + }, + ) + .await; + cluster.shutdown().await; +} From 45c33d1dbf3c801d45a10acb76f536c72f2919a4 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 16:47:29 +0800 Subject: [PATCH 22/24] fix(raft): deduplicate votes by peer ID and make election timeout configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a HashSet keyed by peer ID in handle_request_vote_response so that duplicate grants from the same peer cannot inflate the vote count and cause premature leader promotion. Add election_timeout_min/max fields to ClusterConfig and propagate them through bootstrap, join, and restart paths. Expose matching fields in ClusterTransportTuning (default 2–5 s) and wire them into MultiRaft so every startup path honours the configured window rather than using a hardcoded constant. Adjust network tuning defaults from 60/120 s to 2/5 s to match real-world cluster behaviour. 
--- nodedb-cluster/src/bootstrap/bootstrap_fn.rs | 6 +++++- nodedb-cluster/src/bootstrap/config.rs | 4 ++++ nodedb-cluster/src/bootstrap/join.rs | 7 ++++++- nodedb-cluster/src/bootstrap/probe.rs | 2 ++ nodedb-cluster/src/bootstrap/restart.rs | 6 +++++- nodedb-cluster/src/multi_raft/core.rs | 4 ++-- nodedb-raft/src/node/core.rs | 5 +++-- nodedb-raft/src/node/rpc.rs | 4 ++-- nodedb-types/src/config/tuning/network.rs | 4 ++-- 9 files changed, 31 insertions(+), 11 deletions(-) diff --git a/nodedb-cluster/src/bootstrap/bootstrap_fn.rs b/nodedb-cluster/src/bootstrap/bootstrap_fn.rs index a09b2e2c..6bb20f2b 100644 --- a/nodedb-cluster/src/bootstrap/bootstrap_fn.rs +++ b/nodedb-cluster/src/bootstrap/bootstrap_fn.rs @@ -35,7 +35,8 @@ pub(super) fn bootstrap(config: &ClusterConfig, catalog: &ClusterCatalog) -> Res ); // Create MultiRaft with all groups (single-node, no peers). - let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()); + let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()) + .with_election_timeout(config.election_timeout_min, config.election_timeout_max); for group_id in routing.group_ids() { multi_raft.add_group(group_id, vec![])?; } @@ -81,6 +82,7 @@ fn generate_cluster_id() -> u64 { mod tests { use super::*; use crate::catalog::ClusterCatalog; + use std::time::Duration; fn temp_catalog() -> (tempfile::TempDir, ClusterCatalog) { let dir = tempfile::tempdir().unwrap(); @@ -102,6 +104,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; let state = bootstrap(&config, &catalog).unwrap(); diff --git a/nodedb-cluster/src/bootstrap/config.rs b/nodedb-cluster/src/bootstrap/config.rs index 933198c5..3b42b1ee 100644 --- a/nodedb-cluster/src/bootstrap/config.rs +++ b/nodedb-cluster/src/bootstrap/config.rs @@ -91,6 +91,10 @@ pub struct 
ClusterConfig { /// [`crate::spawn_swim`] after the cluster is up and feed the /// seed list from `seed_nodes`. pub swim_udp_addr: Option, + /// Raft election timeout range. Controls how long a follower waits + /// before starting an election after losing contact with the leader. + pub election_timeout_min: Duration, + pub election_timeout_max: Duration, } /// Result of cluster startup — everything needed to run the Raft loop. diff --git a/nodedb-cluster/src/bootstrap/join.rs b/nodedb-cluster/src/bootstrap/join.rs index afe6ad7a..67e79854 100644 --- a/nodedb-cluster/src/bootstrap/join.rs +++ b/nodedb-cluster/src/bootstrap/join.rs @@ -288,7 +288,8 @@ fn apply_join_response( // learners). A learner-started group boots in the `Learner` // role and will not run an election until a subsequent // `PromoteLearner` conf change is applied. - let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()); + let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()) + .with_election_timeout(config.election_timeout_min, config.election_timeout_max); for g in &resp.groups { let is_voter = g.members.contains(&config.node_id); let is_learner = g.learners.contains(&config.node_id); @@ -450,6 +451,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; let state1 = bootstrap(&config1, &catalog1).unwrap(); @@ -499,6 +502,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; let lifecycle = ClusterLifecycleTracker::new(); diff --git a/nodedb-cluster/src/bootstrap/probe.rs b/nodedb-cluster/src/bootstrap/probe.rs index 1688c87d..4df5838b 100644 --- a/nodedb-cluster/src/bootstrap/probe.rs +++ b/nodedb-cluster/src/bootstrap/probe.rs @@ 
-223,6 +223,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), } } diff --git a/nodedb-cluster/src/bootstrap/restart.rs b/nodedb-cluster/src/bootstrap/restart.rs index 3306142a..1c18186f 100644 --- a/nodedb-cluster/src/bootstrap/restart.rs +++ b/nodedb-cluster/src/bootstrap/restart.rs @@ -35,7 +35,8 @@ pub(super) fn restart( // as a learner on restart; dropping the group entirely would // leave the node permanently without any copy of it and // silently broken. - let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()); + let mut multi_raft = MultiRaft::new(config.node_id, routing.clone(), config.data_dir.clone()) + .with_election_timeout(config.election_timeout_min, config.election_timeout_max); for (group_id, info) in routing.group_members() { let is_voter = info.members.contains(&config.node_id); let is_learner = info.learners.contains(&config.node_id); @@ -91,6 +92,7 @@ mod tests { use super::super::bootstrap_fn::bootstrap; use super::*; use crate::catalog::ClusterCatalog; + use std::time::Duration; fn temp_catalog() -> (tempfile::TempDir, ClusterCatalog) { let dir = tempfile::tempdir().unwrap(); @@ -112,6 +114,8 @@ mod tests { force_bootstrap: false, join_retry: Default::default(), swim_udp_addr: None, + election_timeout_min: Duration::from_millis(150), + election_timeout_max: Duration::from_millis(300), }; // Bootstrap first. 
diff --git a/nodedb-cluster/src/multi_raft/core.rs b/nodedb-cluster/src/multi_raft/core.rs index 9aa60bc4..72029096 100644 --- a/nodedb-cluster/src/multi_raft/core.rs +++ b/nodedb-cluster/src/multi_raft/core.rs @@ -77,8 +77,8 @@ impl MultiRaft { node_id, groups: HashMap::new(), routing, - election_timeout_min: Duration::from_millis(150), - election_timeout_max: Duration::from_millis(300), + election_timeout_min: Duration::from_secs(2), + election_timeout_max: Duration::from_secs(5), heartbeat_interval: Duration::from_millis(50), data_dir, } diff --git a/nodedb-raft/src/node/core.rs b/nodedb-raft/src/node/core.rs index 33ef4d11..0df2408a 100644 --- a/nodedb-raft/src/node/core.rs +++ b/nodedb-raft/src/node/core.rs @@ -5,6 +5,7 @@ //! replication) live in [`super::internal`]. RPC handlers live in //! [`super::rpc`]. +use std::collections::HashSet; use std::time::Instant; use crate::error::{RaftError, Result}; @@ -61,7 +62,7 @@ pub struct RaftNode { /// When the next heartbeat should be sent (leader only). pub(super) heartbeat_deadline: Instant, /// Votes received in current election. - pub(super) votes_received: Vec, + pub(super) votes_received: HashSet, /// Pending ready output. pub(super) ready: Ready, /// Known leader ID (0 = unknown). @@ -89,7 +90,7 @@ impl RaftNode { leader_state: None, election_deadline: now + config.election_timeout_max, heartbeat_deadline: now, - votes_received: Vec::new(), + votes_received: HashSet::new(), ready: Ready::default(), leader_id: 0, config, diff --git a/nodedb-raft/src/node/rpc.rs b/nodedb-raft/src/node/rpc.rs index 31a2af41..d2c5e4d4 100644 --- a/nodedb-raft/src/node/rpc.rs +++ b/nodedb-raft/src/node/rpc.rs @@ -166,7 +166,7 @@ impl RaftNode { } /// Handle RequestVote response (candidate only). 
- pub fn handle_request_vote_response(&mut self, _peer: u64, resp: &RequestVoteResponse) { + pub fn handle_request_vote_response(&mut self, peer: u64, resp: &RequestVoteResponse) { if resp.term > self.hard_state.current_term { self.become_follower(resp.term); return; @@ -177,7 +177,7 @@ impl RaftNode { } if resp.vote_granted { - self.votes_received.push(resp.term); + self.votes_received.insert(peer); let vote_count = self.votes_received.len() + 1; // +1 for self-vote if vote_count >= self.config.quorum() { diff --git a/nodedb-types/src/config/tuning/network.rs b/nodedb-types/src/config/tuning/network.rs index 888fa982..4fd9d956 100644 --- a/nodedb-types/src/config/tuning/network.rs +++ b/nodedb-types/src/config/tuning/network.rs @@ -223,10 +223,10 @@ fn default_raft_tick_interval_ms() -> u64 { 10 } fn default_election_timeout_min_secs() -> u64 { - 60 + 2 } fn default_election_timeout_max_secs() -> u64 { - 120 + 5 } fn default_rpc_timeout_secs() -> u64 { 5 From dbe01c9c6f12aaf54e11eeba458beb51fb568242 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 16:47:49 +0800 Subject: [PATCH 23/24] feat(cluster): start health monitor and harden topology convergence Start the HealthMonitor task from start_raft so that periodic pings run for the lifetime of the node. Without it, topology updates relied solely on the fire-and-forget broadcast during the join flow; when that broadcast was lost (peer QUIC server not yet accepting), the peer never converged to the full topology. On each successful pong, push the current topology to peers whose reported version is behind ours. This closes the convergence gap for peers that missed the initial broadcast. Bound QUIC handshake attempts by rpc_timeout so a hung connection does not block for the full 30 s idle timeout when a peer is slow to accept. Expose the cluster catalog via ClusterHandle so the HealthMonitor can persist topology changes detected during failure/recovery. 
Increase the pgwire retry budget from 3 attempts (350 ms) to 5 attempts (750 ms) to tolerate longer Raft leader drains under load. --- nodedb-cluster/src/health.rs | 56 ++++++++++++++++++- nodedb-cluster/src/transport/client.rs | 14 ++++- nodedb/src/control/cluster/handle.rs | 4 ++ nodedb/src/control/cluster/init.rs | 15 ++++- nodedb/src/control/cluster/start_raft.rs | 23 +++++++- .../control/server/pgwire/handler/retry.rs | 14 +++-- 6 files changed, 112 insertions(+), 14 deletions(-) diff --git a/nodedb-cluster/src/health.rs b/nodedb-cluster/src/health.rs index e27e8137..654b4f56 100644 --- a/nodedb-cluster/src/health.rs +++ b/nodedb-cluster/src/health.rs @@ -151,14 +151,38 @@ impl HealthMonitor { } } - /// Handle a successful pong — reset failure count, mark node Active if needed. - fn handle_pong(&self, peer_id: u64, _pong: &PongResponse) -> bool { + /// Handle a successful pong — reset failure count, mark node Active + /// if needed, and push topology if the peer is behind. + fn handle_pong(&self, peer_id: u64, pong: &PongResponse) -> bool { // Reset failure count. { let mut failures = self.ping_failures.lock().unwrap_or_else(|p| p.into_inner()); failures.remove(&peer_id); } + // Push topology to peers with a stale version. This closes + // the convergence gap when the fire-and-forget broadcast + // during the join flow is lost (e.g. peer QUIC server not + // yet accepting at that instant). + let our_version = { + let topo = self.topology.read().unwrap_or_else(|p| p.into_inner()); + topo.version() + }; + if pong.topology_version < our_version { + debug!( + peer_id, + peer_version = pong.topology_version, + our_version, + "peer has stale topology, pushing update" + ); + let transport = self.transport.clone(); + let topology = self.topology.clone(); + let self_id = self.node_id; + tokio::spawn(async move { + broadcast_topology_to_peer(self_id, peer_id, &topology, &transport).await; + }); + } + // If node was not Active, mark it Active. 
let mut topo = self.topology.write().unwrap_or_else(|p| p.into_inner()); if let Some(node) = topo.get_node(peer_id) @@ -264,6 +288,34 @@ pub fn broadcast_topology( } } +/// Send a topology update to a single peer that has a stale version. +async fn broadcast_topology_to_peer( + _self_node_id: u64, + peer_id: u64, + topology: &RwLock, + transport: &NexarTransport, +) { + let update = { + let topo = topology.read().unwrap_or_else(|p| p.into_inner()); + RaftRpc::TopologyUpdate(TopologyUpdate { + version: topo.version(), + nodes: topo + .all_nodes() + .map(|n| JoinNodeInfo { + node_id: n.node_id, + addr: n.addr.clone(), + state: n.state.as_u8(), + raft_groups: n.raft_groups.clone(), + wire_version: n.wire_version, + }) + .collect(), + }) + }; + if let Err(e) = transport.send_rpc(peer_id, update).await { + debug!(peer_id, error = %e, "targeted topology push failed"); + } +} + /// Handle an incoming Ping RPC — return a Pong with our topology version. pub fn handle_ping(node_id: u64, topology_version: u64, _req: &PingRequest) -> RaftRpc { RaftRpc::Pong(PongResponse { diff --git a/nodedb-cluster/src/transport/client.rs b/nodedb-cluster/src/transport/client.rs index 71a7d5fd..3fbf3e15 100644 --- a/nodedb-cluster/src/transport/client.rs +++ b/nodedb-cluster/src/transport/client.rs @@ -245,15 +245,23 @@ impl NexarTransport { .ok_or(ClusterError::NodeUnreachable { node_id: target })? }; - // Connect. - let conn = self + // Connect — bounded by rpc_timeout so a hung QUIC handshake + // (peer not yet serving) doesn't block for the full 30s idle timeout. + let connecting = self .listener .endpoint() .connect_with(self.client_config.clone(), addr, SNI_HOSTNAME) .map_err(|e| ClusterError::Transport { detail: format!("connect to node {target} at {addr}: {e}"), - })? 
+ })?; + let conn = tokio::time::timeout(self.rpc_timeout, connecting) .await + .map_err(|_| ClusterError::Transport { + detail: format!( + "handshake timeout ({}ms) with node {target} at {addr}", + self.rpc_timeout.as_millis() + ), + })? .map_err(|e| ClusterError::Transport { detail: format!("handshake with node {target} at {addr}: {e}"), })?; diff --git a/nodedb/src/control/cluster/handle.rs b/nodedb/src/control/cluster/handle.rs index 3cede98e..bc8845e1 100644 --- a/nodedb/src/control/cluster/handle.rs +++ b/nodedb/src/control/cluster/handle.rs @@ -33,4 +33,8 @@ pub struct ClusterHandle { /// stays `Clone` while still guaranteeing single-transfer /// semantics at runtime. pub multi_raft: Mutex>, + /// Cluster catalog (redb-backed topology + routing persistence). + /// Shared with the `HealthMonitor` for persisting topology changes + /// on failure detection and recovery. + pub catalog: Arc, } diff --git a/nodedb/src/control/cluster/init.rs b/nodedb/src/control/cluster/init.rs index ef06fe4b..3315b7d8 100644 --- a/nodedb/src/control/cluster/init.rs +++ b/nodedb/src/control/cluster/init.rs @@ -38,7 +38,7 @@ pub async fn init_cluster( "cluster QUIC transport bound" ); - init_cluster_with_transport(config, transport, data_dir).await + init_cluster_with_transport(config, transport, data_dir, transport_tuning).await } /// Initialize the cluster using a pre-bound QUIC transport. @@ -56,13 +56,15 @@ pub async fn init_cluster_with_transport( config: &ClusterSettings, transport: Arc, data_dir: &std::path::Path, + transport_tuning: &ClusterTransportTuning, ) -> crate::Result { // 2. Open cluster catalog. let catalog_path = data_dir.join("cluster.redb"); - let catalog = + let catalog = Arc::new( nodedb_cluster::ClusterCatalog::open(&catalog_path).map_err(|e| crate::Error::Config { detail: format!("cluster catalog: {e}"), - })?; + })?, + ); // 3. Bootstrap, join, or restart. 
let cluster_config = nodedb_cluster::ClusterConfig { @@ -75,6 +77,12 @@ pub async fn init_cluster_with_transport( force_bootstrap: config.force_bootstrap, join_retry: join_retry_policy_from_env(), swim_udp_addr: None, + election_timeout_min: std::time::Duration::from_secs( + transport_tuning.election_timeout_min_secs, + ), + election_timeout_max: std::time::Duration::from_secs( + transport_tuning.election_timeout_max_secs, + ), }; let lifecycle = nodedb_cluster::ClusterLifecycleTracker::new(); @@ -105,6 +113,7 @@ pub async fn init_cluster_with_transport( applied_index_watcher, node_id: config.node_id, multi_raft: Mutex::new(Some(state.multi_raft)), + catalog, }) } diff --git a/nodedb/src/control/cluster/start_raft.rs b/nodedb/src/control/cluster/start_raft.rs index 1c14c57c..bc968bc4 100644 --- a/nodedb/src/control/cluster/start_raft.rs +++ b/nodedb/src/control/cluster/start_raft.rs @@ -112,7 +112,7 @@ pub fn start_raft( // Start the RPC server (accepts inbound QUIC connections). let transport_serve = handle.transport.clone(); let rl_handler = raft_loop.clone(); - let sr_serve = shutdown_rx; + let sr_serve = shutdown_rx.clone(); tokio::spawn(async move { if let Err(e) = transport_serve.serve(rl_handler, sr_serve).await { tracing::error!(error = %e, "raft RPC server failed"); @@ -138,6 +138,27 @@ pub fn start_raft( ); } + // Start the health monitor (periodic pings, failure detection, + // topology re-broadcast). Without this, topology updates are + // only propagated via the fire-and-forget broadcast during the + // join flow — if that single broadcast is lost (peer QUIC server + // not yet accepting), the peer never converges. 
+ let health_config = nodedb_cluster::HealthConfig { + ping_interval: Duration::from_secs(transport_tuning.health_ping_interval_secs), + failure_threshold: transport_tuning.health_failure_threshold, + }; + let health_monitor = Arc::new(nodedb_cluster::HealthMonitor::new( + handle.node_id, + handle.transport.clone(), + handle.topology.clone(), + handle.catalog.clone(), + health_config, + )); + let sr_health = shutdown_rx; + tokio::spawn(async move { + health_monitor.run(sr_health).await; + }); + info!(node_id = handle.node_id, "raft loop and RPC server started"); Ok(ready_rx) diff --git a/nodedb/src/control/server/pgwire/handler/retry.rs b/nodedb/src/control/server/pgwire/handler/retry.rs index 3e793ad9..bf536bf2 100644 --- a/nodedb/src/control/server/pgwire/handler/retry.rs +++ b/nodedb/src/control/server/pgwire/handler/retry.rs @@ -18,8 +18,8 @@ //! //! ## Retry budget //! -//! Three attempts total with 50ms, 100ms, 200ms backoff between -//! them — roughly 350ms of tolerance for a drain to complete. +//! Five attempts total with 50/100/200/400 ms backoff between +//! them — roughly 750ms of tolerance for a drain to complete. //! The `DEFAULT_DRAIN_TIMEOUT` from `metadata_proposer` is 35s, //! so in practice either drain completes within our retry budget //! (the proposer is actively draining and is probably close to @@ -31,13 +31,17 @@ use std::time::Duration; use crate::error::Error; /// Maximum number of attempts (including the initial call). -const MAX_ATTEMPTS: usize = 3; +const MAX_ATTEMPTS: usize = 5; /// Backoff durations BETWEEN attempts. `BACKOFFS[i]` is the sleep /// duration before attempt `i + 1`. Length must be /// `MAX_ATTEMPTS - 1`. -const BACKOFFS: [Duration; MAX_ATTEMPTS - 1] = - [Duration::from_millis(50), Duration::from_millis(100)]; +const BACKOFFS: [Duration; MAX_ATTEMPTS - 1] = [ + Duration::from_millis(50), + Duration::from_millis(100), + Duration::from_millis(200), + Duration::from_millis(400), +]; /// Run `op` up to `MAX_ATTEMPTS` times. 
Retries only on /// `Error::RetryableSchemaChanged`. Any other error (including From 5f6abeb37cd8b18845d9ad6eeb22d80ac0085e37 Mon Sep 17 00:00:00 2001 From: Farhan Syah Date: Thu, 16 Apr 2026 16:48:05 +0800 Subject: [PATCH 24/24] test(cluster): replace fixed sleeps with readiness polling and fix SWIM probe test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace fixed tokio::time::sleep calls in tests with bounded polling loops that break as soon as the expected condition is met. This makes tests deterministic under heavy parallel load (500+ unit tests sharing the same CPU pool) rather than depending on timing assumptions. In the SWIM indirect_ack_saves_target test, remove start_paused — paused time auto-advances timeouts before channel-woken tasks are polled, making the indirect path race its own timeout. With real time the 40 ms probe window is ample for in-memory delivery. Add a recv loop on the local endpoint to resolve inflight probes when Acks arrive, and handle both Ping and PingReq in the helper so the test is correct regardless of which node the scheduler picks as the direct target. In the cluster harness, replace the 200 ms sleep before peers join with a topology-readiness poll, and serialise node 2 joining before node 3 to avoid overwhelming the bootstrap leader's join handler under load. Extend the descriptor lease planner wait budget from 3 s to 10 s to match the extended election timeout range used in tests. 
--- nodedb-cluster/src/raft_loop/loop_core.rs | 53 +++++------ .../src/swim/detector/probe_round.rs | 90 +++++++++++++------ nodedb-cluster/tests/common/mod.rs | 2 + nodedb/tests/cluster_execute_request.rs | 15 +++- .../tests/common/cluster_harness/cluster.rs | 43 +++++++-- nodedb/tests/common/cluster_harness/node.rs | 1 + .../descriptor_lease_planner_integration.rs | 2 +- 7 files changed, 147 insertions(+), 59 deletions(-) diff --git a/nodedb-cluster/src/raft_loop/loop_core.rs b/nodedb-cluster/src/raft_loop/loop_core.rs index e73787dc..ed1ccd98 100644 --- a/nodedb-cluster/src/raft_loop/loop_core.rs +++ b/nodedb-cluster/src/raft_loop/loop_core.rs @@ -580,35 +580,38 @@ mod tests { let sr1h = shutdown_tx.subscribe(); tokio::spawn(async move { t1.serve(rl1_h, sr1h).await }); - tokio::time::sleep(Duration::from_millis(200)).await; - - assert!( - rl1.applier.count() >= 1, - "node 1 should have committed at least the no-op, got {}", - rl1.applier.count() - ); + // Poll until node 1 commits at least the no-op (election done). + let deadline = tokio::time::Instant::now() + Duration::from_secs(5); + loop { + if rl1.applier.count() >= 1 { + break; + } + assert!( + tokio::time::Instant::now() < deadline, + "node 1 should have committed at least the no-op, got {}", + rl1.applier.count() + ); + tokio::time::sleep(Duration::from_millis(20)).await; + } let (_gid, idx) = rl1.propose(0, b"distributed-cmd".to_vec()).unwrap(); assert!(idx >= 2); - tokio::time::sleep(Duration::from_millis(200)).await; - - assert!( - rl1.applier.count() >= 2, - "node 1: expected >= 2 applied, got {}", - rl1.applier.count() - ); - - assert!( - rl2.applier.count() >= 1, - "node 2: expected >= 1 applied, got {}", - rl2.applier.count() - ); - assert!( - rl3.applier.count() >= 1, - "node 3: expected >= 1 applied, got {}", - rl3.applier.count() - ); + // Poll until all nodes replicate the proposed command. 
+ let deadline = tokio::time::Instant::now() + Duration::from_secs(5); + loop { + if rl1.applier.count() >= 2 && rl2.applier.count() >= 1 && rl3.applier.count() >= 1 { + break; + } + assert!( + tokio::time::Instant::now() < deadline, + "replication timed out: n1={}, n2={}, n3={}", + rl1.applier.count(), + rl2.applier.count(), + rl3.applier.count() + ); + tokio::time::sleep(Duration::from_millis(20)).await; + } shutdown_tx.send(true).unwrap(); } diff --git a/nodedb-cluster/src/swim/detector/probe_round.rs b/nodedb-cluster/src/swim/detector/probe_round.rs index 882a6a78..02fd0b99 100644 --- a/nodedb-cluster/src/swim/detector/probe_round.rs +++ b/nodedb-cluster/src/swim/detector/probe_round.rs @@ -413,13 +413,16 @@ mod tests { ); } - #[tokio::test(start_paused = true)] + #[tokio::test] async fn indirect_ack_saves_target() { + // No `start_paused` — paused time auto-advances timeouts + // before polling channel-woken tasks, making the indirect + // path race the timeout. With real time, the 40ms probe + // timeout is ample for the in-memory fabric (sub-µs delivery). let fab = TransportFabric::new(); - let local = Arc::new(fab.bind(addr(7000)).await) as Arc; - // Target bound but silent on the direct channel. + let local = Arc::new(fab.bind(addr(7000)).await); let _silent = fab.bind(addr(7001)).await; - let helper = fab.bind(addr(7002)).await; + let helper = Arc::new(fab.bind(addr(7002)).await); let list = membership_with_peers( "local", 7000, @@ -432,38 +435,74 @@ mod tests { let mut sched = ProbeScheduler::with_seed(1); let inflight = Arc::new(InflightProbes::new()); - // Helper task: forwards any PingReq it sees into an Ack via the - // inflight registry. Paused-runtime auto-advance drives the - // direct-ping timeout on the main task. - let inflight_helper = Arc::clone(&inflight); + // Helper task: respond to Ping (direct probe) with Ack and to + // PingReq (indirect probe) with a forwarded Ack — mirrors the + // production runner recv-loop + handle_ping_req path. 
The + // scheduler may pick n2 as direct target or as indirect helper + // depending on the shuffle seed, so both must be handled. + let helper_t: Arc = helper.clone(); let responder = tokio::spawn(async move { loop { - let (_from, msg) = match helper.recv().await { + let (from, msg) = match helper_t.recv().await { Ok(v) => v, Err(_) => return, }; - if let SwimMessage::PingReq(req) = msg { - inflight_helper - .resolve( - req.probe_id, - SwimMessage::Ack(Ack { - probe_id: req.probe_id, - from: req.target.clone(), - incarnation: Incarnation::new(9), - piggyback: vec![], - }), - ) - .await; - return; + match msg { + SwimMessage::Ping(ping) => { + let _ = helper_t + .send( + from, + SwimMessage::Ack(Ack { + probe_id: ping.probe_id, + from: NodeId::new("n2"), + incarnation: Incarnation::new(9), + piggyback: vec![], + }), + ) + .await; + return; + } + SwimMessage::PingReq(req) => { + let _ = helper_t + .send( + from, + SwimMessage::Ack(Ack { + probe_id: req.probe_id, + from: req.target.clone(), + incarnation: Incarnation::new(9), + piggyback: vec![], + }), + ) + .await; + return; + } + _ => {} } } }); + // Recv-loop on the local endpoint: resolves inflight probes + // when Acks arrive — mirrors the production runner recv-loop. 
+ let recv_t: Arc = local.clone(); + let recv_inflight = Arc::clone(&inflight); + let recv_loop = tokio::spawn(async move { + loop { + let (_from, msg) = match recv_t.recv().await { + Ok(v) => v, + Err(_) => return, + }; + if let SwimMessage::Ack(ref ack) = msg { + recv_inflight.resolve(ack.probe_id, msg).await; + } + } + }); + + let local_dyn: Arc = local.clone(); let dissemination = Arc::new(DisseminationQueue::new()); let outcome = ProbeRound { scheduler: &mut sched, membership: &list, - transport: &local, + transport: &local_dyn, inflight: &inflight, dissemination: &dissemination, probe_timeout: cfg().probe_timeout, @@ -476,9 +515,8 @@ mod tests { .execute() .await .expect("run"); - let _ = responder.await; - // Either direct (unlikely — n1 is silent) or indirect ack via n2. - // Whichever path fires, the outcome must be Acked. + responder.abort(); + recv_loop.abort(); assert!(matches!(outcome, ProbeOutcome::Acked { .. })); } diff --git a/nodedb-cluster/tests/common/mod.rs b/nodedb-cluster/tests/common/mod.rs index ea25a084..341682e6 100644 --- a/nodedb-cluster/tests/common/mod.rs +++ b/nodedb-cluster/tests/common/mod.rs @@ -187,6 +187,8 @@ impl TestNode { max_backoff_secs: 2, }, swim_udp_addr: None, + election_timeout_min: std::time::Duration::from_millis(150), + election_timeout_max: std::time::Duration::from_millis(300), }; let lifecycle = ClusterLifecycleTracker::new(); diff --git a/nodedb/tests/cluster_execute_request.rs b/nodedb/tests/cluster_execute_request.rs index bc02383c..453d2b15 100644 --- a/nodedb/tests/cluster_execute_request.rs +++ b/nodedb/tests/cluster_execute_request.rs @@ -162,8 +162,19 @@ async fn execute_request_cross_node_dispatch() { .await .expect("create collection"); - // Give the metadata applier on all nodes a moment to replicate. - tokio::time::sleep(Duration::from_millis(400)).await; + // Wait for the collection to be visible on every node. 
+ common::cluster_harness::wait_for( + "cross_node_kv visible on all nodes", + Duration::from_secs(10), + Duration::from_millis(50), + || { + cluster + .nodes + .iter() + .all(|n| n.cached_collection_count() >= 1) + }, + ) + .await; // Node 2 sends the request; node 1 (bootstrap leader) receives it. let sender_transport = cluster.nodes[1] diff --git a/nodedb/tests/common/cluster_harness/cluster.rs b/nodedb/tests/common/cluster_harness/cluster.rs index 5c8f7b25..ae769141 100644 --- a/nodedb/tests/common/cluster_harness/cluster.rs +++ b/nodedb/tests/common/cluster_harness/cluster.rs @@ -17,7 +17,18 @@ impl TestCluster { /// via node 1's pre-bound address. Waits until every node sees /// topology_size == 3 (10s deadline). pub async fn spawn_three() -> Result> { - Self::spawn_three_with_tuning(ClusterTransportTuning::default()).await + Self::spawn_three_with_tuning(ClusterTransportTuning { + // Fast health pings so the HealthMonitor re-broadcasts + // topology within ~1s if the initial join broadcast was missed. + health_ping_interval_secs: 1, + // Fast election timeouts so the metadata Raft group elects a + // leader well within the 10s convergence deadline, even under + // heavy parallel test load. + election_timeout_min_secs: 1, + election_timeout_max_secs: 2, + ..ClusterTransportTuning::default() + }) + .await } /// Spawn a 3-node cluster with a custom `ClusterTransportTuning`. @@ -29,12 +40,34 @@ impl TestCluster { ) -> Result> { let node1 = TestClusterNode::spawn_with_tuning(1, vec![], tuning.clone()).await?; - // Give node 1's transport + raft loop a moment to start - // accepting before peers dial in. - tokio::time::sleep(Duration::from_millis(200)).await; + // Wait until node 1 has bootstrapped (topology shows itself) + // before peers try to join. The old fixed 200ms sleep was too + // short under heavy host load (e.g. 
500+ parallel unit tests + // sharing the same CPU pool), causing peers to dial before + // node 1's transport was ready — failing topology convergence. + let deadline = std::time::Instant::now() + Duration::from_secs(10); + while node1.topology_size() < 1 { + if std::time::Instant::now() >= deadline { + return Err("node 1 failed to bootstrap within 10s".into()); + } + tokio::time::sleep(Duration::from_millis(20)).await; + } let seeds = vec![node1.listen_addr]; let node2 = TestClusterNode::spawn_with_tuning(2, seeds.clone(), tuning.clone()).await?; + + // Wait for node 2's join to be reflected before spawning node 3. + // Under load, spawning both peers simultaneously can overwhelm the + // bootstrap leader's join handler, causing neither join to complete + // within the topology convergence deadline. + let deadline = std::time::Instant::now() + Duration::from_secs(10); + while node1.topology_size() < 2 { + if std::time::Instant::now() >= deadline { + return Err("node 2 failed to join within 10s".into()); + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + let node3 = TestClusterNode::spawn_with_tuning(3, seeds, tuning).await?; let cluster = Self { @@ -44,7 +77,7 @@ impl TestCluster { wait_for( "all 3 nodes report topology_size == 3", Duration::from_secs(10), - Duration::from_millis(100), + Duration::from_millis(50), || cluster.nodes.iter().all(|n| n.topology_size() == 3), ) .await; diff --git a/nodedb/tests/common/cluster_harness/node.rs b/nodedb/tests/common/cluster_harness/node.rs index db4c84af..e08461fd 100644 --- a/nodedb/tests/common/cluster_harness/node.rs +++ b/nodedb/tests/common/cluster_harness/node.rs @@ -129,6 +129,7 @@ impl TestClusterNode { &cluster_settings, transport.clone(), &data_dir_path, + &tuning, ) .await?; diff --git a/nodedb/tests/descriptor_lease_planner_integration.rs b/nodedb/tests/descriptor_lease_planner_integration.rs index a87f15a8..550434c7 100644 --- a/nodedb/tests/descriptor_lease_planner_integration.rs +++ 
b/nodedb/tests/descriptor_lease_planner_integration.rs @@ -16,7 +16,7 @@ use common::cluster_harness::{TestCluster, wait_for}; use nodedb_cluster::{DescriptorId, DescriptorKind}; const TENANT: u32 = 1; -const WAIT_BUDGET: Duration = Duration::from_secs(3); +const WAIT_BUDGET: Duration = Duration::from_secs(10); const POLL: Duration = Duration::from_millis(20); fn coll_id(name: &str) -> DescriptorId {