From cf95ed12fe1112c3fd85d026109d60f2364b1e29 Mon Sep 17 00:00:00 2001 From: Evrard-Nil Daillet Date: Mon, 15 Jun 2026 12:47:04 +0200 Subject: [PATCH 1/2] fix(db): tolerate not-yet-ready members in Patroni cluster discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parse each /cluster member independently and drop any that don't fully deserialize, instead of failing the entire response over one member. Root cause: Patroni lists a node as a member as soon as it registers in the DCS, which happens before it publishes its conn_url. A replica mid-creation (or an uninitialized node) therefore appears without host/port/api_url. Those fields are required on ClusterMember, so a strict Vec<ClusterMember> parse failed the whole /cluster response with 'missing field host' — surfacing as 'Failed to refresh cluster state: Failed to parse cluster info: error decoding response body' and freezing topology discovery for every consumer (cloud-api + chat-api) whenever a new postgres instance was being added. Confirmed by repro test against the real struct (PARSE ERROR: missing field host). Same resilience philosophy as the existing deserialize_lag fix: one bad member must not poison the whole cluster view; the next refresh picks the node up once it finishes registering. Adds regression tests for a single initializing member and for the full live /cluster shape with an initializing replica appended. --- crates/database/src/patroni_discovery.rs | 69 ++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/crates/database/src/patroni_discovery.rs b/crates/database/src/patroni_discovery.rs index f88e4ab84..5d7ae8a6e 100644 --- a/crates/database/src/patroni_discovery.rs +++ b/crates/database/src/patroni_discovery.rs @@ -44,8 +44,43 @@ pub struct ClusterMember { pub timeline: Option, } +/// Patroni lists a node as a member as soon as it registers in the DCS — which +/// happens *before* it publishes its `conn_url`.
So a replica mid-creation (or +/// an uninitialized node) appears without `host`/`port`/`api_url`. Those fields +/// are required on `ClusterMember`, so a strict `Vec<ClusterMember>` parse fails +/// the ENTIRE `/cluster` response with `missing field \`host\`` over one +/// half-registered member — freezing topology discovery for every consumer +/// (cloud-api + chat-api) whenever a new postgres instance is being added. +/// +/// Parse each member independently and drop any that don't fully deserialize; a +/// member with no connection info is not a usable leader/replica target anyway, +/// and the next refresh picks it up once it finishes registering. Same +/// resilience philosophy as `deserialize_lag` — one bad member must not poison +/// the whole cluster view. +fn deserialize_members<'de, D>(deserializer: D) -> Result<Vec<ClusterMember>, D::Error> +where + D: Deserializer<'de>, +{ + let raw = Vec::<serde_json::Value>::deserialize(deserializer)?; + let mut members = Vec::with_capacity(raw.len()); + for value in raw { + match serde_json::from_value::<ClusterMember>(value.clone()) { + Ok(member) => members.push(member), + Err(e) => { + let name = value + .get("name") + .and_then(serde_json::Value::as_str) + .unwrap_or(""); + warn!("Skipping not-yet-ready cluster member {name}: {e}"); + } + } + } + Ok(members) +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ClusterInfo { + #[serde(deserialize_with = "deserialize_members")] pub members: Vec<ClusterMember>, #[serde(default)] pub scope: Option, @@ -369,6 +404,40 @@ mod tests { assert!(member.lag.is_none()); } + #[test] + fn test_initializing_member_does_not_poison_parse() { + // Regression: a replica mid-creation is registered in the DCS before it + // publishes its conn_url, so Patroni omits host/port/api_url. Previously + // this failed the whole parse with `missing field \`host\``, breaking + // discovery for cloud-api + chat-api whenever a postgres instance was + // added. The not-yet-ready member must be dropped, not poison the rest.
+ let json = r#"{"members": [ + {"name":"leader1","role":"leader","state":"running","host":"postgres-a.dstack.internal","port":5432,"timeline":3}, + {"name":"newrep","role":"replica","state":"creating replica","timeline":3} + ],"scope":"pg-cluster"}"#; + let info: ClusterInfo = serde_json::from_str(json).expect("must parse despite half-registered member"); + assert_eq!(info.members.len(), 1, "the not-yet-ready member should be dropped"); + assert_eq!(info.members[0].name, "leader1"); + assert_eq!(info.members[0].role, "leader"); + } + + #[test] + fn test_full_cluster_with_initializing_replica_parses() { + // Real staging /cluster shape with an extra member mid-creation appended + // (no host/port/api_url). The 3 ready members must still parse and the + // leader must still be discoverable. + let json = r#"{"members": [ + {"name":"a","role":"replica","state":"streaming","api_url":"http://[postgres-staging-5hbt5t4n.dstack.internal:8008]:8008/patroni","host":"postgres-staging-5hbt5t4n.dstack.internal","port":5432,"timeline":3,"lag":0}, + {"name":"b","role":"replica","state":"streaming","host":"postgres-yr6k7rmo.dstack.internal","port":5432,"timeline":3,"lag":0}, + {"name":"newrep","role":"replica","state":"creating replica","timeline":3}, + {"name":"leader","role":"leader","state":"running","host":"postgres-ew3zj5pk.dstack.internal","port":5432,"timeline":3} + ],"scope":"pg-cluster"}"#; + let info: ClusterInfo = serde_json::from_str(json).expect("must parse"); + assert_eq!(info.members.len(), 3, "only the 3 fully-registered members survive"); + assert!(info.members.iter().any(|m| m.role == "leader")); + assert!(info.members.iter().all(|m| m.name != "newrep")); + } + #[test] fn test_cluster_with_stopped_member_string_lag() { // Regression: a single stopped replica reporting `"lag": "unknown"` must From aa06b21bb76eb3976906421ea51e7567c921d4b9 Mon Sep 17 00:00:00 2001 From: Evrard-Nil Daillet Date: Mon, 15 Jun 2026 12:55:25 +0200 Subject: [PATCH 2/2] style: rustfmt the
new patroni discovery tests --- crates/database/src/patroni_discovery.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/crates/database/src/patroni_discovery.rs b/crates/database/src/patroni_discovery.rs index 5d7ae8a6e..dd9795410 100644 --- a/crates/database/src/patroni_discovery.rs +++ b/crates/database/src/patroni_discovery.rs @@ -415,8 +415,13 @@ mod tests { {"name":"leader1","role":"leader","state":"running","host":"postgres-a.dstack.internal","port":5432,"timeline":3}, {"name":"newrep","role":"replica","state":"creating replica","timeline":3} ],"scope":"pg-cluster"}"#; - let info: ClusterInfo = serde_json::from_str(json).expect("must parse despite half-registered member"); - assert_eq!(info.members.len(), 1, "the not-yet-ready member should be dropped"); + let info: ClusterInfo = + serde_json::from_str(json).expect("must parse despite half-registered member"); + assert_eq!( + info.members.len(), + 1, + "the not-yet-ready member should be dropped" + ); assert_eq!(info.members[0].name, "leader1"); assert_eq!(info.members[0].role, "leader"); } @@ -433,7 +438,11 @@ mod tests { {"name":"leader","role":"leader","state":"running","host":"postgres-ew3zj5pk.dstack.internal","port":5432,"timeline":3} ],"scope":"pg-cluster"}"#; let info: ClusterInfo = serde_json::from_str(json).expect("must parse"); - assert_eq!(info.members.len(), 3, "only the 3 fully-registered members survive"); + assert_eq!( + info.members.len(), + 3, + "only the 3 fully-registered members survive" + ); assert!(info.members.iter().any(|m| m.role == "leader")); assert!(info.members.iter().all(|m| m.name != "newrep")); }