From f5baea26d42578f7f02ab1b519461c0f0b19240e Mon Sep 17 00:00:00 2001 From: kong Date: Thu, 1 Jan 2026 01:29:16 +0900 Subject: [PATCH] Fix: Handle 'slot already exists' error gracefully during node join When an apply worker crashes after creating a remote slot but before committing the local replication origin, subsequent restart attempts would fail with a FATAL error because the slot already exists. This patch: - Catches the ERRCODE_DUPLICATE_OBJECT (42710) error from CREATE_REPLICATION_SLOT - Logs a message and continues with the existing slot - Creates the local replication origin if it doesn't exist - Allows dynamic node addition to work without manual intervention --- src/pgactive.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/pgactive.c b/src/pgactive.c index 6663a1db..cc114cd8 100644 --- a/src/pgactive.c +++ b/src/pgactive.c @@ -516,10 +516,37 @@ pgactive_create_slot(PGconn *streamConn, Name slot_name, char *remote_ident, if (PQresultStatus(res) != PGRES_TUPLES_OK) { + char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); + /* - * TODO: Should test whether this error is 'already exists' and carry - * on + * If the slot already exists (ERRCODE_DUPLICATE_OBJECT = 42710), + * we can continue - just need to create the local replication identifier. 
*/ + if (sqlstate && strcmp(sqlstate, "42710") == 0) + { + elog(LOG, "replication slot \"%s\" already exists on remote, continuing", + NameStr(*slot_name)); + PQclear(res); + + /* Check if local identifier already exists */ + *replication_identifier = replorigin_by_name(remote_ident, true); + if (*replication_identifier == InvalidRepOriginId) + { + /* Create local identifier since it doesn't exist */ + *replication_identifier = replorigin_create(remote_ident); + elog(DEBUG1, "created replication identifier %u for existing slot", + *replication_identifier); + } + else + { + elog(DEBUG1, "replication identifier %u already exists", + *replication_identifier); + } + + CurrentResourceOwner = pgactive_saved_resowner; + pfree(query.data); + return; + } elog(FATAL, "could not send replication command \"%s\": status %s: %s", query.data,