Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/spock_release_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@
was declared as `int`, causing signed-integer overflow and a crash when a
single replicated transaction exceeded 2 GB of WAL data. Changed to
`uint64`.
* The apply worker now exits cleanly when the upstream connection dies
(firewall reload, walsender SIGKILL/RST, walsender ping timeout), and the
manager respawns it from the last durably committed remote LSN. Previously,
a stale libpq socket fd produced an `epoll_ctl()` cascade with a follow-on
`error during exception handling` per disconnect, and a corner case in the
recovery path could silently advance the replication origin past the
in-flight remote transaction, causing it to be skipped on reconnect.
* Hardened zero-downtime add-node (zodan): logical replication slots created
during `sub_create` are now marked with `failover := true` on PostgreSQL 17+
so that they are picked up by the native slotsync worker; Phase 9 now waits
for apply workers to come up before proceeding; and a crash in the cswp
error path (`CopyErrorData` called while still in `ErrorContext`) has been
fixed.

## Spock 5.0.7

Expand Down
39 changes: 1 addition & 38 deletions src/spock_failover_slots.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,6 @@
#include "replication/slot.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#if PG_VERSION_NUM >= 170000
#include "replication/slotsync.h"
#endif

#include "storage/ipc.h"
#include "storage/procarray.h"
Expand Down Expand Up @@ -1227,14 +1224,6 @@ synchronize_failover_slots(long sleep_time)
void
spock_failover_slots_main(Datum main_arg)
{
#if PG_VERSION_NUM >= 180000
/*
* PostgreSQL 18 has native logical slot synchronization via
* sync_replication_slots = on. This worker is not registered on PG18,
* so this entry point should never be reached.
*/
elog(ERROR, "spock_failover_slots_main: not supported on PostgreSQL 18+");
#else
/* Establish signal handlers. */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
pqsignal(SIGTERM, die);
Expand All @@ -1261,18 +1250,8 @@ spock_failover_slots_main(Datum main_arg)
/*
* On standby, run sync only when hot_standby_feedback is on; otherwise
* use long nap so we never elog(ERROR) for hot_standby_feedback off.
*
* On PG17+, yield entirely to PostgreSQL's native slotsync worker when
* sync_replication_slots = on is configured. IsSyncingReplicationSlots()
* is process-local and would always be false here; instead we check the
* exported sync_replication_slots GUC variable directly — if the DBA
* has enabled the native worker, we must not compete with it.
*/
if (RecoveryInProgress() && hot_standby_feedback
#if PG_VERSION_NUM >= 170000
&& !sync_replication_slots
#endif
)
if (RecoveryInProgress() && hot_standby_feedback)
sleep_time = synchronize_failover_slots(WORKER_NAP_TIME);
else
sleep_time = WORKER_NAP_TIME * 10;
Expand All @@ -1294,7 +1273,6 @@ spock_failover_slots_main(Datum main_arg)
ProcessConfigFile(PGC_SIGHUP);
}
}
#endif /* PG_VERSION_NUM < 180000 */
}

static bool
Expand Down Expand Up @@ -1626,20 +1604,6 @@ spock_init_failover_slot(void)
if (IsBinaryUpgrade)
return;

#if PG_VERSION_NUM >= 180000
/*
* PostgreSQL 18 natively synchronizes logical replication slots to
* physical standbys via sync_replication_slots = on (slotsync worker)
* and provides synchronized_standby_slots for walsender hold-back.
* Spock's failover slot worker is not needed on PG18+.
*
* To enable slot synchronization on PG18, set in postgresql.conf:
* sync_replication_slots = on
* primary_conninfo = '...'
*/
elog(LOG, "spock: skipping failover slot worker on PostgreSQL 18+ "
"(use sync_replication_slots = on instead)");
#else
/* Run the worker. */
memset(&bgw, 0, sizeof(bgw));
bgw.bgw_flags =
Expand All @@ -1655,5 +1619,4 @@ spock_init_failover_slot(void)
/* Install Hooks */
original_client_auth_hook = ClientAuthentication_hook;
ClientAuthentication_hook = attach_to_walsender;
#endif /* PG_VERSION_NUM < 180000 */
}
20 changes: 2 additions & 18 deletions src/spock_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -320,30 +320,14 @@ ensure_replication_slot_snapshot(PGconn *sql_conn, PGconn *repl_conn,
StringInfoData query;
char *snapshot;

(void) use_failover_slot; /* native PG slot-sync intentionally disabled */

retry:
initStringInfo(&query);

appendStringInfo(&query, "CREATE_REPLICATION_SLOT \"%s\" LOGICAL %s",
slot_name, "spock_output");

/*
* Mark the slot with (FAILOVER) when the *remote* provider is PG17+.
* PG17+ supports logical slot synchronization to physical standbys via
* sync_replication_slots = on. PG17+ uses parenthesised option syntax:
* CREATE_REPLICATION_SLOT "name" LOGICAL plugin (FAILOVER)
*
* We key off the regular SQL connection (sql_conn) for version detection.
* Replication protocol connections (repl_conn) return 0 from PQserverVersion()
* so they cannot be used for this check.
*/
if (PQserverVersion(sql_conn) >= 170000)
appendStringInfo(&query, " (FAILOVER)");
#if PG_VERSION_NUM < 170000
else if (use_failover_slot)
appendStringInfo(&query, " (FAILOVER)");
#endif


res = PQexec(repl_conn, query.data);

if (PQresultStatus(res) != PGRES_TUPLES_OK)
Expand Down
127 changes: 50 additions & 77 deletions tests/tap/t/018_failover_slots.pl
Original file line number Diff line number Diff line change
Expand Up @@ -111,31 +111,25 @@ sub wait_until {
"Logical slot created on n1: '$slot_name'");

# ==========================================================================
# 4. Verify FAILOVER flag on slot (PG17+)
# 4. Verify FAILOVER flag on slot is NOT set (spock owns the sync; native
# PG slot-sync is intentionally disabled on this branch).
# ==========================================================================
if ($pg_major >= 17) {
my $fv = scalar_query(1,
"SELECT failover FROM pg_replication_slots WHERE slot_name='$slot_name'");
$fv =~ s/\s+//g;
is($fv, 't',
"PG$pg_major: slot '$slot_name' was created with FAILOVER=true");
is($fv, 'f',
"PG$pg_major: slot '$slot_name' NOT created with FAILOVER (spock handles sync)");
} else {
pass("PG$pg_major: FAILOVER flag not applicable (PG15/16)");
}

# ==========================================================================
# 5. Verify spock failover bgworker state on n1 (primary)
# 5. spock_failover_slots bgworker on primary: not used regardless of version
# (the worker only does work on a standby in recovery). We don't assert
# anything about its presence here — that's checked on the standby below.
# ==========================================================================
if ($pg_major >= 18) {
my $wc = scalar_query(1,
"SELECT count(*) FROM pg_stat_activity
WHERE application_name = 'spock_failover_slots worker'");
$wc =~ s/\s+//g;
is($wc, '0',
"PG18+: spock_failover_slots bgworker not registered on primary");
} else {
pass("PG$pg_major: spock bgworker expected (PG15/16/17 uses it on standby)");
}
pass("PG$pg_major: spock bgworker check deferred to standby (section 12)");

# ==========================================================================
# 6. Create physical replication slot for the standby
Expand Down Expand Up @@ -191,10 +185,9 @@ sub wait_until {
print $conf "log_min_messages = debug1\n";
print $conf "log_replication_commands = on\n";

if ($pg_major >= 17) {
# Enable native slot sync worker on standby
print $conf "sync_replication_slots = on\n";
}
# Native slot sync is intentionally NOT enabled on this branch — spock's
# failover-slot worker handles synchronization for every supported PG
# version, so leave sync_replication_slots at its default (off).
close($conf);
}

Expand All @@ -210,12 +203,10 @@ sub wait_until {
close($aconf);
}

# PG17+: hold walsenders on primary until standby confirms LSN
if ($pg_major >= 17) {
psql_or_bail(1,
"ALTER SYSTEM SET synchronized_standby_slots = 'standby_physical_slot'");
psql_or_bail(1, "SELECT pg_reload_conf()");
}
# synchronized_standby_slots is the native walsender-hold-back mechanism;
# it's intentionally NOT configured here because this branch does not use
# native PG slot sync. Spock's worker covers the sync path on every
# supported PG version.

system_or_bail("$pg_bin/pg_ctl", 'start',
'-D', $standby_datadir, '-l', "$standby_datadir/startup.log", '-w');
Expand Down Expand Up @@ -284,79 +275,66 @@ sub wait_until {
}

# ==========================================================================
# 11. PG17+: verify synced=t and failover=t on standby
# 11. PG17+: standby slot is synced by spock's worker, NOT by native PG
# slotsync. Therefore the slot must show synced=f and failover=f.
# ==========================================================================
if ($pg_major >= 17) {
# Poll until synced=true (slotsync may take a few cycles)
my $fully_synced = wait_until(30, 3, sub {
my $s = qport($pg_bin, $host, $standby_port, $dbname, $db_user,
"SELECT synced FROM pg_replication_slots
WHERE slot_name = '$slot_name'");
$s =~ s/\s+//g;
return $s eq 't';
});
is($fully_synced, 1,
"PG$pg_major: standby slot '$slot_name' has synced=true");
my $sd = qport($pg_bin, $host, $standby_port, $dbname, $db_user,
"SELECT synced FROM pg_replication_slots
WHERE slot_name = '$slot_name'");
$sd =~ s/\s+//g;
is($sd, 'f',
"PG$pg_major: standby slot '$slot_name' has synced=false (spock worker, not native)");

my $fb = qport($pg_bin, $host, $standby_port, $dbname, $db_user,
"SELECT failover FROM pg_replication_slots
WHERE slot_name = '$slot_name'");
$fb =~ s/\s+//g;
is($fb, 't',
"PG$pg_major: standby slot '$slot_name' has failover=true");

# Verify slot LSN on standby is not behind primary by more than 1MB
is($fb, 'f',
"PG$pg_major: standby slot '$slot_name' has failover=false (native sync disabled)");

# Verify slot LSN on standby is set and behind/at primary. spock's
# failover-slot worker prefers restart_lsn (which it sets during
# ReplicationSlotCreate/LogicalIncreaseRestartDecodingForSlot);
# confirmed_flush_lsn may stay NULL until LogicalConfirmReceivedLocation
# runs the first time, so poll for either column.
my $primary_lsn = scalar_query(1, "SELECT pg_current_wal_lsn()");
$primary_lsn =~ s/\s+//g;
my $slot_lsn = qport($pg_bin, $host, $standby_port, $dbname, $db_user,
"SELECT confirmed_flush_lsn FROM pg_replication_slots
WHERE slot_name = '$slot_name'");
$slot_lsn =~ s/\s+//g;
my $lag = qport($pg_bin, $host, $standby_port, $dbname, $db_user,
"SELECT '$primary_lsn'::pg_lsn - confirmed_flush_lsn
FROM pg_replication_slots WHERE slot_name = '$slot_name'");
$lag =~ s/\s+//g;
ok(defined($lag) && $lag ne '',
"PG$pg_major: slot LSN lag from primary is measurable ($lag bytes)");

diag(" primary_lsn=$primary_lsn slot_lsn=$slot_lsn lag=${lag}bytes");
my $slot_lsn = '';
my $slot_lsn_ok = wait_until(30, 2, sub {
$slot_lsn = qport($pg_bin, $host, $standby_port, $dbname, $db_user,
"SELECT coalesce(confirmed_flush_lsn::text, restart_lsn::text, '')
FROM pg_replication_slots WHERE slot_name = '$slot_name'");
$slot_lsn =~ s/\s+//g;
return $slot_lsn ne '';
});
ok($slot_lsn_ok,
"PG$pg_major: standby slot has an LSN set (slot_lsn=$slot_lsn)");

diag(" primary_lsn=$primary_lsn slot_lsn=$slot_lsn");
} else {
pass("PG$pg_major: synced column not available");
pass("PG$pg_major: failover column not available");
pass("PG$pg_major: LSN lag check skipped");
}

# ==========================================================================
# 12. Verify spock_failover_slots bgworker state on standby per PG version:
# PG15/16: worker must be running (sole sync mechanism)
# PG17: worker is registered and present; it yields to native slotsync
# when sync_replication_slots=on but still appears in pg_stat_activity
# PG18+: worker is not registered at all
# 12. spock_failover_slots bgworker must be running on the standby for
# every supported PG version — spock owns slot sync for all of them.
# ==========================================================================
my $bgw_count = qport($pg_bin, $host, $standby_port, $dbname, $db_user,
"SELECT count(*) FROM pg_stat_activity
WHERE application_name = 'spock_failover_slots worker'");
$bgw_count =~ s/\s+//g;

if ($pg_major < 17) {
ok($bgw_count > 0,
"PG$pg_major: spock_failover_slots worker running on standby");
} elsif ($pg_major == 17) {
ok($bgw_count > 0,
"PG17: spock_failover_slots worker registered on standby (yields to native slotsync)");
} else {
pass("PG$pg_major: spock bgworker not expected on standby (PG18+ native slotsync only)");
}
ok($bgw_count > 0,
"PG$pg_major: spock_failover_slots worker running on standby");

# ==========================================================================
# 13. PG18+: confirm no spock bgworker on standby
# 13. (placeholder to keep test count stable across the schedule)
# ==========================================================================
if ($pg_major >= 18) {
is($bgw_count, '0',
"PG18+: no spock_failover_slots bgworker on standby");
} else {
pass("PG$pg_major: bgworker absence check not applicable (< PG18)");
}
pass("PG$pg_major: spock owns failover slot sync regardless of PG version");

# ==========================================================================
# 14. Write data on n1, verify n2 receives it (baseline replication check)
Expand Down Expand Up @@ -483,14 +461,9 @@ sub wait_until {
# ==========================================================================
system("$pg_bin/pg_ctl stop -D $standby_datadir -m immediate >> /dev/null 2>&1");

# Undo primary GUC change so destroy_cluster can restart n1 cleanly
# Restart n1 so destroy_cluster can connect cleanly.
system("$pg_bin/postgres -D $primary_dir >> /dev/null 2>&1 &");
sleep(10);
system_maybe("$pg_bin/psql", '-h', $host, '-p', $primary_port,
'-d', $dbname, '-U', $db_user,
'-c', "ALTER SYSTEM RESET synchronized_standby_slots");
system_maybe("$pg_bin/psql", '-h', $host, '-p', $primary_port,
'-d', $dbname, '-U', $db_user, '-c', "SELECT pg_reload_conf()");

system("rm -rf $standby_datadir 2>/dev/null");

Expand Down
Loading