Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 27 additions & 7 deletions samples/Z0DAN/zodan.sql
Original file line number Diff line number Diff line change
Expand Up @@ -353,9 +353,10 @@ LANGUAGE plpgsql
AS
$$
DECLARE
remotesql text;
result RECORD;
exists_count int;
remotesql text;
result RECORD;
exists_count int;
remote_version int;
BEGIN
-- ============================================================================
-- Step 1: Check if replication slot already exists on remote node
Expand Down Expand Up @@ -383,11 +384,27 @@ BEGIN

-- ============================================================================
-- Step 2: Build remote SQL for replication slot creation
-- On PG17+ pass failover := true so the slot is picked up by the
-- native slotsync worker (sync_replication_slots = on) and is
-- synchronized to physical standbys, matching what Spock does
-- on its own CREATE_REPLICATION_SLOT path (see spock_sync.c).
-- ============================================================================
remotesql := format(
'SELECT slot_name, lsn FROM pg_create_logical_replication_slot(%L, %L)',
slot_name, plugin
);
SELECT v INTO remote_version
FROM dblink(node_dsn, 'SHOW server_version_num') AS t(v int);

IF remote_version >= 170000 THEN
remotesql := format(
'SELECT slot_name, lsn '
'FROM pg_create_logical_replication_slot(%L, %L, false, false, true)',
slot_name, plugin
);
ELSE
remotesql := format(
'SELECT slot_name, lsn '
'FROM pg_create_logical_replication_slot(%L, %L)',
slot_name, plugin
);
END IF;

IF verb THEN
RAISE NOTICE '[QUERY] %', remotesql;
Expand Down Expand Up @@ -1768,6 +1785,9 @@ BEGIN
verb -- verbose
);
RAISE NOTICE ' ✓ %', rpad('Creating subscription ' || sub_name || ' on node ' || rec.node_name || '...', 120, ' ');
-- Give the apply worker on rec.node_name time to come up and
-- create its slot on new_node before the next iteration
-- (and before subsequent zodan phases poke this state).
PERFORM pg_sleep(5);
subscription_count := subscription_count + 1;
EXCEPTION
Expand Down
13 changes: 12 additions & 1 deletion src/spock_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -1483,7 +1483,17 @@ spock_sync_subscription(SpockSubscription *sub)
}
PG_CATCH();
{
ErrorData *edata = CopyErrorData();
MemoryContext savecxt;
ErrorData *edata;

/*
* CopyErrorData() requires that we are NOT running in
* ErrorContext, otherwise its assertion in elog.c trips on
* cassert builds and the apply worker dies with SIGABRT.
* Switch into our long-lived sync context first.
*/
savecxt = MemoryContextSwitchTo(myctx);
edata = CopyErrorData();

FlushErrorState();
elog(LOG, "SPOCK cswp error sub=%s slot=%s: %s",
Expand All @@ -1502,6 +1512,7 @@ spock_sync_subscription(SpockSubscription *sub)
}

FreeErrorData(edata);
MemoryContextSwitchTo(savecxt);
PG_RE_THROW();
}
PG_END_TRY();
Expand Down
64 changes: 64 additions & 0 deletions tests/tap/t/018_failover_slots.pl
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,70 @@ sub wait_until {
});
ok($data_ok, 'Row (1, before_failover) replicated n1 -> n2 before failover');

# ==========================================================================
# 14b. REGRESSION: read-only standby is queryable while spock is loaded
#
# A customer reported that after enabling spock with logical slot failover,
# the hot_standby could not be queried — basic SELECTs failed because of
# spock interactions on a recovery backend. Re-running the full
# slot-failover dance is not enough; we need explicit assertions that the
# standby answers user SELECT, spock catalog SELECT, and pg_replication_slots
# while it's still in recovery. Without these checks a future regression
# could quietly reintroduce the same bug.
# ==========================================================================

# Wait for the standby to replay WAL up to the primary's current LSN, so the
# row just written on n1 is visible there. Every assertion below reads from
# the standby and would be racy without this barrier.
# NOTE(review): wait_until(60, 2, ...) presumably means "up to 60s, polling
# every 2s" — confirm against the helper's definition.
my $primary_wal_lsn = scalar_query(1, "SELECT pg_current_wal_lsn()");
$primary_wal_lsn =~ s/\s+//g;
my $standby_caught_up = wait_until(60, 2, sub {
    my $rl = qport($pg_bin, $host, $standby_port, $dbname, $db_user,
        "SELECT pg_last_wal_replay_lsn() >= '$primary_wal_lsn'::pg_lsn");
    $rl =~ s/\s+//g;
    return $rl eq 't';
});
ok($standby_caught_up,
    "Standby applied WAL up to primary lsn $primary_wal_lsn");

# Standby must still be in recovery — confirms hot_standby mode and that
# no spock hook accidentally took the standby out of recovery.
my $still_in_recovery = qport($pg_bin, $host, $standby_port,
    $dbname, $db_user, "SELECT pg_is_in_recovery()");
$still_in_recovery =~ s/\s+//g;
is($still_in_recovery, 't',
    'Read-only standby is still in recovery (hot_standby mode)');

# 1) User-table SELECT against the standby returns the committed row.
my $val_on_standby = qport($pg_bin, $host, $standby_port,
    $dbname, $db_user, "SELECT val FROM failover_test WHERE id = 1");
$val_on_standby =~ s/\s+//g;
is($val_on_standby, 'before_failover',
    'Read-only standby returns committed user data (SELECT works)');

# 2) Spock catalog SELECT against the standby — the original customer
#    failure mode was that spock.* reads errored out on a recovery backend.
#    The regex guard rejects empty or error output before the numeric
#    comparison, so a failed query cannot be mistaken for a row count.
my $standby_node_count = qport($pg_bin, $host, $standby_port,
    $dbname, $db_user, "SELECT count(*) FROM spock.node");
$standby_node_count =~ s/\s+//g;
ok(($standby_node_count =~ /^\d+$/) && $standby_node_count >= 1,
    "Read-only standby returns spock.node ($standby_node_count rows)");

# 3) The synced logical slot is visible on the standby.
my $standby_slot_count = qport($pg_bin, $host, $standby_port,
    $dbname, $db_user,
    "SELECT count(*) FROM pg_replication_slots WHERE slot_name = '$slot_name'");
$standby_slot_count =~ s/\s+//g;
is($standby_slot_count, '1',
    "Read-only standby returns synced slot '$slot_name' via pg_replication_slots");

# 4) Writes are rejected — the standby must remain read-only.
#    A non-zero psql exit status on its own is not proof: a connection or
#    authentication failure produces one too. Capture psql's output and also
#    require SQLSTATE 25006 (read_only_sql_transaction). VERBOSITY=sqlstate
#    makes psql print the error code rather than the message text, so the
#    match does not depend on the server's lc_messages setting.
my $write_cmd =
      "$pg_bin/psql -X -h $host -p $standby_port -d $dbname -U $db_user "
    . "-v ON_ERROR_STOP=1 -v VERBOSITY=sqlstate "
    . "-c \"INSERT INTO failover_test VALUES (999, 'must_fail')\" "
    . "2>&1";
my $write_out = `$write_cmd`;
my $write_rc  = $?;
$write_out = '' unless defined $write_out;    # backticks yield undef if exec fails
ok($write_rc != 0 && $write_out =~ /\b25006\b/,
    'Write against read-only standby is rejected (read-only enforced)')
    or diag("psql exit status: $write_rc; output: $write_out");
Comment thread
ibrarahmad marked this conversation as resolved.

# ==========================================================================
# 15. Verify invalidation_reason is NULL (slot is healthy on standby)
# ==========================================================================
Expand Down
Loading