diff --git a/samples/Z0DAN/zodan.sql b/samples/Z0DAN/zodan.sql index 98d109f0..f7da08eb 100644 --- a/samples/Z0DAN/zodan.sql +++ b/samples/Z0DAN/zodan.sql @@ -353,9 +353,10 @@ LANGUAGE plpgsql AS $$ DECLARE - remotesql text; - result RECORD; - exists_count int; + remotesql text; + result RECORD; + exists_count int; + remote_version int; BEGIN -- ============================================================================ -- Step 1: Check if replication slot already exists on remote node @@ -383,11 +384,27 @@ BEGIN -- ============================================================================ -- Step 2: Build remote SQL for replication slot creation + -- On PG17+ pass failover := true so the slot is picked up by the + -- native slotsync worker (sync_replication_slots = on) and is + -- synchronized to physical standbys, matching what Spock does + -- on its own CREATE_REPLICATION_SLOT path (see spock_sync.c). -- ============================================================================ - remotesql := format( - 'SELECT slot_name, lsn FROM pg_create_logical_replication_slot(%L, %L)', - slot_name, plugin - ); + SELECT v INTO remote_version + FROM dblink(node_dsn, 'SHOW server_version_num') AS t(v int); + + IF remote_version >= 170000 THEN + remotesql := format( + 'SELECT slot_name, lsn ' + 'FROM pg_create_logical_replication_slot(%L, %L, false, false, true)', + slot_name, plugin + ); + ELSE + remotesql := format( + 'SELECT slot_name, lsn ' + 'FROM pg_create_logical_replication_slot(%L, %L)', + slot_name, plugin + ); + END IF; IF verb THEN RAISE NOTICE '[QUERY] %', remotesql; @@ -1768,6 +1785,9 @@ BEGIN verb -- verbose ); RAISE NOTICE ' ✓ %', rpad('Creating subscription ' || sub_name || ' on node ' || rec.node_name || '...', 120, ' '); + -- Allow the apply worker on rec.node time to come up and + -- create its slot on new_node before the next iteration + -- (and before subsequent zodan phases poke this state). PERFORM pg_sleep(5); subscription_count := subscription_count + 1; EXCEPTION diff --git a/src/spock_sync.c b/src/spock_sync.c index 99a97aaa..50d16971 100644 --- a/src/spock_sync.c +++ b/src/spock_sync.c @@ -1483,7 +1483,17 @@ spock_sync_subscription(SpockSubscription *sub) } PG_CATCH(); { - ErrorData *edata = CopyErrorData(); + MemoryContext savecxt; + ErrorData *edata; + + /* + * CopyErrorData() requires that we are NOT running in + * ErrorContext, otherwise its assertion in elog.c trips on + * cassert builds and the apply worker dies with SIGABRT. + * Switch into our long-lived sync context first. + */ + savecxt = MemoryContextSwitchTo(myctx); + edata = CopyErrorData(); FlushErrorState(); elog(LOG, "SPOCK cswp error sub=%s slot=%s: %s", @@ -1502,6 +1512,7 @@ spock_sync_subscription(SpockSubscription *sub) } FreeErrorData(edata); + MemoryContextSwitchTo(savecxt); PG_RE_THROW(); } PG_END_TRY(); diff --git a/tests/tap/t/018_failover_slots.pl b/tests/tap/t/018_failover_slots.pl index 3f1a0256..e0417290 100644 --- a/tests/tap/t/018_failover_slots.pl +++ b/tests/tap/t/018_failover_slots.pl @@ -390,6 +390,70 @@ sub wait_until { }); ok($data_ok, 'Row (1, before_failover) replicated n1 -> n2 before failover'); +# ========================================================================== +# 14b. REGRESSION: read-only standby is queryable while spock is loaded +# +# A customer reported that after enabling spock with logical slot failover, +# the hot_standby could not be queried — basic SELECTs failed because of +# spock interactions on a recovery backend. Re-running the full +# slot-failover dance is not enough; we need explicit assertions that the +# standby answers user SELECT, spock catalog SELECT, and pg_replication_slots +# while it's still in recovery. Without these checks a future regression +# could quietly reintroduce the same bug. +# ========================================================================== + +# Wait for the standby to apply the row we just wrote on n1. +my $primary_wal_lsn = scalar_query(1, "SELECT pg_current_wal_lsn()"); +$primary_wal_lsn =~ s/\s+//g; +my $standby_caught_up = wait_until(60, 2, sub { + my $rl = qport($pg_bin, $host, $standby_port, $dbname, $db_user, + "SELECT pg_last_wal_replay_lsn() >= '$primary_wal_lsn'::pg_lsn"); + $rl =~ s/\s+//g; + return $rl eq 't'; +}); +ok($standby_caught_up, + "Standby applied WAL up to primary lsn $primary_wal_lsn"); + +# Standby must still be in recovery — confirms hot_standby mode and that +# no spock hook accidentally took the standby out of recovery. +my $still_in_recovery = qport($pg_bin, $host, $standby_port, + $dbname, $db_user, "SELECT pg_is_in_recovery()"); +$still_in_recovery =~ s/\s+//g; +is($still_in_recovery, 't', + 'Read-only standby is still in recovery (hot_standby mode)'); + +# 1) User-table SELECT against the standby returns the committed row. +my $val_on_standby = qport($pg_bin, $host, $standby_port, + $dbname, $db_user, "SELECT val FROM failover_test WHERE id = 1"); +$val_on_standby =~ s/\s+//g; +is($val_on_standby, 'before_failover', + 'Read-only standby returns committed user data (SELECT works)'); + +# 2) Spock catalog SELECT against the standby — the original customer +# failure mode was that spock.* reads errored out on a recovery backend. +my $standby_node_count = qport($pg_bin, $host, $standby_port, + $dbname, $db_user, "SELECT count(*) FROM spock.node"); +$standby_node_count =~ s/\s+//g; +ok(($standby_node_count =~ /^\d+$/) && $standby_node_count >= 1, + "Read-only standby returns spock.node ($standby_node_count rows)"); + +# 3) The synced logical slot is visible on the standby. +my $standby_slot_count = qport($pg_bin, $host, $standby_port, + $dbname, $db_user, + "SELECT count(*) FROM pg_replication_slots WHERE slot_name = '$slot_name'"); +$standby_slot_count =~ s/\s+//g; +is($standby_slot_count, '1', + "Read-only standby returns synced slot '$slot_name' via pg_replication_slots"); + +# 4) Writes are rejected — the standby must remain read-only. +my $write_rc = system( + "$pg_bin/psql -X -h $host -p $standby_port -d $dbname -U $db_user " + . "-v ON_ERROR_STOP=1 " + . "-c \"INSERT INTO failover_test VALUES (999, 'must_fail')\" " + . ">/dev/null 2>&1"); +isnt($write_rc, 0, + 'Write against read-only standby is rejected (read-only enforced)'); + # ========================================================================== # 15. Verify invalidation_reason is NULL (slot is healthy on standby) # ==========================================================================