From 453e0e3f0ef3202386b553719f628cef93ff95a7 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 17 Jun 2020 11:05:42 -0400 Subject: [PATCH 001/334] Minor code cleanup for perform_base_backup(). Merge two calls to sendDir() that are exactly the same except for the fifth argument. Adjust comments to match. Also, don't bother checking whether tblspc_map_file is NULL. We initialize it in all cases, so it can't be. Patch by me, reviewed by Amit Kapila and Kyotaro Horiguchi. Discussion: http://postgr.es/m/CA+TgmoYq+59SJ2zBbP891ngWPA9fymOqntqYcweSDYXS2a620A@mail.gmail.com --- src/backend/replication/basebackup.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 62633e7ddcd5..efcf1e6eb56a 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -269,7 +269,7 @@ perform_base_backup(basebackup_options *opt) XLogRecPtr endptr; TimeLineID endtli; StringInfo labelfile; - StringInfo tblspc_map_file = NULL; + StringInfo tblspc_map_file; backup_manifest_info manifest; int datadirpathlen; List *tablespaces = NIL; @@ -424,25 +424,23 @@ perform_base_backup(basebackup_options *opt) if (ti->path == NULL) { struct stat statbuf; + bool sendtblspclinks = true; /* In the main tar, include the backup_label first... */ sendFileWithContent(BACKUP_LABEL_FILE, labelfile->data, &manifest); - /* - * Send tablespace_map file if required and then the bulk of - * the files. - */ - if (tblspc_map_file && opt->sendtblspcmapfile) + /* Then the tablespace_map file, if required... */ + if (opt->sendtblspcmapfile) { sendFileWithContent(TABLESPACE_MAP, tblspc_map_file->data, &manifest); - sendDir(".", 1, false, tablespaces, false, - &manifest, NULL); + sendtblspclinks = false; } - else - sendDir(".", 1, false, tablespaces, true, - &manifest, NULL); + + /* Then the bulk of the files... 
*/ + sendDir(".", 1, false, tablespaces, sendtblspclinks, + &manifest, NULL); /* ... and pg_control after everything else. */ if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) From 2fd2effc50824a8775a088435a13f47b7a6f3b94 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 17 Jun 2020 11:39:17 -0400 Subject: [PATCH 002/334] Improve server code to read files as part of a base backup. Don't use fread(), since that doesn't necessarily set errno. We could use read() instead, but it's even better to use pg_pread(), which allows us to avoid some extra calls to seek to the desired location in the file. Also, advertise a wait event while reading from a file, as we do for most other places where we're reading data from files. Patch by me, reviewed by Hamid Akhtar. Discussion: http://postgr.es/m/CA+TgmobBw-3573vMosGj06r72ajHsYeKtksT_oTxH8XvTL7DxA@mail.gmail.com --- doc/src/sgml/monitoring.sgml | 4 + src/backend/postmaster/pgstat.c | 3 + src/backend/replication/basebackup.c | 143 ++++++++++++++------------- src/include/pgstat.h | 3 +- 4 files changed, 86 insertions(+), 67 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 89662cc0a367..dfa9d0d6410c 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1193,6 +1193,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser + + BaseBackupRead + Waiting for base backup to read from a file. + BufFileRead Waiting for a read from a buffered file. 
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index e96134dac8aa..c022597bc09a 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3931,6 +3931,9 @@ pgstat_get_wait_io(WaitEventIO w) switch (w) { + case WAIT_EVENT_BASEBACKUP_READ: + event_name = "BaseBackupRead"; + break; case WAIT_EVENT_BUFFILE_READ: event_name = "BufFileRead"; break; diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index efcf1e6eb56a..096b0fcef0d1 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -81,6 +81,8 @@ static int compareWalFileNames(const ListCell *a, const ListCell *b); static void throttle(size_t increment); static void update_basebackup_progress(int64 delta); static bool is_checksummed_file(const char *fullpath, const char *filename); +static int basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, + const char *filename, bool partial_read_ok); /* Was the backup currently in-progress initiated in recovery mode? */ static bool backup_started_in_recovery = false; @@ -98,18 +100,6 @@ static char *statrelpath = NULL; */ #define THROTTLING_FREQUENCY 8 -/* - * Checks whether we encountered any error in fread(). fread() doesn't give - * any clue what has happened, so we check with ferror(). Also, neither - * fread() nor ferror() set errno, so we just throw a generic error. - */ -#define CHECK_FREAD_ERROR(fp, filename) \ -do { \ - if (ferror(fp)) \ - ereport(ERROR, \ - (errmsg("could not read from file \"%s\"", filename))); \ -} while (0) - /* The actual number of bytes, transfer of which may cause sleep. 
*/ static uint64 throttling_sample; @@ -600,7 +590,7 @@ perform_base_backup(basebackup_options *opt) foreach(lc, walFileList) { char *walFileName = (char *) lfirst(lc); - FILE *fp; + int fd; char buf[TAR_SEND_SIZE]; size_t cnt; pgoff_t len = 0; @@ -608,8 +598,8 @@ perform_base_backup(basebackup_options *opt) snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName); XLogFromFileName(walFileName, &tli, &segno, wal_segment_size); - fp = AllocateFile(pathbuf, "rb"); - if (fp == NULL) + fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY); + if (fd < 0) { int save_errno = errno; @@ -626,7 +616,7 @@ perform_base_backup(basebackup_options *opt) errmsg("could not open file \"%s\": %m", pathbuf))); } - if (fstat(fileno(fp), &statbuf) != 0) + if (fstat(fd, &statbuf) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", @@ -642,9 +632,10 @@ perform_base_backup(basebackup_options *opt) /* send the WAL file itself */ _tarWriteHeader(pathbuf, NULL, &statbuf, false); - while ((cnt = fread(buf, 1, - Min(sizeof(buf), wal_segment_size - len), - fp)) > 0) + while ((cnt = basebackup_read_file(fd, buf, + Min(sizeof(buf), + wal_segment_size - len), + len, pathbuf, true)) > 0) { CheckXLogRemoved(segno, tli); /* Send the chunk as a CopyData message */ @@ -660,8 +651,6 @@ perform_base_backup(basebackup_options *opt) break; } - CHECK_FREAD_ERROR(fp, pathbuf); - if (len != wal_segment_size) { CheckXLogRemoved(segno, tli); @@ -676,7 +665,7 @@ perform_base_backup(basebackup_options *opt) */ Assert(wal_segment_size % TAR_BLOCK_SIZE == 0); - FreeFile(fp); + CloseTransientFile(fd); /* * Mark file as archived, otherwise files can get archived again @@ -1575,7 +1564,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf, bool missing_ok, Oid dboid, backup_manifest_info *manifest, const char *spcoid) { - FILE *fp; + int fd; BlockNumber blkno = 0; bool block_retry = false; char buf[TAR_SEND_SIZE]; @@ -1594,8 +1583,8 @@ sendFile(const 
char *readfilename, const char *tarfilename, pg_checksum_init(&checksum_ctx, manifest->checksum_type); - fp = AllocateFile(readfilename, "rb"); - if (fp == NULL) + fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY); + if (fd < 0) { if (errno == ENOENT && missing_ok) return false; @@ -1637,8 +1626,27 @@ sendFile(const char *readfilename, const char *tarfilename, } } - while ((cnt = fread(buf, 1, Min(sizeof(buf), statbuf->st_size - len), fp)) > 0) + /* + * Loop until we read the amount of data the caller told us to expect. The + * file could be longer, if it was extended while we were sending it, but + * for a base backup we can ignore such extended data. It will be restored + * from WAL. + */ + while (len < statbuf->st_size) { + /* Try to read some more data. */ + cnt = basebackup_read_file(fd, buf, + Min(sizeof(buf), statbuf->st_size - len), + len, readfilename, true); + + /* + * If we hit end-of-file, a concurrent truncation must have occurred. + * That's not an error condition, because WAL replay will fix things + * up. + */ + if (cnt == 0) + break; + /* * The checksums are verified at block level, so we iterate over the * buffer in chunks of BLCKSZ, after making sure that @@ -1689,16 +1697,15 @@ sendFile(const char *readfilename, const char *tarfilename, */ if (block_retry == false) { - /* Reread the failed block */ - if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fseek in file \"%s\": %m", - readfilename))); - } + int reread_cnt; - if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ) + /* Reread the failed block */ + reread_cnt = + basebackup_read_file(fd, buf + BLCKSZ * i, + BLCKSZ, len + BLCKSZ * i, + readfilename, + false); + if (reread_cnt == 0) { /* * If we hit end-of-file, a concurrent @@ -1708,24 +1715,8 @@ sendFile(const char *readfilename, const char *tarfilename, * code that handles that case. (We must fix * up cnt first, though.) 
*/ - if (feof(fp)) - { - cnt = BLCKSZ * i; - break; - } - - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not reread block %d of file \"%s\": %m", - blkno, readfilename))); - } - - if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fseek in file \"%s\": %m", - readfilename))); + cnt = BLCKSZ * i; + break; } /* Set flag so we know a retry was attempted */ @@ -1768,20 +1759,8 @@ sendFile(const char *readfilename, const char *tarfilename, len += cnt; throttle(cnt); - - if (feof(fp) || len >= statbuf->st_size) - { - /* - * Reached end of file. The file could be longer, if it was - * extended while we were sending it, but for a base backup we can - * ignore such extended data. It will be restored from WAL. - */ - break; - } } - CHECK_FREAD_ERROR(fp, readfilename); - /* If the file was truncated while we were sending it, pad it with zeros */ if (len < statbuf->st_size) { @@ -1810,7 +1789,7 @@ sendFile(const char *readfilename, const char *tarfilename, update_basebackup_progress(pad); } - FreeFile(fp); + CloseTransientFile(fd); if (checksum_failures > 1) { @@ -1996,3 +1975,35 @@ update_basebackup_progress(int64 delta) pgstat_progress_update_multi_param(nparam, index, val); } + +/* + * Read some data from a file, setting a wait event and reporting any error + * encountered. + * + * If partial_read_ok is false, also report an error if the number of bytes + * read is not equal to the number of bytes requested. + * + * Returns the number of bytes read. 
+ */ +static int +basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, + const char *filename, bool partial_read_ok) +{ + int rc; + + pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ); + rc = pg_pread(fd, buf, nbytes, offset); + pgstat_report_wait_end(); + + if (rc < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", filename))); + if (!partial_read_ok && rc > 0 && rc != nbytes) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": read %d of %zu", + filename, rc, nbytes))); + + return rc; +} diff --git a/src/include/pgstat.h b/src/include/pgstat.h index c55dc1481ca5..13872013823e 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -913,7 +913,8 @@ typedef enum */ typedef enum { - WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO, + WAIT_EVENT_BASEBACKUP_READ = PG_WAIT_IO, + WAIT_EVENT_BUFFILE_READ, WAIT_EVENT_BUFFILE_WRITE, WAIT_EVENT_CONTROL_FILE_READ, WAIT_EVENT_CONTROL_FILE_SYNC, From fd49d53807575e009f7b66771d48c9356344d7d1 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 15 Jun 2020 18:23:10 -0700 Subject: [PATCH 003/334] Avoid potential spinlock in a signal handler as part of global barriers. On platforms without support for 64bit atomic operations where we also cannot rely on 64bit reads to have single copy atomicity, such atomics are implemented using a spinlock based fallback. That means it's not safe to even read such atomics from within a signal handler (since the signal handler might run when the spinlock already is held). To avoid this issue defer global barrier processing out of the signal handler. Instead of checking local / shared barrier generation to determine whether to set ProcSignalBarrierPending, introduce PROCSIGNAL_BARRIER and always set ProcSignalBarrierPending when receiving such a signal. Additionally avoid redundant work in ProcessProcSignalBarrier if ProcSignalBarrierPending is unnecessarily. Also do a small amount of other polishing. 
Author: Andres Freund Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20200609193723.eu5ilsjxwdpyxhgz@alap3.anarazel.de Backpatch: 13-, where the code was introduced. --- src/backend/storage/ipc/procsignal.c | 87 ++++++++++++++++------------ src/include/storage/procsignal.h | 1 + 2 files changed, 52 insertions(+), 36 deletions(-) diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index c809196d06a4..4fa385b0ece4 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -320,7 +320,7 @@ SendProcSignal(pid_t pid, ProcSignalReason reason, BackendId backendId) uint64 EmitProcSignalBarrier(ProcSignalBarrierType type) { - uint64 flagbit = UINT64CONST(1) << (uint64) type; + uint32 flagbit = 1 << (uint32) type; uint64 generation; /* @@ -363,7 +363,11 @@ EmitProcSignalBarrier(ProcSignalBarrierType type) pid_t pid = slot->pss_pid; if (pid != 0) + { + /* see SendProcSignal for details */ + slot->pss_signalFlags[PROCSIG_BARRIER] = true; kill(pid, SIGUSR1); + } } return generation; @@ -383,6 +387,8 @@ WaitForProcSignalBarrier(uint64 generation) { long timeout = 125L; + Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration)); + for (int i = NumProcSignalSlots - 1; i >= 0; i--) { volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; @@ -417,6 +423,23 @@ WaitForProcSignalBarrier(uint64 generation) pg_memory_barrier(); } +/* + * Handle receipt of an interrupt indicating a global barrier event. + * + * All the actual work is deferred to ProcessProcSignalBarrier(), because we + * cannot safely access the barrier generation inside the signal handler as + * 64bit atomics might use spinlock based emulation, even for reads. As this + * routine only gets called when PROCSIG_BARRIER is sent that won't cause a + * lot fo unnecessary work. 
+ */ +static void +HandleProcSignalBarrierInterrupt(void) +{ + InterruptPending = true; + ProcSignalBarrierPending = true; + /* latch will be set by procsignal_sigusr1_handler */ +} + /* * Perform global barrier related interrupt checking. * @@ -428,22 +451,38 @@ WaitForProcSignalBarrier(uint64 generation) void ProcessProcSignalBarrier(void) { - uint64 generation; + uint64 local_gen; + uint64 shared_gen; uint32 flags; + Assert(MyProcSignalSlot); + /* Exit quickly if there's no work to do. */ if (!ProcSignalBarrierPending) return; ProcSignalBarrierPending = false; /* - * Read the current barrier generation, and then get the flags that are - * set for this backend. Note that pg_atomic_exchange_u32 is a full - * barrier, so we're guaranteed that the read of the barrier generation - * happens before we atomically extract the flags, and that any subsequent - * state changes happen afterward. + * It's not unlikely to process multiple barriers at once, before the + * signals for all the barriers have arrived. To avoid unnecessary work in + * response to subsequent signals, exit early if we already have processed + * all of them. + */ + local_gen = pg_atomic_read_u64(&MyProcSignalSlot->pss_barrierGeneration); + shared_gen = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); + + Assert(local_gen <= shared_gen); + + if (local_gen == shared_gen) + return; + + /* + * Get and clear the flags that are set for this backend. Note that + * pg_atomic_exchange_u32 is a full barrier, so we're guaranteed that the + * read of the barrier generation above happens before we atomically + * extract the flags, and that any subsequent state changes happen + * afterward. */ - generation = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); flags = pg_atomic_exchange_u32(&MyProcSignalSlot->pss_barrierCheckMask, 0); /* @@ -466,7 +505,7 @@ ProcessProcSignalBarrier(void) * things have changed further, it'll get fixed up when this function is * next called. 
*/ - pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, generation); + pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, shared_gen); } static void @@ -505,27 +544,6 @@ CheckProcSignal(ProcSignalReason reason) return false; } -/* - * CheckProcSignalBarrier - check for new barriers we need to absorb - */ -static bool -CheckProcSignalBarrier(void) -{ - volatile ProcSignalSlot *slot = MyProcSignalSlot; - - if (slot != NULL) - { - uint64 mygen; - uint64 curgen; - - mygen = pg_atomic_read_u64(&slot->pss_barrierGeneration); - curgen = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); - return (mygen != curgen); - } - - return false; -} - /* * procsignal_sigusr1_handler - handle SIGUSR1 signal. */ @@ -546,6 +564,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_WALSND_INIT_STOPPING)) HandleWalSndInitStopping(); + if (CheckProcSignal(PROCSIG_BARRIER)) + HandleProcSignalBarrierInterrupt(); + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE)) RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE); @@ -564,12 +585,6 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN)) RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); - if (CheckProcSignalBarrier()) - { - InterruptPending = true; - ProcSignalBarrierPending = true; - } - SetLatch(MyLatch); latch_sigusr1_handler(); diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index a0c0bc3ce553..5cb39697f38f 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -33,6 +33,7 @@ typedef enum PROCSIG_NOTIFY_INTERRUPT, /* listen/notify interrupt */ PROCSIG_PARALLEL_MESSAGE, /* message from cooperating parallel backend */ PROCSIG_WALSND_INIT_STOPPING, /* ask walsenders to prepare for shutdown */ + PROCSIG_BARRIER, /* global barrier interrupt */ /* Recovery conflict reasons */ PROCSIG_RECOVERY_CONFLICT_DATABASE, From 4d4ca24efe8ebda9547337f47dcb61d3163be765 
Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 8 Jun 2020 15:25:49 -0700 Subject: [PATCH 004/334] spinlock emulation: Fix bug when more than INT_MAX spinlocks are initialized. Once the counter goes negative we ended up with spinlocks that errored out on first use (due to check in tas_sema). Author: Andres Freund Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20200606023103.avzrctgv7476xj7i@alap3.anarazel.de Backpatch: 9.5- --- src/backend/storage/lmgr/spin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c index 4d2a4c6641aa..753943e46d62 100644 --- a/src/backend/storage/lmgr/spin.c +++ b/src/backend/storage/lmgr/spin.c @@ -106,7 +106,7 @@ SpinlockSemaInit(void) void s_init_lock_sema(volatile slock_t *lock, bool nested) { - static int counter = 0; + static uint32 counter = 0; *lock = ((++counter) % NUM_SPINLOCK_SEMAPHORES) + 1; } From 6924c37f772cd7701d3e1267a1fb3221ca159ba4 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 17 Jun 2020 15:23:55 -0700 Subject: [PATCH 005/334] Fix nbtree.h dedup state comment. Oversight in commit 0d861bbb. --- src/include/access/nbtree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 4e2b056b5456..3b2bcb22a70e 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -758,7 +758,7 @@ typedef struct BTDedupStateData * will not become posting list tuples do not appear in the array (they * are implicitly unchanged by deduplication pass). 
*/ - int nintervals; /* current size of intervals array */ + int nintervals; /* current number of intervals in array */ BTDedupInterval intervals[MaxIndexTuplesPerPage]; } BTDedupStateData; From d8b15eeb8a1acbe01b502ddd3390d7f1824c7a25 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 17 Jun 2020 18:29:29 -0400 Subject: [PATCH 006/334] Sync our copy of the timezone library with IANA release tzcode2020a. This absorbs a leap-second-related bug fix in localtime.c, and teaches zic to handle an expiration marker in the leapseconds file. Neither are of any interest to us (for the foreseeable future anyway), but we need to stay more or less in sync with upstream. Also adjust some over-eager changes in the README from commit 957338418. I have no intention of making changes that require C99 in this code, until such time as all the live back branches require C99. Otherwise back-patching will get too exciting. For the same reason, absorb assorted whitespace and other cosmetic changes from HEAD into the back branches; mostly this reflects use of improved versions of pgindent. All in all then, quite a boring update. But I figured I'd get it done while I was looking at this code. --- src/timezone/README | 18 ++++- src/timezone/localtime.c | 23 +++++- src/timezone/zic.c | 169 ++++++++++++++++++++++++--------------- 3 files changed, 142 insertions(+), 68 deletions(-) diff --git a/src/timezone/README b/src/timezone/README index 3c593933c1c8..9939aa6dd7ea 100644 --- a/src/timezone/README +++ b/src/timezone/README @@ -55,7 +55,7 @@ match properly on the old version. Time Zone code ============== -The code in this directory is currently synced with tzcode release 2019b. +The code in this directory is currently synced with tzcode release 2020a. There are many cosmetic (and not so cosmetic) differences from the original tzcode library, but diffs in the upstream version should usually be propagated to our version. Here are some notes about that. @@ -71,7 +71,14 @@ fixed that.) 
* We need the code to follow Postgres' portability conventions; this includes relying on configure's results rather than hand-hacked -#defines (see private.h). +#defines (see private.h in particular). + +* Similarly, avoid relying on features that may not exist on old +systems. In particular this means using Postgres' definitions of the int32 +and int64 typedefs, not int_fast32_t/int_fast64_t. Likewise we use +PG_INT32_MIN/MAX not INT32_MIN/MAX. (Once we desupport all PG versions +that don't require C99, it'd be practical to rely on and remove +this set of diffs; but that day is not yet.) * Since Postgres is typically built on a system that has its own copy of the functions, we must avoid conflicting with those. This @@ -109,6 +116,13 @@ to first run the tzcode source files through a sed filter like this: -e 's|^\*/| */|' \ -e 's/\bregister[ \t]//g' \ -e 's/\bATTRIBUTE_PURE[ \t]//g' \ + -e 's/int_fast32_t/int32/g' \ + -e 's/int_fast64_t/int64/g' \ + -e 's/intmax_t/int64/g' \ + -e 's/INT32_MIN/PG_INT32_MIN/g' \ + -e 's/INT32_MAX/PG_INT32_MAX/g' \ + -e 's/INTMAX_MIN/PG_INT64_MIN/g' \ + -e 's/INTMAX_MAX/PG_INT64_MAX/g' \ -e 's/struct[ \t]+tm\b/struct pg_tm/g' \ -e 's/\btime_t\b/pg_time_t/g' \ -e 's/lineno/lineno_t/g' \ diff --git a/src/timezone/localtime.c b/src/timezone/localtime.c index 787f0b69d630..0f65f3c648e5 100644 --- a/src/timezone/localtime.c +++ b/src/timezone/localtime.c @@ -92,6 +92,7 @@ struct rule static struct pg_tm *gmtsub(pg_time_t const *, int32, struct pg_tm *); static bool increment_overflow(int *, int); static bool increment_overflow_time(pg_time_t *, int32); +static int64 leapcorr(struct state const *, pg_time_t); static struct pg_tm *timesub(pg_time_t const *, int32, struct state const *, struct pg_tm *); static bool typesequiv(struct state const *, int, int); @@ -477,12 +478,14 @@ tzloadbody(char const *name, char *canonname, struct state *sp, bool doextend, for (i = 0; i < ts->timecnt; i++) if (sp->timecnt == 0 - || sp->ats[sp->timecnt - 1] < 
ts->ats[i]) + || (sp->ats[sp->timecnt - 1] + < ts->ats[i] + leapcorr(sp, ts->ats[i]))) break; while (i < ts->timecnt && sp->timecnt < TZ_MAX_TIMES) { - sp->ats[sp->timecnt] = ts->ats[i]; + sp->ats[sp->timecnt] + = ts->ats[i] + leapcorr(sp, ts->ats[i]); sp->types[sp->timecnt] = (sp->typecnt + ts->types[i]); sp->timecnt++; @@ -1601,6 +1604,22 @@ increment_overflow_time(pg_time_t *tp, int32 j) return false; } +static int64 +leapcorr(struct state const *sp, pg_time_t t) +{ + struct lsinfo const *lp; + int i; + + i = sp->leapcnt; + while (--i >= 0) + { + lp = &sp->lsis[i]; + if (t >= lp->ls_trans) + return lp->ls_corr; + } + return 0; +} + /* * Find the next DST transition time in the given zone after the given time * diff --git a/src/timezone/zic.c b/src/timezone/zic.c index 9df81824a0f0..e5a3ca26f42e 100644 --- a/src/timezone/zic.c +++ b/src/timezone/zic.c @@ -125,13 +125,14 @@ static void warning(const char *string,...) pg_attribute_printf(1, 2); static void usage(FILE *stream, int status) pg_attribute_noreturn(); static void addtt(zic_t starttime, int type); static int addtype(zic_t, char const *, bool, bool, bool); -static void leapadd(zic_t, bool, int, int); +static void leapadd(zic_t, int, int); static void adjleap(void); static void associate(void); static void dolink(const char *, const char *, bool); static char **getfields(char *buf); static zic_t gethms(const char *string, const char *errstring); static zic_t getsave(char *, bool *); +static void inexpires(char **, int); static void infile(const char *filename); static void inleap(char **fields, int nfields); static void inlink(char **fields, int nfields); @@ -202,6 +203,7 @@ static int typecnt; #define LC_ZONE 1 #define LC_LINK 2 #define LC_LEAP 3 +#define LC_EXPIRES 4 /* * Which fields are which on a Zone line. @@ -267,6 +269,9 @@ static int typecnt; #define LP_ROLL 6 #define LEAP_FIELDS 7 +/* Expires lines are like Leap lines, except without CORR and ROLL fields. 
*/ +#define EXPIRES_FIELDS 5 + /* * Year synonyms. */ @@ -312,6 +317,7 @@ static struct lookup const zi_line_codes[] = { }; static struct lookup const leap_line_codes[] = { {"Leap", LC_LEAP}, + {"Expires", LC_EXPIRES}, {NULL, 0} }; @@ -584,6 +590,12 @@ static zic_t const max_time = MAXVAL(zic_t, TIME_T_BITS_IN_FILE); static zic_t lo_time = MINVAL(zic_t, TIME_T_BITS_IN_FILE); static zic_t hi_time = MAXVAL(zic_t, TIME_T_BITS_IN_FILE); +/* The time specified by an Expires line, or negative if no such line. */ +static zic_t leapexpires = -1; + +/* The time specified by an #expires comment, or negative if no such line. */ +static zic_t comment_leapexpires = -1; + /* Set the time range of the output to TIMERANGE. Return true if successful. */ static bool @@ -1279,7 +1291,8 @@ infile(const char *name) } if (nfields == 0) { - /* nothing to do */ + if (name == leapsec && *buf == '#') + sscanf(buf, "#expires " INT64_FORMAT, &comment_leapexpires); } else if (wantcont) { @@ -1311,6 +1324,10 @@ infile(const char *name) inleap(fields, nfields); wantcont = false; break; + case LC_EXPIRES: + inexpires(fields, nfields); + wantcont = false; + break; default: /* "cannot happen" */ fprintf(stderr, _("%s: panic: Invalid l_value %d\n"), @@ -1634,8 +1651,8 @@ inzsub(char **fields, int nfields, bool iscont) return hasuntil; } -static void -inleap(char **fields, int nfields) +static zic_t +getleapdatetime(char **fields, int nfields, bool expire_line) { const char *cp; const struct lookup *lp; @@ -1651,11 +1668,6 @@ inleap(char **fields, int nfields) zic_t t; char xs; - if (nfields != LEAP_FIELDS) - { - error(_("wrong number of fields on Leap line")); - return; - } dayoff = 0; cp = fields[LP_YEAR]; if (sscanf(cp, "%d%c", &year, &xs) != 1) @@ -1664,13 +1676,16 @@ inleap(char **fields, int nfields) * Leapin' Lizards! 
*/ error(_("invalid leaping year")); - return; + return -1; + } + if (!expire_line) + { + if (!leapseen || leapmaxyear < year) + leapmaxyear = year; + if (!leapseen || leapminyear > year) + leapminyear = year; + leapseen = true; } - if (!leapseen || leapmaxyear < year) - leapmaxyear = year; - if (!leapseen || leapminyear > year) - leapminyear = year; - leapseen = true; j = EPOCH_YEAR; while (j != year) { @@ -1689,7 +1704,7 @@ inleap(char **fields, int nfields) if ((lp = byword(fields[LP_MONTH], mon_names)) == NULL) { error(_("invalid month name")); - return; + return -1; } month = lp->l_value; j = TM_JANUARY; @@ -1704,56 +1719,70 @@ inleap(char **fields, int nfields) day <= 0 || day > len_months[isleap(year)][month]) { error(_("invalid day of month")); - return; + return -1; } dayoff = oadd(dayoff, day - 1); if (dayoff < min_time / SECSPERDAY) { error(_("time too small")); - return; + return -1; } if (dayoff > max_time / SECSPERDAY) { error(_("time too large")); - return; + return -1; } t = dayoff * SECSPERDAY; tod = gethms(fields[LP_TIME], _("invalid time of day")); - cp = fields[LP_CORR]; + t = tadd(t, tod); + if (t < 0) + error(_("leap second precedes Epoch")); + return t; +} + +static void +inleap(char **fields, int nfields) +{ + if (nfields != LEAP_FIELDS) + error(_("wrong number of fields on Leap line")); + else { - bool positive; - int count; + zic_t t = getleapdatetime(fields, nfields, false); - if (strcmp(cp, "") == 0) - { /* infile() turns "-" into "" */ - positive = false; - count = 1; - } - else if (strcmp(cp, "+") == 0) + if (0 <= t) { - positive = true; - count = 1; - } - else - { - error(_("illegal CORRECTION field on Leap line")); - return; - } - if ((lp = byword(fields[LP_ROLL], leap_types)) == NULL) - { - error(_("illegal Rolling/Stationary field on Leap line")); - return; - } - t = tadd(t, tod); - if (t < 0) - { - error(_("leap second precedes Epoch")); - return; + struct lookup const *lp = byword(fields[LP_ROLL], leap_types); + + if (!lp) + 
error(_("invalid Rolling/Stationary field on Leap line")); + else + { + int correction = 0; + + if (!fields[LP_CORR][0]) /* infile() turns "-" into "". */ + correction = -1; + else if (strcmp(fields[LP_CORR], "+") == 0) + correction = 1; + else + error(_("invalid CORRECTION field on Leap line")); + if (correction) + leapadd(t, correction, lp->l_value); + } } - leapadd(t, positive, lp->l_value, count); } } +static void +inexpires(char **fields, int nfields) +{ + if (nfields != EXPIRES_FIELDS) + error(_("wrong number of fields on Expires line")); + else if (0 <= leapexpires) + error(_("multiple Expires lines")); + else + leapexpires = getleapdatetime(fields, nfields, true); +} + static void inlink(char **fields, int nfields) { @@ -3369,12 +3398,11 @@ addtype(zic_t utoff, char const *abbr, bool isdst, bool ttisstd, bool ttisut) } static void -leapadd(zic_t t, bool positive, int rolling, int count) +leapadd(zic_t t, int correction, int rolling) { - int i, - j; + int i; - if (leapcnt + (positive ? count : 1) > TZ_MAX_LEAPS) + if (TZ_MAX_LEAPS <= leapcnt) { error(_("too many leap seconds")); exit(EXIT_FAILURE); @@ -3382,19 +3410,13 @@ leapadd(zic_t t, bool positive, int rolling, int count) for (i = 0; i < leapcnt; ++i) if (t <= trans[i]) break; - do - { - for (j = leapcnt; j > i; --j) - { - trans[j] = trans[j - 1]; - corr[j] = corr[j - 1]; - roll[j] = roll[j - 1]; - } - trans[i] = t; - corr[i] = positive ? 
1 : -count; - roll[i] = rolling; - ++leapcnt; - } while (positive && --count != 0); + memmove(&trans[i + 1], &trans[i], (leapcnt - i) * sizeof *trans); + memmove(&corr[i + 1], &corr[i], (leapcnt - i) * sizeof *corr); + memmove(&roll[i + 1], &roll[i], (leapcnt - i) * sizeof *roll); + trans[i] = t; + corr[i] = correction; + roll[i] = rolling; + ++leapcnt; } static void @@ -3418,6 +3440,25 @@ adjleap(void) trans[i] = tadd(trans[i], last); last = corr[i] += last; } + + if (leapexpires < 0) + { + leapexpires = comment_leapexpires; + if (0 <= leapexpires) + warning(_("\"#expires\" is obsolescent; use \"Expires\"")); + } + + if (0 <= leapexpires) + { + leapexpires = oadd(leapexpires, last); + if (!(leapcnt == 0 || (trans[leapcnt - 1] < leapexpires))) + { + error(_("last Leap time does not precede Expires time")); + exit(EXIT_FAILURE); + } + if (leapexpires <= hi_time) + hi_time = leapexpires - 1; + } } static char * From 2b2a070d98b2f2c7ecc031e582cfefa400316ce3 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 18 Jun 2020 10:40:10 +0900 Subject: [PATCH 007/334] Remove reset of testtablespace from pg_regress on Windows testtablespace is an extra path used as tablespace location in the main regression test suite, computed from --outputdir as defined by the caller of pg_regress (current directory if undefined). This special handling was introduced as of f10589e to be specific to MSVC, as we let pg_regress' Makefile handle this cleanup in other environments. This moves the cleanup to the MSVC script running regression tests instead where needed: check, installcheck and upgradecheck. I have also checked this patch on MSVC with repeated runs of each target. 
Author: Kyotaro Horiguchi, Michael Paquier Discussion: https://postgr.es/m/20200219.142519.437573253063431435.horikyota.ntt@gmail.com --- src/test/regress/pg_regress.c | 22 ---------------------- src/tools/msvc/vcregress.pl | 17 +++++++++++++++-- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index f11a3b9e26e6..c8d190d2489f 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -494,28 +494,6 @@ convert_sourcefiles_in(const char *source_subdir, const char *dest_dir, const ch snprintf(testtablespace, MAXPGPATH, "%s/testtablespace", outputdir); -#ifdef WIN32 - - /* - * On Windows only, clean out the test tablespace dir, or create it if it - * doesn't exist. On other platforms we expect the Makefile to take care - * of that. (We don't migrate that functionality in here because it'd be - * harder to cope with platform-specific issues such as SELinux.) - * - * XXX it would be better if pg_regress.c had nothing at all to do with - * testtablespace, and this were handled by a .BAT file or similar on - * Windows. See pgsql-hackers discussion of 2008-01-18. 
- */ - if (directory_exists(testtablespace)) - if (!rmtree(testtablespace, true)) - { - fprintf(stderr, _("\n%s: could not remove test tablespace \"%s\"\n"), - progname, testtablespace); - exit(2); - } - make_directory(testtablespace); -#endif - /* finally loop on each file and do the replacement */ for (name = names; *name; name++) { diff --git a/src/tools/msvc/vcregress.pl b/src/tools/msvc/vcregress.pl index 3365ee578c3d..d6763ad4ac57 100644 --- a/src/tools/msvc/vcregress.pl +++ b/src/tools/msvc/vcregress.pl @@ -123,6 +123,8 @@ sub installcheck_internal sub installcheck { my $schedule = shift || 'serial'; + + CleanupTablespaceDirectory(); installcheck_internal($schedule); return; } @@ -143,6 +145,7 @@ sub check "--temp-instance=./tmp_check"); push(@args, $maxconn) if $maxconn; push(@args, $temp_config) if $temp_config; + CleanupTablespaceDirectory(); system(@args); my $status = $? >> 8; exit $status if $status; @@ -570,8 +573,8 @@ sub upgradecheck $ENV{PGDATA} = "$data.old"; my $outputdir = "$tmp_root/regress"; my @EXTRA_REGRESS_OPTS = ("--outputdir=$outputdir"); - mkdir "$outputdir" || die $!; - mkdir "$outputdir/testtablespace" || die $!; + mkdir "$outputdir" || die $!; + CleanupTablespaceDirectory($outputdir); my $logdir = "$topdir/src/bin/pg_upgrade/log"; rmtree($logdir); @@ -737,6 +740,16 @@ sub InstallTemp return; } +sub CleanupTablespaceDirectory +{ + my $testdir = shift || getcwd(); + + my $testtablespace = "$testdir/testtablespace"; + + rmtree($testtablespace) if (-d $testtablespace); + mkdir($testtablespace); +} + sub usage { print STDERR From 9d402c73ade412bdeb9064c81fc4ed071c4e93f8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 18 Jun 2020 08:41:31 +0200 Subject: [PATCH 008/334] Expand tests for factorial Move from int4 to numeric test. (They were originally int4 functions, but were reimplemented for numeric in 04a4821adef38155b7920ba9eb83c4c3c29156f8.) Add some tests for edge cases. 
Discussion: https://www.postgresql.org/message-id/flat/6ce1df0e-86a3-e544-743a-f357ff663f68%402ndquadrant.com --- src/test/regress/expected/int4.out | 12 -------- src/test/regress/expected/numeric.out | 41 +++++++++++++++++++++++++++ src/test/regress/sql/int4.sql | 4 --- src/test/regress/sql/numeric.sql | 11 +++++++ 4 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/test/regress/expected/int4.out b/src/test/regress/expected/int4.out index c384af18ee89..77f43739a7c1 100644 --- a/src/test/regress/expected/int4.out +++ b/src/test/regress/expected/int4.out @@ -299,18 +299,6 @@ SELECT int4 '1000' < int4 '999' AS false; f (1 row) -SELECT 4! AS twenty_four; - twenty_four -------------- - 24 -(1 row) - -SELECT !!3 AS six; - six ------ - 6 -(1 row) - SELECT 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 AS ten; ten ----- diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out index c7fe63d03744..b255be7c8520 100644 --- a/src/test/regress/expected/numeric.out +++ b/src/test/regress/expected/numeric.out @@ -2315,3 +2315,44 @@ FROM (VALUES (0::numeric, 0::numeric), SELECT lcm(9999 * (10::numeric)^131068 + (10::numeric^131068 - 1), 2); -- overflow ERROR: value overflows numeric format +-- +-- Tests for factorial +-- +SELECT 4!; + ?column? +---------- + 24 +(1 row) + +SELECT !!3; + ?column? +---------- + 6 +(1 row) + +SELECT factorial(15); + factorial +--------------- + 1307674368000 +(1 row) + +SELECT 100000!; +ERROR: value overflows numeric format +SELECT 0!; + ?column? +---------- + 1 +(1 row) + +SELECT -4!; + ?column? +---------- + 1 +(1 row) + +SELECT factorial(-4); + factorial +----------- + 1 +(1 row) + diff --git a/src/test/regress/sql/int4.sql b/src/test/regress/sql/int4.sql index a9e90a96c4c0..b00c9dea2a6d 100644 --- a/src/test/regress/sql/int4.sql +++ b/src/test/regress/sql/int4.sql @@ -114,10 +114,6 @@ SELECT int2 '2' * int4 '2' = int4 '16' / int2 '4' AS true; SELECT int4 '1000' < int4 '999' AS false; -SELECT 4! 
AS twenty_four; - -SELECT !!3 AS six; - SELECT 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 AS ten; SELECT 2 + 2 / 2 AS three; diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql index 41475a9a245f..1332a9cf07a6 100644 --- a/src/test/regress/sql/numeric.sql +++ b/src/test/regress/sql/numeric.sql @@ -1111,3 +1111,14 @@ FROM (VALUES (0::numeric, 0::numeric), (4232.820::numeric, 132.72000::numeric)) AS v(a, b); SELECT lcm(9999 * (10::numeric)^131068 + (10::numeric^131068 - 1), 2); -- overflow + +-- +-- Tests for factorial +-- +SELECT 4!; +SELECT !!3; +SELECT factorial(15); +SELECT 100000!; +SELECT 0!; +SELECT -4!; +SELECT factorial(-4); From 0a40563eadc67472d6fd50dabf7002afa25c3330 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 18 Jun 2020 08:41:31 +0200 Subject: [PATCH 009/334] Disallow factorial of negative numbers The previous implementation returned 1 for all negative numbers, which is not sensible under any definition. Discussion: https://www.postgresql.org/message-id/flat/6ce1df0e-86a3-e544-743a-f357ff663f68%402ndquadrant.com --- src/backend/utils/adt/numeric.c | 4 ++++ src/test/regress/expected/numeric.out | 12 ++---------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index eea42398541b..5f23f2afac86 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -2946,6 +2946,10 @@ numeric_fac(PG_FUNCTION_ARGS) NumericVar fact; NumericVar result; + if (num < 0) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("factorial of a negative number is undefined"))); if (num <= 1) { res = make_result(&const_one); diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out index b255be7c8520..2f3ecb50a733 100644 --- a/src/test/regress/expected/numeric.out +++ b/src/test/regress/expected/numeric.out @@ -2345,14 +2345,6 @@ SELECT 0!; (1 row) SELECT -4!; - ?column? 
----------- - 1 -(1 row) - +ERROR: factorial of a negative number is undefined SELECT factorial(-4); - factorial ----------- - 1 -(1 row) - +ERROR: factorial of a negative number is undefined From b48df818dcbd1a5e34ab7a2d9f98828b7b62140c Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 18 Jun 2020 16:34:59 +0900 Subject: [PATCH 010/334] Fix oldest xmin and LSN computation across repslots after advancing Advancing a replication slot did not recompute the oldest xmin and LSN values across replication slots, preventing resource removal like segments not recycled at checkpoint time. The original commit that introduced the slot advancing in 9c7d06d never did the update of those oldest values, and b0afdca removed this code. This commit adds a TAP test to check segment recycling with advancing for physical slots, enforcing an extra segment switch before advancing to check if the segment gets correctly recycled after a checkpoint. Reported-by: Andres Freund Reviewed-by: Alexey Kondratov, Kyotaro Horiguchi Discussion: https://postgr.es/m/20200609171904.kpltxxvjzislidks@alap3.anarazel.de Backpatch-through: 11 --- src/backend/replication/slotfuncs.c | 7 +++++++ src/test/recovery/t/001_stream_rep.pl | 21 +++++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 1b929a603e51..06e4955de73b 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -621,6 +621,13 @@ pg_replication_slot_advance(PG_FUNCTION_ARGS) values[0] = NameGetDatum(&MyReplicationSlot->data.name); nulls[0] = false; + /* + * Recompute the minimum LSN and xmin across all slots to adjust with the + * advancing potentially done. + */ + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); + ReplicationSlotRelease(); /* Return the reached position.
*/ diff --git a/src/test/recovery/t/001_stream_rep.pl b/src/test/recovery/t/001_stream_rep.pl index 0c316c18082e..778f11b28b43 100644 --- a/src/test/recovery/t/001_stream_rep.pl +++ b/src/test/recovery/t/001_stream_rep.pl @@ -3,7 +3,7 @@ use warnings; use PostgresNode; use TestLib; -use Test::More tests => 35; +use Test::More tests => 36; # Initialize master node my $node_master = get_new_node('master'); @@ -364,15 +364,26 @@ sub replay_check qq[SELECT 1 FROM replayed WHERE val = $newval]); is($is_replayed, qq(1), "standby_2 didn't replay master value $newval"); +# Drop any existing slots on the primary, for the follow-up tests. +$node_master->safe_psql('postgres', + "SELECT pg_drop_replication_slot(slot_name) FROM pg_replication_slots;"); + # Test physical slot advancing and its durability. Create a new slot on # the primary, not used by any of the standbys. This reserves WAL at creation. my $phys_slot = 'phys_slot'; $node_master->safe_psql('postgres', "SELECT pg_create_physical_replication_slot('$phys_slot', true);"); +# Generate some WAL, and switch to a new segment, used to check that +# the previous segment is correctly getting recycled as the slot advancing +# would recompute the minimum LSN calculated across all slots. 
+my $segment_removed = $node_master->safe_psql('postgres', + 'SELECT pg_walfile_name(pg_current_wal_lsn())'); +chomp($segment_removed); $node_master->psql( 'postgres', " CREATE TABLE tab_phys_slot (a int); - INSERT INTO tab_phys_slot VALUES (generate_series(1,10));"); + INSERT INTO tab_phys_slot VALUES (generate_series(1,10)); + SELECT pg_switch_wal();"); my $current_lsn = $node_master->safe_psql('postgres', "SELECT pg_current_wal_lsn();"); chomp($current_lsn); @@ -392,3 +403,9 @@ sub replay_check chomp($phys_restart_lsn_post); ok( ($phys_restart_lsn_pre cmp $phys_restart_lsn_post) == 0, "physical slot advance persists across restarts"); + +# Check if the previous segment gets correctly recycled after the +# server stopped cleanly, causing a shutdown checkpoint to be generated. +my $master_data = $node_master->data_dir; +ok(!-f "$master_data/pg_wal/$segment_removed", + "WAL segment $segment_removed recycled after physical slot advancing"); From a3235a53ae9f6f21f823081c610b0901db6aa665 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 18 Jun 2020 16:27:18 -0400 Subject: [PATCH 011/334] Doc: document POSIX-style time zone specifications in full. We'd glossed over most of this complexity for years, but it's hard to avoid writing it all down now, so that we can explain what happens when there's no "posixrules" file in the IANA time zone database. That was at best a tiny minority situation till now, but it's likely to become quite common in the future, so we'd better explain it. Nonetheless, we don't really encourage people to use POSIX zone specs; picking a named zone is almost always what you really want, unless perhaps you're stuck with an out-of-date zone database. Therefore, let's shove all this detail into an appendix. Patch by me; thanks to Robert Haas for help with some awkward wording. 
Discussion: https://postgr.es/m/1390.1562258309@sss.pgh.pa.us --- doc/src/sgml/datatype.sgml | 38 +------ doc/src/sgml/datetime.sgml | 212 +++++++++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+), 33 deletions(-) diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 3df189ad853c..49fb19ff9194 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -2478,7 +2478,7 @@ TIMESTAMP WITH TIME ZONE '2004-10-19 10:23:54+02' A time zone abbreviation, for example PST. Such a specification merely defines a particular offset from UTC, in contrast to full time zone names which can imply a set of daylight - savings transition-date rules as well. The recognized abbreviations + savings transition rules as well. The recognized abbreviations are listed in the pg_timezone_abbrevs view (see ). You cannot set the configuration parameters or @@ -2492,25 +2492,10 @@ TIMESTAMP WITH TIME ZONE '2004-10-19 10:23:54+02' In addition to the timezone names and abbreviations, PostgreSQL will accept POSIX-style time zone - specifications of the form STDoffset or - STDoffsetDST, where - STD is a zone abbreviation, offset is a - numeric offset in hours west from UTC, and DST is an - optional daylight-savings zone abbreviation, assumed to stand for one - hour ahead of the given offset. For example, if EST5EDT - were not already a recognized zone name, it would be accepted and would - be functionally equivalent to United States East Coast time. In this - syntax, a zone abbreviation can be a string of letters, or an - arbitrary string surrounded by angle brackets (<>). - When a daylight-savings zone abbreviation is present, - it is assumed to be used - according to the same daylight-savings transition rules used in the - IANA time zone database's posixrules entry. - In a standard PostgreSQL installation, - posixrules is the same as US/Eastern, so - that POSIX-style time zone specifications follow USA daylight-savings - rules. 
If needed, you can adjust this behavior by replacing the - posixrules file. + specifications, as described in + . This option is not + normally preferable to using a named time zone, but it may be + necessary if no suitable IANA time zone entry is available. @@ -2537,19 +2522,6 @@ TIMESTAMP WITH TIME ZONE '2004-10-19 10:23:54+02' above, this is not necessarily the same as local civil time on that date. - - One should be wary that the POSIX-style time zone feature can - lead to silently accepting bogus input, since there is no check on the - reasonableness of the zone abbreviations. For example, SET - TIMEZONE TO FOOBAR0 will work, leaving the system effectively using - a rather peculiar abbreviation for UTC. - Another issue to keep in mind is that in POSIX time zone names, - positive offsets are used for locations west of Greenwich. - Everywhere else, PostgreSQL follows the - ISO-8601 convention that positive timezone offsets are east - of Greenwich. - - In all cases, timezone names and abbreviations are recognized case-insensitively. (This is a change from PostgreSQL diff --git a/doc/src/sgml/datetime.sgml b/doc/src/sgml/datetime.sgml index 7cce826e2d00..7da4d0b7789d 100644 --- a/doc/src/sgml/datetime.sgml +++ b/doc/src/sgml/datetime.sgml @@ -555,6 +555,218 @@ + + <acronym>POSIX</acronym> Time Zone Specifications + + + time zone + POSIX-style specification + + + + PostgreSQL can accept time zone specifications that + are written according to the POSIX standard's rules + for the TZ environment + variable. POSIX time zone specifications are + inadequate to deal with the complexity of real-world time zone history, + but there are sometimes reasons to use them. + + + + A POSIX time zone specification has the form + +STD offset DST dstoffset , rule + + (For readability, we show spaces between the fields, but spaces should + not be used in practice.) The fields are: + + + + STD is the zone abbreviation to be used + for standard time. 
+ + + + + offset is the zone's standard-time offset + from UTC. + + + + + DST is the zone abbreviation to be used + for daylight-savings time. If this field and the following ones are + omitted, the zone uses a fixed UTC offset with no daylight-savings + rule. + + + + + dstoffset is the daylight-savings offset + from UTC. This field is typically omitted, since it defaults to one + hour less than the standard-time offset, + which is usually the right thing. + + + + + rule defines the rule for when daylight + savings is in effect, as described below. + + + + + + + In this syntax, a zone abbreviation can be a string of letters, such + as EST, or an arbitrary string surrounded by angle + brackets, such as <UTC-05>. + Note that the zone abbreviations given here are only used for output, + and even then only in some timestamp output formats. The zone + abbreviations recognized in timestamp input are determined as explained + in . + + + + The offset fields specify the hours, and optionally minutes and seconds, + difference from UTC. They have the format + hh:mm:ss + optionally with a leading sign (+ + or -). The positive sign is used for + zones west of Greenwich. (Note that this is the + opposite of the ISO-8601 sign convention used elsewhere in + PostgreSQL.) hh can have + one or two digits; mm + and ss (if used) must have two. + + + + The daylight-savings transition rule has the + format + +dstdate / dsttime , stddate / stdtime + + (As before, spaces should not be included in practice.) + The dstdate + and dsttime fields define when daylight-savings + time starts, while stddate + and stdtime define when standard time + starts. (In some cases, notably in zones south of the equator, the + former might be later in the year than the latter.) The date fields + have one of these formats: + + + n + + + A plain integer denotes a day of the year, counting from zero to + 364, or to 365 in leap years. 
+ + + + + Jn + + + In this form, n counts from 1 to 365, + and February 29 is not counted even if it is present. (Thus, a + transition occurring on February 29 could not be specified this + way. However, days after February have the same numbers whether + it's a leap year or not, so that this form is usually more useful + than the plain-integer form for transitions on fixed dates.) + + + + + Mm.n.d + + + This form specifies a transition that always happens during the same + month and on the same day of the week. m + identifies the month, from 1 to 12. n + specifies the n'th occurrence of the + weekday identified by d. + n is a number between 1 and 4, or 5 + meaning the last occurrence of that weekday in the month (which + could be the fourth or the fifth). d is + a number between 0 and 6, with 0 indicating Sunday. + For example, M3.2.0 means the second + Sunday in March. + + + + + + + + + The M format is sufficient to describe many common + daylight-savings transition laws. But note that none of these variants + can deal with daylight-savings law changes, so in practice the + historical data stored for named time zones (in the IANA time zone + database) is necessary to interpret past time stamps correctly. + + + + + The time fields in a transition rule have the same format as the offset + fields described previously, except that they cannot contain signs. + They define the current local time at which the change to the other + time occurs. If omitted, they default to 02:00:00. + + + + If a daylight-savings abbreviation is given but the + transition rule field is omitted, + PostgreSQL attempts to determine the + transition times by consulting the posixrules file + in the IANA time zone database. This file has the same format as a + full time zone entry, but only its transition timing rules are used, + not its UTC offsets. 
Typically, this file has the same contents as the + US/Eastern file, so that POSIX-style time zone + specifications follow USA daylight-savings rules. If needed, you can + adjust this behavior by replacing the posixrules + file. + + + + + The facility to consult a posixrules file has + been deprecated by IANA, and it is likely to go away in the future. + One bug in this feature, which is unlikely to be fixed before it + disappears, is that it fails to apply DST rules to dates after 2038. + + + + + If the posixrules file is not present, + the fallback behavior is to use the + rule M3.2.0,M11.1.0, which corresponds to USA + practice as of 2020 (that is, spring forward on the second Sunday of + March, fall back on the first Sunday of November, both transitions + occurring at 2AM prevailing time). + + + + As an example, CET-1CEST,M3.5.0,M10.5.0/3 describes + current (as of 2020) timekeeping practice in Paris. This specification + says that standard time has the abbreviation CET and + is one hour ahead (east) of UTC; daylight savings time has the + abbreviation CEST and is implicitly two hours ahead + of UTC; daylight savings time begins on the last Sunday in March at 2AM + CET and ends on the last Sunday in October at 3AM CEST. + + + + One should be wary that it is easy to misspell a POSIX-style time zone + specification, since there is no check on the reasonableness of the + zone abbreviation(s). For example, SET TIMEZONE TO + FOOBAR0 will work, leaving the system effectively using a + rather peculiar abbreviation for UTC. + + + + History of Units From 3b37a6de027c27f1e4ac865aaa34dd92cf5dc7a1 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 8 Jun 2020 16:36:51 -0700 Subject: [PATCH 012/334] Add basic spinlock tests to regression tests. As s_lock_test, the already existing test for spinlocks, isn't run in an automated fashion (and doesn't test a normal backend environment), adding tests that are run as part of a normal regression run is a good idea. 
Particularly in light of several recent and upcoming spinlock related fixes. Currently the new tests are run as part of the pre-existing test_atomic_ops() test. That perhaps can be quibbled about, but for now seems ok. The only operations that s_lock_test tests but the new tests don't are the detection of a stuck spinlock and S_LOCK_FREE (which is otherwise unused, not implemented on all platforms, and will be removed). This currently contains a test for more than INT_MAX spinlocks (only run with --disable-spinlocks), to ensure the recent commit fixing a bug with more than INT_MAX spinlock initializations is correct. That test is somewhat slow, so we might want to disable it after a few days. It might be worth retiring s_lock_test after this. The added coverage of a stuck spinlock probably isn't worth the added complexity? Author: Andres Freund Discussion: https://postgr.es/m/20200606023103.avzrctgv7476xj7i@alap3.anarazel.de --- src/test/regress/regress.c | 109 +++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 960c155e5f23..9bea2ada24aa 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -34,6 +34,7 @@ #include "optimizer/optimizer.h" #include "optimizer/plancat.h" #include "port/atomics.h" +#include "storage/spin.h" #include "utils/builtins.h" #include "utils/geo_decls.h" #include "utils/memutils.h" @@ -794,6 +795,108 @@ test_atomic_uint64(void) EXPECT_EQ_U64(pg_atomic_fetch_and_u64(&var, ~0), 0); } +/* + * Perform, fairly minimal, testing of the spinlock implementation. + * + * It's likely worth expanding these to actually test concurrency etc, but + * having some regularly run tests is better than none. + */ +static void +test_spinlock(void) +{ + /* + * Basic tests for spinlocks, as well as the underlying operations. 
+ * + * We embed the spinlock in a struct with other members to test that the + * spinlock operations don't perform too wide writes. + */ + { + struct test_lock_struct + { + char data_before[4]; + slock_t lock; + char data_after[4]; + } struct_w_lock; + + memcpy(struct_w_lock.data_before, "abcd", 4); + memcpy(struct_w_lock.data_after, "ef12", 4); + + /* test basic operations via the SpinLock* API */ + SpinLockInit(&struct_w_lock.lock); + SpinLockAcquire(&struct_w_lock.lock); + SpinLockRelease(&struct_w_lock.lock); + + /* test basic operations via underlying S_* API */ + S_INIT_LOCK(&struct_w_lock.lock); + S_LOCK(&struct_w_lock.lock); + S_UNLOCK(&struct_w_lock.lock); + + /* and that "contended" acquisition works */ + s_lock(&struct_w_lock.lock, "testfile", 17, "testfunc"); + S_UNLOCK(&struct_w_lock.lock); + + /* + * Check, using TAS directly, that a single spin cycle doesn't block + * when acquiring an already acquired lock. + */ +#ifdef TAS + S_LOCK(&struct_w_lock.lock); + + if (!TAS(&struct_w_lock.lock)) + elog(ERROR, "acquired already held spinlock"); + +#ifdef TAS_SPIN + if (!TAS_SPIN(&struct_w_lock.lock)) + elog(ERROR, "acquired already held spinlock"); +#endif /* defined(TAS_SPIN) */ + + S_UNLOCK(&struct_w_lock.lock); +#endif /* defined(TAS) */ + + /* + * Verify that after all of this the non-lock contents are still + * correct. + */ + if (memcmp(struct_w_lock.data_before, "abcd", 4) != 0) + elog(ERROR, "padding before spinlock modified"); + if (memcmp(struct_w_lock.data_after, "ef12", 4) != 0) + elog(ERROR, "padding after spinlock modified"); + } + + /* + * Ensure that allocating more than INT32_MAX emulated spinlocks + * works. That's interesting because the spinlock emulation uses a 32bit + * integer to map spinlocks onto semaphores. There've been bugs... + */ +#ifndef HAVE_SPINLOCKS + { + /* + * Initialize enough spinlocks to advance counter close to + * wraparound. 
It's too expensive to perform acquire/release for each, + * as those may be syscalls when the spinlock emulation is used (and + * even just atomic TAS would be expensive). + */ + for (uint32 i = 0; i < INT32_MAX - 100000; i++) + { + slock_t lock; + + SpinLockInit(&lock); + } + + for (uint32 i = 0; i < 200000; i++) + { + slock_t lock; + + SpinLockInit(&lock); + + SpinLockAcquire(&lock); + SpinLockRelease(&lock); + SpinLockAcquire(&lock); + SpinLockRelease(&lock); + } + } +#endif +} PG_FUNCTION_INFO_V1(test_atomic_ops); Datum @@ -805,6 +908,12 @@ test_atomic_ops(PG_FUNCTION_ARGS) test_atomic_uint64(); + /* + * Arguably this shouldn't be tested as part of this function, but it's + * closely enough related that that seems ok for now. + */ + test_spinlock(); + PG_RETURN_BOOL(true); } From cf1234a10e50ff9be0dc85184689ee4ebc57cd77 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 8 Jun 2020 16:50:37 -0700 Subject: [PATCH 013/334] Fix deadlock danger when atomic ops are done under spinlock. This was a danger only for --disable-spinlocks in combination with atomic operations unsupported by the current platform. While atomics.c was careful to signal that a separate semaphore ought to be used when spinlock emulation is active, spin.c didn't actually implement that mechanism. That's my (Andres') fault, it seems to have gotten lost during the development of the atomic operations support. Fix that issue and add test for nesting atomic operations inside a spinlock. 
Author: Andres Freund Discussion: https://postgr.es/m/20200605023302.g6v3ydozy5txifji@alap3.anarazel.de Backpatch: 9.5- --- src/backend/storage/lmgr/spin.c | 97 +++++++++++++++++++++++---------- src/test/regress/regress.c | 52 ++++++++++++++++++ 2 files changed, 119 insertions(+), 30 deletions(-) diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c index 753943e46d62..9f7eae933922 100644 --- a/src/backend/storage/lmgr/spin.c +++ b/src/backend/storage/lmgr/spin.c @@ -28,8 +28,24 @@ #ifndef HAVE_SPINLOCKS + +/* + * No TAS, so spinlocks are implemented as PGSemaphores. + */ + +#ifndef HAVE_ATOMICS +#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES) +#else +#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES) +#endif /* DISABLE_ATOMICS */ + PGSemaphore *SpinlockSemaArray; -#endif + +#else /* !HAVE_SPINLOCKS */ + +#define NUM_EMULATION_SEMAPHORES 0 + +#endif /* HAVE_SPINLOCKS */ /* * Report the amount of shared memory needed to store semaphores for spinlock @@ -38,34 +54,19 @@ PGSemaphore *SpinlockSemaArray; Size SpinlockSemaSize(void) { - return SpinlockSemas() * sizeof(PGSemaphore); + return NUM_EMULATION_SEMAPHORES * sizeof(PGSemaphore); } -#ifdef HAVE_SPINLOCKS - /* * Report number of semaphores needed to support spinlocks. */ int SpinlockSemas(void) { - return 0; + return NUM_EMULATION_SEMAPHORES; } -#else /* !HAVE_SPINLOCKS */ -/* - * No TAS, so spinlocks are implemented as PGSemaphores. - */ - - -/* - * Report number of semaphores needed to support spinlocks. - */ -int -SpinlockSemas(void) -{ - return NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES; -} +#ifndef HAVE_SPINLOCKS /* * Initialize spinlock emulation. @@ -92,23 +93,59 @@ SpinlockSemaInit(void) /* * s_lock.h hardware-spinlock emulation using semaphores * - * We map all spinlocks onto a set of NUM_SPINLOCK_SEMAPHORES semaphores. 
- * It's okay to map multiple spinlocks onto one semaphore because no process - * should ever hold more than one at a time. We just need enough semaphores - * so that we aren't adding too much extra contention from that. + * We map all spinlocks onto NUM_EMULATION_SEMAPHORES semaphores. It's okay to + * map multiple spinlocks onto one semaphore because no process should ever + * hold more than one at a time. We just need enough semaphores so that we + * aren't adding too much extra contention from that. + * + * There is one exception to the restriction of only holding one spinlock at a + * time, which is that it's ok if emulated atomic operations are nested inside + * spinlocks. To avoid the danger of spinlocks and atomic using the same sema, + * we make sure "normal" spinlocks and atomics backed by spinlocks use + * distinct semaphores (see the nested argument to s_init_lock_sema). * * slock_t is just an int for this implementation; it holds the spinlock - * number from 1..NUM_SPINLOCK_SEMAPHORES. We intentionally ensure that 0 + * number from 1..NUM_EMULATION_SEMAPHORES. We intentionally ensure that 0 * is not a valid value, so that testing with this code can help find * failures to initialize spinlocks. */ +static inline void +s_check_valid(int lockndx) +{ + if (unlikely(lockndx <= 0 || lockndx > NUM_EMULATION_SEMAPHORES)) + elog(ERROR, "invalid spinlock number: %d", lockndx); +} + void s_init_lock_sema(volatile slock_t *lock, bool nested) { static uint32 counter = 0; - - *lock = ((++counter) % NUM_SPINLOCK_SEMAPHORES) + 1; + uint32 offset; + uint32 sema_total; + uint32 idx; + + if (nested) + { + /* + * To allow nesting atomics inside spinlocked sections, use a + * different spinlock. See comment above. 
+ */ + offset = 1 + NUM_SPINLOCK_SEMAPHORES; + sema_total = NUM_ATOMICS_SEMAPHORES; + } + else + { + offset = 1; + sema_total = NUM_SPINLOCK_SEMAPHORES; + } + + idx = (counter++ % sema_total) + offset; + + /* double check we did things correctly */ + s_check_valid(idx); + + *lock = idx; } void @@ -116,8 +153,8 @@ s_unlock_sema(volatile slock_t *lock) { int lockndx = *lock; - if (lockndx <= 0 || lockndx > NUM_SPINLOCK_SEMAPHORES) - elog(ERROR, "invalid spinlock number: %d", lockndx); + s_check_valid(lockndx); + PGSemaphoreUnlock(SpinlockSemaArray[lockndx - 1]); } @@ -134,8 +171,8 @@ tas_sema(volatile slock_t *lock) { int lockndx = *lock; - if (lockndx <= 0 || lockndx > NUM_SPINLOCK_SEMAPHORES) - elog(ERROR, "invalid spinlock number: %d", lockndx); + s_check_valid(lockndx); + /* Note that TAS macros return 0 if *success* */ return !PGSemaphoreTryLock(SpinlockSemaArray[lockndx - 1]); } diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 9bea2ada24aa..02397f2eb104 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -898,6 +898,56 @@ test_spinlock(void) #endif } +/* + * Verify that performing atomic ops inside a spinlock isn't a + * problem. Realistically that's only going to be a problem when both + * --disable-spinlocks and --disable-atomics are used, but it's cheap enough + * to just always test. + * + * The test works by initializing enough atomics that we'd conflict if there + * were an overlap between a spinlock and an atomic by holding a spinlock + * while manipulating more than NUM_SPINLOCK_SEMAPHORES atomics. + * + * NUM_TEST_ATOMICS doesn't really need to be more than + * NUM_SPINLOCK_SEMAPHORES, but it seems better to test a bit more + * extensively. 
+ */ +static void +test_atomic_spin_nest(void) +{ + slock_t lock; +#define NUM_TEST_ATOMICS (NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES + 27) + pg_atomic_uint32 atomics32[NUM_TEST_ATOMICS]; + pg_atomic_uint64 atomics64[NUM_TEST_ATOMICS]; + + SpinLockInit(&lock); + + for (int i = 0; i < NUM_TEST_ATOMICS; i++) + { + pg_atomic_init_u32(&atomics32[i], 0); + pg_atomic_init_u64(&atomics64[i], 0); + } + + /* just so it's not all zeroes */ + for (int i = 0; i < NUM_TEST_ATOMICS; i++) + { + EXPECT_EQ_U32(pg_atomic_fetch_add_u32(&atomics32[i], i), 0); + EXPECT_EQ_U64(pg_atomic_fetch_add_u64(&atomics64[i], i), 0); + } + + /* test whether we can do atomic op with lock held */ + SpinLockAcquire(&lock); + for (int i = 0; i < NUM_TEST_ATOMICS; i++) + { + EXPECT_EQ_U32(pg_atomic_fetch_sub_u32(&atomics32[i], i), i); + EXPECT_EQ_U32(pg_atomic_read_u32(&atomics32[i]), 0); + EXPECT_EQ_U64(pg_atomic_fetch_sub_u64(&atomics64[i], i), i); + EXPECT_EQ_U64(pg_atomic_read_u64(&atomics64[i]), 0); + } + SpinLockRelease(&lock); +} +#undef NUM_TEST_ATOMICS + PG_FUNCTION_INFO_V1(test_atomic_ops); Datum test_atomic_ops(PG_FUNCTION_ARGS) @@ -914,6 +964,8 @@ test_atomic_ops(PG_FUNCTION_ARGS) */ test_spinlock(); + test_atomic_spin_nest(); + PG_RETURN_BOOL(true); } From f219167910ad33dfd8f1b0bba15323d71a91c4e9 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 18 Jun 2020 19:40:09 -0700 Subject: [PATCH 014/334] Clean up includes of s_lock.h. Users of spinlocks should use spin.h, not s_lock.h. And lwlock.h hasn't utilized spinlocks for quite a while. 
Discussion: https://postgr.es/m/20200618183041.upyrd25eosecyf3x@alap3.anarazel.de --- src/backend/main/main.c | 1 - src/include/storage/condition_variable.h | 2 +- src/include/storage/lwlock.h | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/backend/main/main.c b/src/backend/main/main.c index da3dae9e250f..a4dd233c7f92 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -35,7 +35,6 @@ #include "common/username.h" #include "port/atomics.h" #include "postmaster/postmaster.h" -#include "storage/s_lock.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/help_config.h" diff --git a/src/include/storage/condition_variable.h b/src/include/storage/condition_variable.h index c2be198f28e6..ad209acfac06 100644 --- a/src/include/storage/condition_variable.h +++ b/src/include/storage/condition_variable.h @@ -23,7 +23,7 @@ #define CONDITION_VARIABLE_H #include "storage/proclist_types.h" -#include "storage/s_lock.h" +#include "storage/spin.h" typedef struct { diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index c04ae971485e..af9b41795d26 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -20,7 +20,6 @@ #include "port/atomics.h" #include "storage/proclist_types.h" -#include "storage/s_lock.h" struct PGPROC; From 9bdb300dedf086cc54edf740088208e6b24307ef Mon Sep 17 00:00:00 2001 From: David Rowley Date: Fri, 19 Jun 2020 17:24:27 +1200 Subject: [PATCH 015/334] Fix EXPLAIN ANALYZE for parallel HashAgg plans Since 1f39bce02, HashAgg nodes have had the ability to spill to disk when memory consumption exceeds work_mem. That commit added new properties to EXPLAIN ANALYZE to show the maximum memory usage and disk usage, however, it didn't quite go as far as showing that information for parallel workers. Since workers may have experienced something very different from the main process, we should show this information per worker, as is done in Sort. 
Reviewed-by: Justin Pryzby Reviewed-by: Jeff Davis Discussion: https://postgr.es/m/CAApHDvpEKbfZa18mM1TD7qV6PG+w97pwCWq5tVD0dX7e11gRJw@mail.gmail.com Backpatch-through: 13, where the hashagg spilling code was added. --- src/backend/commands/explain.c | 110 ++++++++++++++++++++++++---- src/backend/executor/execParallel.c | 19 ++++- src/backend/executor/nodeAgg.c | 103 ++++++++++++++++++++++++++ src/include/executor/nodeAgg.h | 7 ++ src/include/nodes/execnodes.h | 22 ++++++ 5 files changed, 244 insertions(+), 17 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 9092b4b30944..67bdcb2b2785 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3051,29 +3051,111 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) Agg *agg = (Agg *) aggstate->ss.ps.plan; int64 memPeakKb = (aggstate->hash_mem_peak + 1023) / 1024; - Assert(IsA(aggstate, AggState)); - if (agg->aggstrategy != AGG_HASHED && agg->aggstrategy != AGG_MIXED) return; - if (es->costs && aggstate->hash_planned_partitions > 0) + if (es->format != EXPLAIN_FORMAT_TEXT) { - ExplainPropertyInteger("Planned Partitions", NULL, - aggstate->hash_planned_partitions, es); + + if (es->costs && aggstate->hash_planned_partitions > 0) + { + ExplainPropertyInteger("Planned Partitions", NULL, + aggstate->hash_planned_partitions, es); + } + + if (!es->analyze) + return; + + /* EXPLAIN ANALYZE */ + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); + if (aggstate->hash_batches_used > 0) + { + ExplainPropertyInteger("Disk Usage", "kB", + aggstate->hash_disk_used, es); + ExplainPropertyInteger("HashAgg Batches", NULL, + aggstate->hash_batches_used, es); + } } + else + { + bool gotone = false; - if (!es->analyze) - return; + if (es->costs && aggstate->hash_planned_partitions > 0) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Planned Partitions: %d", + aggstate->hash_planned_partitions); + gotone = true; + } + + if (!es->analyze) 
+ { + if (gotone) + appendStringInfoChar(es->str, '\n'); + return; + } + + if (!gotone) + ExplainIndentText(es); + else + appendStringInfoString(es->str, " "); + + appendStringInfo(es->str, "Peak Memory Usage: " INT64_FORMAT " kB", + memPeakKb); - /* EXPLAIN ANALYZE */ - ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); - if (aggstate->hash_batches_used > 0) + if (aggstate->hash_batches_used > 0) + appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT " kB HashAgg Batches: %d", + aggstate->hash_disk_used, + aggstate->hash_batches_used); + appendStringInfoChar(es->str, '\n'); + } + + /* Display stats for each parallel worker */ + if (es->analyze && aggstate->shared_info != NULL) { - ExplainPropertyInteger("Disk Usage", "kB", - aggstate->hash_disk_used, es); - ExplainPropertyInteger("HashAgg Batches", NULL, - aggstate->hash_batches_used, es); + for (int n = 0; n < aggstate->shared_info->num_workers; n++) + { + AggregateInstrumentation *sinstrument; + uint64 hash_disk_used; + int hash_batches_used; + + sinstrument = &aggstate->shared_info->sinstrument[n]; + hash_disk_used = sinstrument->hash_disk_used; + hash_batches_used = sinstrument->hash_batches_used; + memPeakKb = (sinstrument->hash_mem_peak + 1023) / 1024; + + if (es->workers_state) + ExplainOpenWorker(n, es); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + + appendStringInfo(es->str, "Peak Memory Usage: " INT64_FORMAT " kB", + memPeakKb); + + if (hash_batches_used > 0) + appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT " kB HashAgg Batches: %d", + hash_disk_used, hash_batches_used); + appendStringInfoChar(es->str, '\n'); + } + else + { + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, + es); + if (hash_batches_used > 0) + { + ExplainPropertyInteger("Disk Usage", "kB", hash_disk_used, + es); + ExplainPropertyInteger("HashAgg Batches", NULL, + hash_batches_used, es); + } + } + + if (es->workers_state) + ExplainCloseWorker(n, es); + } } } diff --git 
a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 41cb41481df6..382e78fb7fed 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -25,6 +25,7 @@ #include "executor/execParallel.h" #include "executor/executor.h" +#include "executor/nodeAgg.h" #include "executor/nodeAppend.h" #include "executor/nodeBitmapHeapscan.h" #include "executor/nodeCustom.h" @@ -288,7 +289,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt); break; - + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggEstimate((AggState *) planstate, e->pcxt); + break; default: break; } @@ -505,7 +509,10 @@ ExecParallelInitializeDSM(PlanState *planstate, /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt); break; - + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggInitializeDSM((AggState *) planstate, d->pcxt); + break; default: break; } @@ -1048,6 +1055,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, case T_HashState: ExecHashRetrieveInstrumentation((HashState *) planstate); break; + case T_AggState: + ExecAggRetrieveInstrumentation((AggState *) planstate); + break; default: break; } @@ -1336,7 +1346,10 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate, pwcxt); break; - + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggInitializeWorker((AggState *) planstate, pwcxt); + break; default: break; } diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 331acee28141..a20554ae65a6 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c 
@@ -240,6 +240,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/parallel.h" #include "catalog/objectaccess.h" #include "catalog/pg_aggregate.h" #include "catalog/pg_proc.h" @@ -4483,6 +4484,22 @@ ExecEndAgg(AggState *node) int numGroupingSets = Max(node->maxsets, 1); int setno; + /* + * When ending a parallel worker, copy the statistics gathered by the + * worker back into shared memory so that it can be picked up by the main + * process to report in EXPLAIN ANALYZE. + */ + if (node->shared_info && IsParallelWorker()) + { + AggregateInstrumentation *si; + + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + si->hash_batches_used = node->hash_batches_used; + si->hash_disk_used = node->hash_disk_used; + si->hash_mem_peak = node->hash_mem_peak; + } + /* Make sure we have closed any open tuplesorts */ if (node->sort_in) @@ -4854,3 +4871,89 @@ aggregate_dummy(PG_FUNCTION_ARGS) fcinfo->flinfo->fn_oid); return (Datum) 0; /* keep compiler quiet */ } + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + + /* ---------------------------------------------------------------- + * ExecAggEstimate + * + * Estimate space required to propagate aggregate statistics. 
+ * ---------------------------------------------------------------- + */ +void +ExecAggEstimate(AggState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation)); + size = add_size(size, offsetof(SharedAggInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecAggInitializeDSM + * + * Initialize DSM space for aggregate statistics. + * ---------------------------------------------------------------- + */ +void +ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedAggInfo, sinstrument) + + pcxt->nworkers * sizeof(AggregateInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecAggInitializeWorker + * + * Attach worker to DSM space for aggregate statistics. + * ---------------------------------------------------------------- + */ +void +ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); +} + +/* ---------------------------------------------------------------- + * ExecAggRetrieveInstrumentation + * + * Transfer aggregate statistics from DSM to private memory. 
+ * ---------------------------------------------------------------- + */ +void +ExecAggRetrieveInstrumentation(AggState *node) +{ + Size size; + SharedAggInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedAggInfo, sinstrument) + + node->shared_info->num_workers * sizeof(AggregateInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h index 92c2337fd3ac..bb0805abe091 100644 --- a/src/include/executor/nodeAgg.h +++ b/src/include/executor/nodeAgg.h @@ -14,6 +14,7 @@ #ifndef NODEAGG_H #define NODEAGG_H +#include "access/parallel.h" #include "nodes/execnodes.h" @@ -323,4 +324,10 @@ extern void hash_agg_set_limits(double hashentrysize, uint64 input_groups, int used_bits, Size *mem_limit, uint64 *ngroups_limit, int *num_partitions); +/* parallel instrumentation support */ +extern void ExecAggEstimate(AggState *node, ParallelContext *pcxt); +extern void ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt); +extern void ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt); +extern void ExecAggRetrieveInstrumentation(AggState *node); + #endif /* NODEAGG_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 98e0072b8ad2..f5dfa32d55c4 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2101,6 +2101,27 @@ typedef struct GroupState bool grp_done; /* indicates completion of Group scan */ } GroupState; +/* --------------------- + * per-worker aggregate information + * --------------------- + */ +typedef struct AggregateInstrumentation +{ + Size hash_mem_peak; /* peak hash table memory usage */ + uint64 hash_disk_used; /* kB of disk space used */ + int hash_batches_used; /* batches used during entire execution */ +} AggregateInstrumentation; + +/* ---------------- + * Shared memory container for per-worker aggregate information + * 
---------------- + */ +typedef struct SharedAggInfo +{ + int num_workers; + AggregateInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]; +} SharedAggInfo; + /* --------------------- * AggState information * @@ -2190,6 +2211,7 @@ typedef struct AggState AggStatePerGroup *all_pergroups; /* array of first ->pergroups, than * ->hash_pergroup */ ProjectionInfo *combinedproj; /* projection machinery */ + SharedAggInfo *shared_info; /* one entry per worker */ } AggState; /* ---------------- From f9e9704f09daf882f5a1cf1fbe3f5a3150ae2bb9 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Fri, 19 Jun 2020 17:15:52 +0900 Subject: [PATCH 016/334] Fix issues in invalidation of obsolete replication slots. This commit fixes the following issues. 1. There is the case where the slot is dropped while trying to invalidate it. InvalidateObsoleteReplicationSlots() did not handle this case, and which could cause checkpoint to fail. 2. InvalidateObsoleteReplicationSlots() could emit the same log message multiple times unnecessary. It should be logged only once. 3. When marking the slot as used, we always searched the target slot from all the replication slots even if we already found it. This could cause useless waste of cycles. Back-patch to v13 where these issues were added as a part of max_slot_wal_keep_size code. 
Author: Fujii Masao Reviewed-by: Kyotaro Horiguchi, Alvaro Herrera Discussion: https://postgr.es/m/66c05b67-3396-042c-1b41-bfa6c3ddcf82@oss.nttdata.com --- src/backend/replication/slot.c | 226 ++++++++++++++++++++++----------- 1 file changed, 154 insertions(+), 72 deletions(-) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 505445f2dc84..a7bbcf34991a 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -99,6 +99,9 @@ ReplicationSlot *MyReplicationSlot = NULL; int max_replication_slots = 0; /* the maximum number of replication * slots */ +static ReplicationSlot *SearchNamedReplicationSlot(const char *name); +static int ReplicationSlotAcquireInternal(ReplicationSlot *slot, + const char *name, SlotAcquireBehavior behavior); static void ReplicationSlotDropAcquired(void); static void ReplicationSlotDropPtr(ReplicationSlot *slot); @@ -322,77 +325,117 @@ ReplicationSlotCreate(const char *name, bool db_specific, } /* - * Find a previously created slot and mark it as used by this backend. + * Search for the named replication slot. * - * The return value is only useful if behavior is SAB_Inquire, in which - * it's zero if we successfully acquired the slot, or the PID of the - * owning process otherwise. If behavior is SAB_Error, then trying to - * acquire an owned slot is an error. If SAB_Block, we sleep until the - * slot is released by the owning process. + * Return the replication slot if found, otherwise NULL. + * + * The caller must hold ReplicationSlotControlLock in shared mode. */ -int -ReplicationSlotAcquire(const char *name, SlotAcquireBehavior behavior) +static ReplicationSlot * +SearchNamedReplicationSlot(const char *name) { - ReplicationSlot *slot; - int active_pid; int i; + ReplicationSlot *slot = NULL; -retry: - Assert(MyReplicationSlot == NULL); + Assert(LWLockHeldByMeInMode(ReplicationSlotControlLock, + LW_SHARED)); - /* - * Search for the named slot and mark it active if we find it. 
If the - * slot is already active, we exit the loop with active_pid set to the PID - * of the backend that owns it. - */ - active_pid = 0; - slot = NULL; - LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0) { - /* - * This is the slot we want; check if it's active under some other - * process. In single user mode, we don't need this check. - */ - if (IsUnderPostmaster) - { - /* - * Get ready to sleep on it in case it is active. (We may end - * up not sleeping, but we don't want to do this while holding - * the spinlock.) - */ - ConditionVariablePrepareToSleep(&s->active_cv); - - SpinLockAcquire(&s->mutex); - - active_pid = s->active_pid; - if (active_pid == 0) - active_pid = s->active_pid = MyProcPid; - - SpinLockRelease(&s->mutex); - } - else - active_pid = MyProcPid; slot = s; - break; } } - LWLockRelease(ReplicationSlotControlLock); - /* If we did not find the slot, error out. */ - if (slot == NULL) + return slot; +} + +/* + * Find a previously created slot and mark it as used by this process. + * + * The return value is only useful if behavior is SAB_Inquire, in which + * it's zero if we successfully acquired the slot, -1 if the slot no longer + * exists, or the PID of the owning process otherwise. If behavior is + * SAB_Error, then trying to acquire an owned slot is an error. + * If SAB_Block, we sleep until the slot is released by the owning process. + */ +int +ReplicationSlotAcquire(const char *name, SlotAcquireBehavior behavior) +{ + return ReplicationSlotAcquireInternal(NULL, name, behavior); +} + +/* + * Mark the specified slot as used by this process. + * + * Only one of slot and name can be specified. + * If slot == NULL, search for the slot with the given name. + * + * See comments about the return value in ReplicationSlotAcquire(). 
+ */ +static int +ReplicationSlotAcquireInternal(ReplicationSlot *slot, const char *name, + SlotAcquireBehavior behavior) +{ + ReplicationSlot *s; + int active_pid; + + AssertArg((slot == NULL) ^ (name == NULL)); + +retry: + Assert(MyReplicationSlot == NULL); + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + + /* + * Search for the slot with the specified name if the slot to acquire is + * not given. If the slot is not found, we either return -1 or error out. + */ + s = slot ? slot : SearchNamedReplicationSlot(name); + if (s == NULL || !s->in_use) + { + LWLockRelease(ReplicationSlotControlLock); + + if (behavior == SAB_Inquire) + return -1; ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("replication slot \"%s\" does not exist", name))); + errmsg("replication slot \"%s\" does not exist", + name ? name : NameStr(slot->data.name)))); + } /* - * If we found the slot but it's already active in another backend, we - * either error out or retry after a short wait, as caller specified. + * This is the slot we want; check if it's active under some other + * process. In single user mode, we don't need this check. + */ + if (IsUnderPostmaster) + { + /* + * Get ready to sleep on the slot in case it is active if SAB_Block. + * (We may end up not sleeping, but we don't want to do this while + * holding the spinlock.) + */ + if (behavior == SAB_Block) + ConditionVariablePrepareToSleep(&s->active_cv); + + SpinLockAcquire(&s->mutex); + if (s->active_pid == 0) + s->active_pid = MyProcPid; + active_pid = s->active_pid; + SpinLockRelease(&s->mutex); + } + else + active_pid = MyProcPid; + LWLockRelease(ReplicationSlotControlLock); + + /* + * If we found the slot but it's already active in another process, we + * either error out, return the PID of the owning process, or retry + * after a short wait, as caller specified. 
*/ if (active_pid != MyProcPid) { @@ -400,24 +443,24 @@ ReplicationSlotAcquire(const char *name, SlotAcquireBehavior behavior) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("replication slot \"%s\" is active for PID %d", - name, active_pid))); + NameStr(s->data.name), active_pid))); else if (behavior == SAB_Inquire) return active_pid; /* Wait here until we get signaled, and then restart */ - ConditionVariableSleep(&slot->active_cv, + ConditionVariableSleep(&s->active_cv, WAIT_EVENT_REPLICATION_SLOT_DROP); ConditionVariableCancelSleep(); goto retry; } - else - ConditionVariableCancelSleep(); /* no sleep needed after all */ + else if (behavior == SAB_Block) + ConditionVariableCancelSleep(); /* no sleep needed after all */ /* Let everybody know we've modified this slot */ - ConditionVariableBroadcast(&slot->active_cv); + ConditionVariableBroadcast(&s->active_cv); /* We made this slot active, so it's ours now. */ - MyReplicationSlot = slot; + MyReplicationSlot = s; /* success */ return 0; @@ -1100,43 +1143,82 @@ InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno) ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; XLogRecPtr restart_lsn = InvalidXLogRecPtr; NameData slotname; + int wspid; + int last_signaled_pid = 0; if (!s->in_use) continue; SpinLockAcquire(&s->mutex); - if (s->data.restart_lsn == InvalidXLogRecPtr || - s->data.restart_lsn >= oldestLSN) - { - SpinLockRelease(&s->mutex); - continue; - } - slotname = s->data.name; restart_lsn = s->data.restart_lsn; - SpinLockRelease(&s->mutex); + + if (XLogRecPtrIsInvalid(restart_lsn) || restart_lsn >= oldestLSN) + continue; LWLockRelease(ReplicationSlotControlLock); + /* Get ready to sleep on the slot in case it is active */ + ConditionVariablePrepareToSleep(&s->active_cv); + for (;;) { - int wspid = ReplicationSlotAcquire(NameStr(slotname), - SAB_Inquire); + /* + * Try to mark this slot as used by this process. 
+ * + * Note that ReplicationSlotAcquireInternal(SAB_Inquire) + * should not cancel the prepared condition variable + * if this slot is active in other process. Because in this case + * we have to wait on that CV for the process owning + * the slot to be terminated, later. + */ + wspid = ReplicationSlotAcquireInternal(s, NULL, SAB_Inquire); - /* no walsender? success! */ - if (wspid == 0) + /* + * Exit the loop if we successfully acquired the slot or + * the slot was dropped during waiting for the owning process + * to be terminated. For example, the latter case is likely to + * happen when the slot is temporary because it's automatically + * dropped by the termination of the owning process. + */ + if (wspid <= 0) break; - ereport(LOG, - (errmsg("terminating walsender %d because replication slot \"%s\" is too far behind", - wspid, NameStr(slotname)))); - (void) kill(wspid, SIGTERM); + /* + * Signal to terminate the process that owns the slot. + * + * There is the race condition where other process may own + * the slot after the process using it was terminated and before + * this process owns it. To handle this case, we signal again + * if the PID of the owning process is changed than the last. + * + * XXX This logic assumes that the same PID is not reused + * very quickly. + */ + if (last_signaled_pid != wspid) + { + ereport(LOG, + (errmsg("terminating process %d because replication slot \"%s\" is too far behind", + wspid, NameStr(slotname)))); + (void) kill(wspid, SIGTERM); + last_signaled_pid = wspid; + } ConditionVariableTimedSleep(&s->active_cv, 10, WAIT_EVENT_REPLICATION_SLOT_DROP); } ConditionVariableCancelSleep(); + /* + * Do nothing here and start from scratch if the slot has + * already been dropped. 
+ */ + if (wspid == -1) + { + CHECK_FOR_INTERRUPTS(); + goto restart; + } + ereport(LOG, (errmsg("invalidating slot \"%s\" because its restart_lsn %X/%X exceeds max_slot_wal_keep_size", NameStr(slotname), From be14f884d57bc9c8ec8415edafea35ba5d31af59 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 19 Jun 2020 08:57:24 -0700 Subject: [PATCH 017/334] Fix deduplication "single value" strategy bug. It was possible for deduplication's single value strategy to mistakenly believe that a very small duplicate tuple counts as one of the six large tuples that it aims to leave behind after the page finally splits. This could cause slightly suboptimal space utilization with very low cardinality indexes, though only under fairly narrow conditions. To fix, be particular about what kind of tuple counts as a maxpostingsize-capped tuple. This avoids confusion in the event of a small tuple that gets "wedged" between two large tuples, where all tuples on the page are duplicates of the same value. 
Discussion: https://postgr.es/m/CAH2-Wz=Y+sgSFc-O3LpiZX-POx2bC+okec2KafERHuzdVa7-rQ@mail.gmail.com Backpatch: 13-, where deduplication was introduced (by commit 0d861bbb) --- src/backend/access/nbtree/nbtdedup.c | 41 +++++++++++++++++++--------- src/backend/access/nbtree/nbtsort.c | 2 ++ src/backend/access/nbtree/nbtxlog.c | 1 + src/include/access/nbtree.h | 1 + 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index b20faf693daa..f6be865b17e3 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -62,7 +62,6 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, Page page = BufferGetPage(buf); BTPageOpaque opaque; Page newpage; - int newpagendataitems = 0; OffsetNumber deletable[MaxIndexTuplesPerPage]; BTDedupState state; int ndeletable = 0; @@ -124,6 +123,7 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, */ state = (BTDedupState) palloc(sizeof(BTDedupStateData)); state->deduplicate = true; + state->nmaxitems = 0; state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK); /* Metadata about base tuple of current pending posting list */ state->base = NULL; @@ -204,26 +204,25 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, * reset the state and move on without modifying the page. */ pagesaving += _bt_dedup_finish_pending(newpage, state); - newpagendataitems++; if (singlevalstrat) { /* * Single value strategy's extra steps. * - * Lower maxpostingsize for sixth and final item that might be - * deduplicated by current deduplication pass. When sixth - * item formed/observed, stop deduplicating items. + * Lower maxpostingsize for sixth and final large posting list + * tuple at the point where 5 maxpostingsize-capped tuples + * have either been formed or observed. 
* - * Note: It's possible that this will be reached even when - * current deduplication pass has yet to merge together some - * existing items. It doesn't matter whether or not the - * current call generated the maxpostingsize-capped duplicate - * tuples at the start of the page. + * When a sixth maxpostingsize-capped item is formed/observed, + * stop merging together tuples altogether. The few tuples + * that remain at the end of the page won't be merged together + * at all (at least not until after a future page split takes + * place). */ - if (newpagendataitems == 5) + if (state->nmaxitems == 5) _bt_singleval_fillfactor(page, state, newitemsz); - else if (newpagendataitems == 6) + else if (state->nmaxitems == 6) { state->deduplicate = false; singlevalstrat = false; /* won't be back here */ @@ -237,7 +236,6 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, /* Handle the last item */ pagesaving += _bt_dedup_finish_pending(newpage, state); - newpagendataitems++; /* * If no items suitable for deduplication were found, newpage must be @@ -404,7 +402,24 @@ _bt_dedup_save_htid(BTDedupState state, IndexTuple itup) (state->nhtids + nhtids) * sizeof(ItemPointerData)); if (mergedtupsz > state->maxpostingsize) + { + /* + * Count this as an oversized item for single value strategy, though + * only when there are 50 TIDs in the final posting list tuple. This + * limit (which is fairly arbitrary) avoids confusion about how many + * 1/6 of a page tuples have been encountered/created by the current + * deduplication pass. + * + * Note: We deliberately don't consider which deduplication pass + * merged together tuples to create this item (could be a previous + * deduplication pass, or current pass). See _bt_do_singleval() + * comments. 
+ */ + if (state->nhtids > 50) + state->nmaxitems++; + return false; + } /* * Save heap TIDs to pending posting list tuple -- itup can be merged into diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 15f10a29d3da..c03998834d4a 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1095,6 +1095,7 @@ _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, pfree(postingtuple); } + dstate->nmaxitems = 0; dstate->nhtids = 0; dstate->nitems = 0; dstate->phystupsize = 0; @@ -1310,6 +1311,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); dstate->deduplicate = true; /* unused */ + dstate->nmaxitems = 0; /* unused */ dstate->maxpostingsize = 0; /* set later */ /* Metadata about base tuple of current pending posting list */ dstate->base = NULL; diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 87a8612c28c4..5bec59d448dd 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -483,6 +483,7 @@ btree_xlog_dedup(XLogReaderState *record) state = (BTDedupState) palloc(sizeof(BTDedupStateData)); state->deduplicate = true; /* unused */ + state->nmaxitems = 0; /* unused */ /* Conservatively use larger maxpostingsize than primary */ state->maxpostingsize = BTMaxItemSize(page); state->base = NULL; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 3b2bcb22a70e..79506c748b2e 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -739,6 +739,7 @@ typedef struct BTDedupStateData { /* Deduplication status info for entire pass over page */ bool deduplicate; /* Still deduplicating page? 
*/ + int nmaxitems; /* Number of max-sized tuples so far */ Size maxpostingsize; /* Limit on size of final tuple */ /* Metadata about base tuple of current pending posting list */ From 816cbb59e3008112c5b217af7b9213b7a09881bf Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Fri, 19 Jun 2020 12:55:43 -0400 Subject: [PATCH 018/334] Adjust some glossary terms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mostly in response to Jürgen Purtz critique of previous definitions, though I added many other changes. Author: Álvaro Herrera Reviewed-by: Jürgen Purtz Reviewed-by: Justin Pryzby Reviewed-by: Erik Rijkers Discussion: https://postgr.es/m/c1e06008-2132-30f4-9b38-877e8683d418@purtz.de --- doc/src/sgml/glossary.sgml | 399 ++++++++++++++++++++++--------------- 1 file changed, 240 insertions(+), 159 deletions(-) diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml index 25b03f3b370f..c7c931c17e1b 100644 --- a/doc/src/sgml/glossary.sgml +++ b/doc/src/sgml/glossary.sgml @@ -23,7 +23,7 @@ - Aggregate function + Aggregate function (routine) A function that @@ -39,6 +39,11 @@ + + Analytic function + + + Analyze (operation) @@ -54,14 +59,13 @@ (Don't confuse this term with the ANALYZE option to the command.) + + For more information, see + . + - - Analytic function - - - Atomic @@ -98,8 +102,7 @@ An element with a certain name and data type found within a - tuple or - table. + tuple. @@ -389,40 +392,33 @@ - - Data directory + + Database - The base directory on the filesystem of a - server that contains all - data files and subdirectories associated with an - instance (with the - exception of tablespaces). - The environment variable PGDATA is commonly used to - refer to the - data directory. - - - An instance's storage - space comprises the data directory plus any additional tablespaces. + A named collection of + local SQL objects. For more information, see - . + . 
- - Database + + Database cluster - A named collection of - SQL objects. + A collection of databases and global SQL objects, + and their common static and dynamic metadata. + Sometimes referred to as a + cluster. - For more information, see - . + In PostgreSQL, the term + cluster is also sometimes used to refer to an instance. + (Don't confuse this term with the SQL command CLUSTER.) @@ -432,6 +428,31 @@ + + Data directory + + + The base directory on the filesystem of a + server that contains all + data files and subdirectories associated with a + database cluster + (with the exception of + tablespaces, + and optionally WAL). + The environment variable PGDATA is commonly used to + refer to the data directory. + + + A cluster's storage + space comprises the data directory plus any additional tablespaces. + + + For more information, see + . + + + + Data page @@ -578,7 +599,7 @@ - Foreign table + Foreign table (relation) A relation which appears to have @@ -631,12 +652,20 @@ - Function + Function (routine) - Any defined transformation of data. Many functions are already defined - within PostgreSQL itself, but user-defined - ones can also be added. + A type of routine that receives zero or more arguments, returns zero or more + output values, and is constrained to run within one transaction. + Functions are invoked as part of a query, for example via + SELECT. + Certain functions can return + sets; those are + called set-returning functions. + + + Functions can also be used for + triggers to invoke. For more information, see @@ -689,13 +718,12 @@ - Index + Index (relation) A relation that contains data derived from a table - (or relation types - such as a materialized view). + or materialized view. Its internal structure supports fast retrieval of and access to the original data. @@ -724,14 +752,12 @@ Instance - A set of databases and accompanying global SQL objects that are stored in - the same data directory - in a single server. 
- If running, one + A group of backend and auxiliary processes that communicate using + a common shared memory area. One postmaster process - manages a group of backend and auxiliary processes that communicate - using a common shared memory - area. Many instances can run on the same + manages the instance; one instance manages exactly one + database cluster + with all its databases. Many instances can run on the same server as long as their TCP ports do not conflict. @@ -739,14 +765,10 @@ The instance handles all key features of a DBMS: read and write access to files and shared memory, assurance of the ACID properties, - connections to client processes, + connections to + client processes, privilege verification, crash recovery, replication, etc. - - In PostgreSQL, the term - cluster is also sometimes used to refer to an instance. - (Don't confuse this term with the SQL command CLUSTER.) - @@ -769,8 +791,10 @@ Join - An SQL keyword used in SELECT statements for - combining data from multiple relations. + An operation and SQL keyword used in + queries + for combining data from multiple + relations. @@ -781,10 +805,10 @@ A means of identifying a row within a table or - relation by + other relation by values contained within one or more attributes - in that table. + in that relation. @@ -813,15 +837,6 @@ - - Log record - - - Archaic term for a WAL record. - - - - Logged @@ -855,6 +870,15 @@ + + Log record + + + Archaic term for a WAL record. + + + + Master (server) @@ -883,12 +907,13 @@ - Materialized view + Materialized view (relation) A relation that is - defined in the same way that a view - is, but stores data in the same way that a + defined by a SELECT statement + (just like a view), + but stores data in the same way that a table does. It cannot be modified via INSERT, UPDATE, or DELETE operations. @@ -949,6 +974,8 @@ One of several disjoint (not overlapping) subsets of a larger set. 
+ + In reference to a partitioned table: @@ -961,16 +988,18 @@ - In reference to a window function: + In reference to a window function + in a query, a partition is a user-defined criterion that identifies which neighboring - rows can be considered by the - function. + rows + of the query's result set + can be considered by the function. - Partitioned table + Partitioned table (relation) A relation that is @@ -997,20 +1026,6 @@ - - Primary (server) - - - When two or more databases - are linked via replication, - the server - that is considered the authoritative source of information is called - the primary, - also known as a master. - - - - Primary key @@ -1031,19 +1046,29 @@ + + Primary (server) + + + When two or more databases + are linked via replication, + the server + that is considered the authoritative source of information is called + the primary, + also known as a master. + + + + - Procedure + Procedure (routine) - A defined set of instructions for manipulating data within a - database. - A procedure can - be written in a variety of programming languages. They are - similar to functions, - but are different in that they must be invoked via the CALL - command rather than the SELECT or PERFORM - commands, and they are allowed to make transactional statements such + A type of routine. + Their distinctive qualities are that they do not return values, + and that they are allowed to make transactional statements such as COMMIT and ROLLBACK. + They are invoked via the CALL command. For more information, see @@ -1115,6 +1140,11 @@ indexes are all relations. + More generically, a relation is a set of tuples; for example, + the result of a query is also a relation. + + + In PostgreSQL, Class is an archaic synonym for relation. 
@@ -1155,16 +1185,23 @@ Result set - A data structure transmitted from a - backend process to - a client upon the - completion of an SQL - command, usually a SELECT but it can be an + A relation transmitted + from a backend process + to a client upon the + completion of an SQL command, usually a + SELECT but it can be an INSERT, UPDATE, or DELETE command if the RETURNING - clause is specified. The data structure consists of zero or more - rows with the same ordered set of - attributes. + clause is specified. + + + The fact that a result set is a relation means that a query can be used + in the definition of another query, becoming a + subquery. + + + + @@ -1216,6 +1253,27 @@ + + Routine + + + A defined set of instructions stored in the database system + that can be invoked for execution. + A routine can be written in a variety of programming + languages. Routines can be + functions + (including set-returning functions and + trigger functions), + aggregate functions, + and procedures. + + + Many routines are already defined within PostgreSQL + itself, but user-defined ones can also be added. + + + + Row @@ -1248,16 +1306,7 @@ Each SQL object must reside in exactly one schema. - The names of SQL objects of the same type in the same schema are enforced - to be unique. - There is no restriction on reusing a name in multiple schemas. - - - All system-defined SQL objects reside in schema pg_catalog, - and commonly many user-defined SQL objects reside in the default schema - public, - but it is common and recommended that other schemas are created to hold - application-specific SQL objects. + All system-defined SQL objects reside in schema pg_catalog. @@ -1299,6 +1348,19 @@ + + Sequence (relation) + + + A type of relation that is used to generate values. + Typically the generated values are sequential non-repeating numbers. + They are commonly used to generate surrogate + primary key + values. 
+ + + + + 00:00:00+1559 + 24:00:00-1559 1 microsecond From 9e496768b8a7303ea07888ea1baae8e2a57dda7b Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 3 Aug 2020 14:02:35 -0400 Subject: [PATCH 251/334] Remove unnecessary "DISTINCT" in psql's queries for \dAc and \dAf. A moment's examination of these queries is sufficient to see that they do not produce duplicate rows, unless perhaps there's catalog corruption. Using DISTINCT anyway is inefficient and confusing; moreover it sets a poor example for anyone who refers to psql -E output to see how to query the catalogs. --- src/bin/psql/describe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 57266f4fc351..d81f1575bf4c 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -6071,7 +6071,7 @@ listOperatorClasses(const char *access_method_pattern, initPQExpBuffer(&buf); printfPQExpBuffer(&buf, - "SELECT DISTINCT" + "SELECT\n" " am.amname AS \"%s\",\n" " pg_catalog.format_type(c.opcintype, NULL) AS \"%s\",\n" " CASE\n" @@ -6166,7 +6166,7 @@ listOperatorFamilies(const char *access_method_pattern, initPQExpBuffer(&buf); printfPQExpBuffer(&buf, - "SELECT DISTINCT" + "SELECT\n" " am.amname AS \"%s\",\n" " CASE\n" " WHEN pg_catalog.pg_opfamily_is_visible(f.oid)\n" From a451b7d44249b8655db8d40476ace9f84d76ab88 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 3 Aug 2020 13:04:42 -0700 Subject: [PATCH 252/334] Add nbtree page deletion assertion. Add a documenting assertion that's similar to the nearby assertion added by commit cd8c73a3. This conveys that the entire call to _bt_pagedel() does no work if it isn't possible to get a descent stack for the initial scanblkno page. 
--- src/backend/access/nbtree/nbtpage.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 70bac0052fc6..53dff3268083 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1697,6 +1697,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) if (_bt_leftsib_splitflag(rel, leftsib, leafblkno)) { ReleaseBuffer(leafbuf); + Assert(ndeleted == 0); return ndeleted; } From 9a9db08ae46209edcc5ecb120328a2bf92fd6069 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 3 Aug 2020 15:54:38 -0700 Subject: [PATCH 253/334] Fix replica backward scan race condition. It was possible for the logic used by backward scans (which must reason about concurrent page splits/deletions in its own peculiar way) to become confused when running on a replica. Concurrent replay of a WAL record that describes the second phase of page deletion could cause _bt_walk_left() to get confused. btree_xlog_unlink_page() simply failed to adhere to the same locking protocol that we use on the primary, which is obviously wrong once you consider these two disparate functions together. This bug is present in all stable branches. More concretely, the problem was that nothing stopped _bt_walk_left() from observing inconsistencies between the deletion's target page and its original sibling pages when running on a replica. This is true even though the second phase of page deletion is supposed to work as a single atomic action. Queries running on replicas raised "could not find left sibling of block %u in index %s" can't-happen errors when they went back to their scan's "original" page and observed that the page has not been marked deleted (even though it really was concurrently deleted). There is no evidence that this actually happened in the real world. The issue came to light during unrelated feature development work. 
Note that _bt_walk_left() is the only code that cares about the difference between a half-dead page and a fully deleted page that isn't also exclusively used by nbtree VACUUM (unless you include contrib/amcheck code). It seems very likely that backward scans are the only thing that could become confused by the inconsistency. Even amcheck's complex bt_right_page_check_scankey() dance was unaffected. To fix, teach btree_xlog_unlink_page() to lock the left sibling, target, and right sibling pages in that order before releasing any locks (just like _bt_unlink_halfdead_page()). This is the simplest possible approach. There doesn't seem to be any opportunity to be more clever about lock acquisition in the REDO routine, and it hardly seems worth the trouble in any case. This fix might enable contrib/amcheck verification of leaf page sibling links with only an AccessShareLock on the relation. An amcheck patch from Andrey Borodin was rejected back in January because it clashed with btree_xlog_unlink_page()'s lax approach to locking pages. It now seems likely that the real problem was with btree_xlog_unlink_page(), not the patch. This is a low severity, low likelihood bug, so no backpatch. Author: Michail Nikolaev Diagnosed-By: Michail Nikolaev Discussion: https://postgr.es/m/CANtu0ohkR-evAWbpzJu54V8eCOtqjJyYp3PQ_SGoBTRGXWhWRw@mail.gmail.com --- src/backend/access/nbtree/README | 18 ++++++ src/backend/access/nbtree/nbtxlog.c | 88 ++++++++++++++++++----------- 2 files changed, 72 insertions(+), 34 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 32ad9e339a29..9d5fc424a574 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -572,6 +572,24 @@ replay of page deletion records does not hold a write lock on the target leaf page throughout; only the primary needs to block out concurrent writers that insert on to the page being deleted.) 
+There are also locking differences between the primary and WAL replay +for the first stage of a page split (i.e. same-level differences in +locking). Replay of the first phase of a page split can get away with +locking and updating the original right sibling page (which is also the +new right sibling page's right sibling) after locks on the original page +and its new right sibling have been released. Again, this is okay +because there are no writers. Page deletion WAL replay cannot get away +with being lax about same-level locking during replay, though -- doing +so risks confusing concurrent backwards scans. + +Page deletion's second phase locks the left sibling page, target page, +and right page in order on the standby, just like on the primary. This +allows backwards scans running on a standby to reason about page +deletion on the leaf level; a page cannot appear deleted without that +being reflected in the sibling pages. It's probably possible to be more +lax about how locks are acquired on the standby during the second phase +of page deletion, but that hardly seems worth it. + During recovery all index scans start with ignore_killed_tuples = false and we never set kill_prior_tuple. 
We do this because the oldest xmin on the standby server can be older than the oldest xmin on the primary diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 5d346da84fde..09d1b0e3419a 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -774,7 +774,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); BlockNumber leftsib; BlockNumber rightsib; - Buffer buffer; + Buffer leftbuf; + Buffer target; + Buffer rightbuf; Page page; BTPageOpaque pageop; @@ -783,46 +785,39 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) /* * In normal operation, we would lock all the pages this WAL record - * touches before changing any of them. In WAL replay, it should be okay - * to lock just one page at a time, since no concurrent index updates can - * be happening, and readers should not care whether they arrive at the - * target page or not (since it's surely empty). + * touches before changing any of them. In WAL replay, we at least lock + * the pages in the same standard left-to-right order (leftsib, target, + * rightsib), and don't release the sibling locks until the target is + * marked deleted. + * + * btree_xlog_split() can get away with fixing its right sibling page's + * left link last of all, after dropping all other locks. We prefer to + * avoid dropping locks on same-level pages early compared to normal + * operation. This keeps things simple for backwards scans. See + * nbtree/README. 
*/ - /* Fix left-link of right sibling */ - if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) - { - page = (Page) BufferGetPage(buffer); - pageop = (BTPageOpaque) PageGetSpecialPointer(page); - pageop->btpo_prev = leftsib; - - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); - /* Fix right-link of left sibling, if any */ if (leftsib != P_NONE) { - if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(buffer); + page = (Page) BufferGetPage(leftbuf); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_next = rightsib; PageSetLSN(page, lsn); - MarkBufferDirty(buffer); + MarkBufferDirty(leftbuf); } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); } + else + leftbuf = InvalidBuffer; /* Rewrite target page as empty deleted page */ - buffer = XLogInitBufferForRedo(record, 0); - page = (Page) BufferGetPage(buffer); + target = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(target); - _bt_pageinit(page, BufferGetPageSize(buffer)); + _bt_pageinit(page, BufferGetPageSize(target)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = leftsib; @@ -832,8 +827,27 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) pageop->btpo_cycleid = 0; PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + MarkBufferDirty(target); + + /* Fix left-link of right sibling */ + if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(rightbuf); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + pageop->btpo_prev = leftsib; + + PageSetLSN(page, lsn); + MarkBufferDirty(rightbuf); + } + + /* Release siblings */ + if (BufferIsValid(leftbuf)) + UnlockReleaseBuffer(leftbuf); + if (BufferIsValid(rightbuf)) + UnlockReleaseBuffer(rightbuf); + + /* Release target */ + 
UnlockReleaseBuffer(target); /* * If we deleted a parent of the targeted leaf page, instead of the leaf @@ -845,13 +859,19 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) /* * There is no real data on the page, so we just re-create it from * scratch using the information from the WAL record. + * + * Note that we don't end up here when the target page is also the + * leafbuf page. There is no need to add a dummy hikey item with a + * top parent link when deleting leafbuf because it's the last page + * we'll delete in the subtree undergoing deletion. */ - IndexTupleData trunctuple; + Buffer leafbuf; + IndexTupleData trunctuple; - buffer = XLogInitBufferForRedo(record, 3); - page = (Page) BufferGetPage(buffer); + leafbuf = XLogInitBufferForRedo(record, 3); + page = (Page) BufferGetPage(leafbuf); - _bt_pageinit(page, BufferGetPageSize(buffer)); + _bt_pageinit(page, BufferGetPageSize(leafbuf)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; @@ -870,8 +890,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + MarkBufferDirty(leafbuf); + UnlockReleaseBuffer(leafbuf); } /* Update metapage if needed */ From dd877998d498c511352bd3640fd57f041c90ea62 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 4 Aug 2020 14:36:01 +0900 Subject: [PATCH 254/334] Make new SSL TAP test for channel_binding more robust The test would fail in an environment including a certificate file in ~/.postgresql/. bdd6e9b fixed a similar failure, and d6e612f introduced the same problem again with a new test. 
Author: Kyotaro Horiguchi Discussion: https://postgr.es/m/20200804.120033.31225582282178001.horikyota.ntt@gmail.com Backpatch-through: 13 --- src/test/ssl/t/002_scram.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/ssl/t/002_scram.pl b/src/test/ssl/t/002_scram.pl index 01231f8ba0f0..20ab0d5b0bc2 100644 --- a/src/test/ssl/t/002_scram.pl +++ b/src/test/ssl/t/002_scram.pl @@ -97,7 +97,7 @@ copy("ssl/client.key", $client_tmp_key); chmod 0600, $client_tmp_key; test_connect_fails( - "sslcert=ssl/client.crt sslkey=$client_tmp_key hostaddr=$SERVERHOSTADDR", + "sslcert=ssl/client.crt sslkey=$client_tmp_key sslrootcert=invalid hostaddr=$SERVERHOSTADDR", "dbname=certdb user=ssltestuser channel_binding=require", qr/channel binding required, but server authenticated client without channel binding/, "Cert authentication and channel_binding=require"); From 0f76294260b92849c4958fb706ecd5b5cd73e40e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 4 Aug 2020 15:20:31 -0400 Subject: [PATCH 255/334] Increase hard-wired timeout values in ecpg regression tests. A couple of test cases had connect_timeout=14, a value that seems to have been plucked from a hat. While it's more than sufficient for normal cases, slow/overloaded buildfarm machines can get a timeout failure here, as per recent report from "sungazer". Increase to 180 seconds, which is in line with our typical timeouts elsewhere in the regression tests. Back-patch to 9.6; the code looks different in 9.5, and this doesn't seem to be quite worth the effort to adapt to that. 
Report: https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=sungazer&dt=2020-08-04%2007%3A12%3A22 --- src/interfaces/ecpg/test/connect/test1.pgc | 2 +- src/interfaces/ecpg/test/connect/test5.pgc | 2 +- src/interfaces/ecpg/test/expected/connect-test1-minGW32.stderr | 2 +- src/interfaces/ecpg/test/expected/connect-test1.c | 2 +- src/interfaces/ecpg/test/expected/connect-test1.stderr | 2 +- src/interfaces/ecpg/test/expected/connect-test5.c | 2 +- src/interfaces/ecpg/test/expected/connect-test5.stderr | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/interfaces/ecpg/test/connect/test1.pgc b/src/interfaces/ecpg/test/connect/test1.pgc index 82cdfb8fc56e..961bd72ef2a9 100644 --- a/src/interfaces/ecpg/test/connect/test1.pgc +++ b/src/interfaces/ecpg/test/connect/test1.pgc @@ -46,7 +46,7 @@ exec sql end declare section; exec sql connect to unix:postgresql://localhost/ecpg2_regression user regress_ecpg_user1 using "connectpw"; exec sql disconnect; - exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=14 user regress_ecpg_user1; + exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=180 user regress_ecpg_user1; exec sql disconnect; /* wrong db */ diff --git a/src/interfaces/ecpg/test/connect/test5.pgc b/src/interfaces/ecpg/test/connect/test5.pgc index 2e34ab84fc62..e712fa87783f 100644 --- a/src/interfaces/ecpg/test/connect/test5.pgc +++ b/src/interfaces/ecpg/test/connect/test5.pgc @@ -55,7 +55,7 @@ exec sql end declare section; exec sql connect to 'unix:postgresql://localhost/ecpg2_regression' as main user :user USING "connectpw"; exec sql disconnect main; - exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=14&client_encoding=latin1 as main user regress_ecpg_user1/connectpw; + exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=180&client_encoding=latin1 as main user regress_ecpg_user1/connectpw; exec sql disconnect main; exec sql 
connect to "unix:postgresql://200.46.204.71/ecpg2_regression" as main user regress_ecpg_user1/connectpw; diff --git a/src/interfaces/ecpg/test/expected/connect-test1-minGW32.stderr b/src/interfaces/ecpg/test/expected/connect-test1-minGW32.stderr index b334537b6005..853453d980ec 100644 --- a/src/interfaces/ecpg/test/expected/connect-test1-minGW32.stderr +++ b/src/interfaces/ecpg/test/expected/connect-test1-minGW32.stderr @@ -48,7 +48,7 @@ [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection ecpg2_regression closed [NO_PID]: sqlca: code: 0, state: 00000 -[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=14 for user regress_ecpg_user1 +[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=180 for user regress_ecpg_user1 [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection ecpg2_regression closed [NO_PID]: sqlca: code: 0, state: 00000 diff --git a/src/interfaces/ecpg/test/expected/connect-test1.c b/src/interfaces/ecpg/test/expected/connect-test1.c index 894273339cd2..ffd24e2fc8f4 100644 --- a/src/interfaces/ecpg/test/expected/connect-test1.c +++ b/src/interfaces/ecpg/test/expected/connect-test1.c @@ -93,7 +93,7 @@ main(void) #line 47 "test1.pgc" - { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=14" , "regress_ecpg_user1" , NULL , NULL, 0); } + { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=180" , "regress_ecpg_user1" , NULL , NULL, 0); } #line 49 "test1.pgc" { ECPGdisconnect(__LINE__, "CURRENT");} diff --git a/src/interfaces/ecpg/test/expected/connect-test1.stderr b/src/interfaces/ecpg/test/expected/connect-test1.stderr index c5cbf749efea..1986fc54adc2 100644 --- a/src/interfaces/ecpg/test/expected/connect-test1.stderr +++ b/src/interfaces/ecpg/test/expected/connect-test1.stderr @@ -48,7 +48,7 @@ [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: 
connection ecpg2_regression closed [NO_PID]: sqlca: code: 0, state: 00000 -[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=14 for user regress_ecpg_user1 +[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=180 for user regress_ecpg_user1 [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection ecpg2_regression closed [NO_PID]: sqlca: code: 0, state: 00000 diff --git a/src/interfaces/ecpg/test/expected/connect-test5.c b/src/interfaces/ecpg/test/expected/connect-test5.c index b44104854da0..6ae5b589dea4 100644 --- a/src/interfaces/ecpg/test/expected/connect-test5.c +++ b/src/interfaces/ecpg/test/expected/connect-test5.c @@ -121,7 +121,7 @@ main(void) #line 56 "test5.pgc" - { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=14 & client_encoding=latin1" , "regress_ecpg_user1" , "connectpw" , "main", 0); } + { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=180 & client_encoding=latin1" , "regress_ecpg_user1" , "connectpw" , "main", 0); } #line 58 "test5.pgc" { ECPGdisconnect(__LINE__, "main");} diff --git a/src/interfaces/ecpg/test/expected/connect-test5.stderr b/src/interfaces/ecpg/test/expected/connect-test5.stderr index cefdb0739e5b..a54df175fbf0 100644 --- a/src/interfaces/ecpg/test/expected/connect-test5.stderr +++ b/src/interfaces/ecpg/test/expected/connect-test5.stderr @@ -61,7 +61,7 @@ [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection main closed [NO_PID]: sqlca: code: 0, state: 00000 -[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=14 & client_encoding=latin1 for user regress_ecpg_user1 +[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=180 & client_encoding=latin1 for user regress_ecpg_user1 [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection main 
closed [NO_PID]: sqlca: code: 0, state: 00000 From f47b5e139579a77c1f7c63400f01ea39d515e8c8 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 5 Aug 2020 02:15:34 +0300 Subject: [PATCH 256/334] Remove btree page items after page unlink Currently, page unlink leaves remaining items "as is", but replay of corresponding WAL-record re-initializes page leaving it with no items. For the sake of consistency, this commit makes primary delete all the items during page unlink as well. Thanks to this change, we now don't mask contents of deleted btree page for WAL consistency checking. Discussion: https://postgr.es/m/CAPpHfdt_OTyQpXaPJcWzV2N-LNeNJseNB-K_A66qG%3DL518VTFw%40mail.gmail.com Author: Alexander Korotkov Reviewed-by: Peter Geoghegan --- contrib/amcheck/verify_nbtree.c | 7 ++----- src/backend/access/nbtree/nbtpage.c | 9 +++++++++ src/backend/access/nbtree/nbtxlog.c | 10 +--------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index e4d501a85d1f..c9f9e755dccc 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -2864,11 +2864,8 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) * As noted at the beginning of _bt_binsrch(), an internal page must have * children, since there must always be a negative infinity downlink * (there may also be a highkey). In the case of non-rightmost leaf - * pages, there must be at least a highkey. Deleted pages on replica - * might contain no items, because page unlink re-initializes - * page-to-be-deleted. Deleted pages with no items might be on primary - * too due to preceding recovery, but on primary new deletions can't - * happen concurrently to amcheck. + * pages, there must be at least a highkey. The exceptions are deleted + * pages, which contain no items. 
* * This is correct when pages are half-dead, since internal pages are * never half-dead, and leaf pages must have a high key when half-dead diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 53dff3268083..d5db9aaa3a13 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -2058,6 +2058,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, BTMetaPageData *metad = NULL; ItemId itemid; Page page; + PageHeader header; BTPageOpaque opaque; bool rightsib_is_rightmost; int targetlevel; @@ -2327,6 +2328,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, opaque->btpo_flags |= BTP_DELETED; opaque->btpo.xact = ReadNewTransactionId(); + /* + * Remove the remaining tuples on the page. This keeps things simple for + * WAL consistency checking. + */ + header = (PageHeader) page; + header->pd_lower = SizeOfPageHeaderData; + header->pd_upper = header->pd_special; + /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) { diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 09d1b0e3419a..be0fa450f31d 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -1051,15 +1051,7 @@ btree_mask(char *pagedata, BlockNumber blkno) maskopaq = (BTPageOpaque) PageGetSpecialPointer(page); - if (P_ISDELETED(maskopaq)) - { - /* - * Mask page content on a DELETED page since it will be re-initialized - * during replay. See btree_xlog_unlink_page() for details. - */ - mask_page_content(page); - } - else if (P_ISLEAF(maskopaq)) + if (P_ISLEAF(maskopaq)) { /* * In btree leaf pages, it is possible to modify the LP_FLAGS without From 7a980dfc6c15add6ec3309932cf3061bb6745f65 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 5 Aug 2020 15:38:55 -0400 Subject: [PATCH 257/334] Fix matching of sub-partitions when a partitioned plan is stale. 
Since we no longer require AccessExclusiveLock to add a partition, the executor may see that a partitioned table has more partitions than the planner saw. ExecCreatePartitionPruneState's code for matching up the partition lists in such cases was faulty, and would misbehave if the planner had successfully pruned any partitions from the query. (Thus, trouble would occur only if a partition addition happens concurrently with a query that uses both static and dynamic partition pruning.) This led to an Assert failure in debug builds, and probably to crashes or query misbehavior in production builds. To repair the bug, just explicitly skip zeroes in the plan's relid_map[] list. I also made some cosmetic changes to make the code more readable (IMO anyway). Also, convert the cross-checking Assert to a regular test-and-elog, since it's now apparent that this logic is more fragile than one would like. Currently, there's no way to repeatably exercise this code, except with manual use of a debugger to stop the backend between planning and execution. Hence, no test case in this patch. We oughta do something about that testability gap, but that's for another day. Amit Langote and Tom Lane, per report from Justin Pryzby. Oversight in commit 898e5e329; backpatch to v12 where that appeared. Discussion: https://postgr.es/m/20200802181131.GA27754@telsasoft.com --- src/backend/executor/execPartition.c | 47 +++++++++++++++++++++------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index fb6ce4905681..79fcbd6b0665 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -1667,26 +1667,51 @@ ExecCreatePartitionPruneState(PlanState *planstate, * present in the one used to construct subplan_map and * subpart_map. 
So we must construct new and longer arrays * where the partitions that were originally present map to - * the same place, and any added indexes map to -1, as if the - * new partitions had been pruned. + * the same sub-structures, and any added partitions map to + * -1, as if the new partitions had been pruned. + * + * Note: pinfo->relid_map[] may contain InvalidOid entries for + * partitions pruned by the planner. We cannot tell exactly + * which of the partdesc entries these correspond to, but we + * don't have to; just skip over them. The non-pruned + * relid_map entries, however, had better be a subset of the + * partdesc entries and in the same order. */ pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts); - for (pp_idx = 0; pp_idx < partdesc->nparts; ++pp_idx) + for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++) { - if (pinfo->relid_map[pd_idx] != partdesc->oids[pp_idx]) - { - pprune->subplan_map[pp_idx] = -1; - pprune->subpart_map[pp_idx] = -1; - } - else + /* Skip any InvalidOid relid_map entries */ + while (pd_idx < pinfo->nparts && + !OidIsValid(pinfo->relid_map[pd_idx])) + pd_idx++; + + if (pd_idx < pinfo->nparts && + pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx]) { + /* match... */ pprune->subplan_map[pp_idx] = pinfo->subplan_map[pd_idx]; pprune->subpart_map[pp_idx] = - pinfo->subpart_map[pd_idx++]; + pinfo->subpart_map[pd_idx]; + pd_idx++; + } + else + { + /* this partdesc entry is not in the plan */ + pprune->subplan_map[pp_idx] = -1; + pprune->subpart_map[pp_idx] = -1; } } - Assert(pd_idx == pinfo->nparts); + + /* + * It might seem that we need to skip any trailing InvalidOid + * entries in pinfo->relid_map before checking that we scanned + * all of the relid_map. But we will have skipped them above, + * because they must correspond to some partdesc->oids + * entries; we just couldn't tell which. 
+ */ + if (pd_idx != pinfo->nparts) + elog(ERROR, "could not match partition child tables to plan elements"); } /* present_parts is also subject to later modification */ From a6775352476ac92d6b3eb3ae2dfd2775e3622afe Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Wed, 5 Aug 2020 17:12:10 -0400 Subject: [PATCH 258/334] doc: clarify "state" table reference in tutorial Reported-by: Vyacheslav Shablistyy Discussion: https://postgr.es/m/159586122762.680.1361378513036616007@wrigleys.postgresql.org Backpatch-through: 9.5 --- doc/src/sgml/advanced.sgml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/advanced.sgml b/doc/src/sgml/advanced.sgml index f6c4627c3e0f..d77312600f7b 100644 --- a/doc/src/sgml/advanced.sgml +++ b/doc/src/sgml/advanced.sgml @@ -628,8 +628,9 @@ CREATE TABLE capitals ( parent, cities. The type of the column name is text, a native PostgreSQL - type for variable length character strings. State capitals have - an extra column, state, that shows their state. In + type for variable length character strings. The + capitals table has + an extra column, state, which shows their states. In PostgreSQL, a table can inherit from zero or more other tables. From bab150045bd9766869f471ede88734ea0989261c Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 6 Aug 2020 14:13:03 -0400 Subject: [PATCH 259/334] Register llvm_shutdown using on_proc_exit, not before_shmem_exit. This seems more correct, because other before_shmem_exit calls may expect the infrastructure that is needed to run queries and access the database to be working, and also because this cleanup has nothing to do with shared memory. There are no known user-visible consequences to this, though, apart from what was previously fixed by commit 303640199d0436c5e7acdf50b837a027b5726594 and back-patched as commit bcbc27251d35336a6442761f59638138a772b839 and commit f7013683d9bb663a6a917421b1374306a32f165b, so for now, no back-patch.
Bharath Rupireddy Discussion: http://postgr.es/m/CALj2ACWk7j4F2v2fxxYfrroOF=AdFNPr1WsV+AGtHAFQOqm_pw@mail.gmail.com --- src/backend/jit/llvm/llvmjit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/jit/llvm/llvmjit.c b/src/backend/jit/llvm/llvmjit.c index af8b34aaaf3c..43bed78a5299 100644 --- a/src/backend/jit/llvm/llvmjit.c +++ b/src/backend/jit/llvm/llvmjit.c @@ -683,7 +683,7 @@ llvm_session_initialize(void) } #endif - before_shmem_exit(llvm_shutdown, 0); + on_proc_exit(llvm_shutdown, 0); llvm_session_initialized = true; From d5e96520ffca8eeeefc11f8fc82af610f68e63a8 Mon Sep 17 00:00:00 2001 From: David Rowley Date: Fri, 7 Aug 2020 10:22:18 +1200 Subject: [PATCH 260/334] Fix bogus EXPLAIN output for Hash Aggregate 9bdb300de modified the EXPLAIN output for Hash Aggregate to show details from parallel workers. However, it neglected to consider that a given parallel worker may not have assisted with the given Hash Aggregate. This can occur when workers fail to start or during Parallel Append with enable_partitionwise_join enabled when only a single worker is working on a non-parallel aware sub-plan. It could also happen if a worker simply wasn't fast enough to get any work done before other processes went and finished all the work. The bogus output came from the fact that ExplainOpenWorker() skipped showing any details for non-initialized workers but show_hashagg_info() did show details from the worker. This meant that the worker properties that were shown were not properly attributed to the worker that they belong to. In passing, we also now don't show Hash Aggregate properties for the leader process when it did not contribute any work to the Hash Aggregate. This can occur either during Parallel Append when only a parallel worker worked on a given sub plan or with parallel_leader_participation set to off. This aims to make the behavior of Hash Aggregate's EXPLAIN output more similar to Sort's. 
Reported-by: Justin Pryzby Discussion: https://postgr.es/m/20200805012105.GZ28072%40telsasoft.com Backpatch-through: 13, where the original breakage was introduced --- src/backend/commands/explain.c | 63 ++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 1e565fd33755..30e0a7ee7f21 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3063,15 +3063,19 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) ExplainPropertyInteger("Planned Partitions", NULL, aggstate->hash_planned_partitions, es); - if (!es->analyze) - return; - - /* EXPLAIN ANALYZE */ - ExplainPropertyInteger("HashAgg Batches", NULL, - aggstate->hash_batches_used, es); - ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); - ExplainPropertyInteger("Disk Usage", "kB", - aggstate->hash_disk_used, es); + /* + * During parallel query the leader may have not helped out. We + * detect this by checking how much memory it used. If we find it + * didn't do any work then we don't show its properties. + */ + if (es->analyze && aggstate->hash_mem_peak > 0) + { + ExplainPropertyInteger("HashAgg Batches", NULL, + aggstate->hash_batches_used, es); + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); + ExplainPropertyInteger("Disk Usage", "kB", + aggstate->hash_disk_used, es); + } } else { @@ -3085,26 +3089,32 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) gotone = true; } - if (!es->analyze) + /* + * During parallel query the leader may have not helped out. We + * detect this by checking how much memory it used. If we find it + * didn't do any work then we don't show its properties. 
+ */ + if (es->analyze && aggstate->hash_mem_peak > 0) { - if (gotone) - appendStringInfoChar(es->str, '\n'); - return; - } + if (!gotone) + ExplainIndentText(es); + else + appendStringInfoString(es->str, " "); - if (!gotone) - ExplainIndentText(es); - else - appendStringInfoString(es->str, " "); + appendStringInfo(es->str, "Batches: %d Memory Usage: " INT64_FORMAT "kB", + aggstate->hash_batches_used, memPeakKb); + gotone = true; - appendStringInfo(es->str, "Batches: %d Memory Usage: " INT64_FORMAT "kB", - aggstate->hash_batches_used, memPeakKb); + /* Only display disk usage if we spilled to disk */ + if (aggstate->hash_batches_used > 1) + { + appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT "kB", + aggstate->hash_disk_used); + } + } - /* Only display disk usage if we spilled to disk */ - if (aggstate->hash_batches_used > 1) - appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT "kB", - aggstate->hash_disk_used); - appendStringInfoChar(es->str, '\n'); + if (gotone) + appendStringInfoChar(es->str, '\n'); } /* Display stats for each parallel worker */ @@ -3117,6 +3127,9 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) int hash_batches_used; sinstrument = &aggstate->shared_info->sinstrument[n]; + /* Skip workers that didn't do anything */ + if (sinstrument->hash_mem_peak == 0) + continue; hash_disk_used = sinstrument->hash_disk_used; hash_batches_used = sinstrument->hash_batches_used; memPeakKb = (sinstrument->hash_mem_peak + 1023) / 1024; From c254d8d7b20bf629420b407a5451c3b32d1a7b0b Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 6 Aug 2020 15:25:49 -0700 Subject: [PATCH 261/334] amcheck: Sanitize metapage's allequalimage field. This will be helpful if it ever proves necessary to revoke an opclass's support for deduplication. Backpatch: 13-, where nbtree deduplication was introduced. 
--- contrib/amcheck/verify_nbtree.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index c9f9e755dccc..384a8ac747e1 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -305,8 +305,20 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, errmsg("index \"%s\" lacks a main relation fork", RelationGetRelationName(indrel)))); - /* Check index, possibly against table it is an index on */ + /* Extract metadata from metapage, and sanitize it in passing */ _bt_metaversion(indrel, &heapkeyspace, &allequalimage); + if (allequalimage && !heapkeyspace) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" metapage has equalimage field set on unsupported nbtree version", + RelationGetRelationName(indrel)))); + if (allequalimage && !_bt_allequalimage(indrel, false)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" metapage incorrectly indicates that deduplication is safe", + RelationGetRelationName(indrel)))); + + /* Check index, possibly against table it is an index on */ bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, heapallindexed, rootdescend); } From 3a3be80641c01e675d0ed484f15df8ec536d0a06 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 6 Aug 2020 16:23:52 -0700 Subject: [PATCH 262/334] Remove obsolete amcheck comment. Oversight in commit d114cc53871. --- contrib/amcheck/verify_nbtree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 384a8ac747e1..b87a3cb4717c 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -903,7 +903,6 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) * tuple. * * - That downlink to block was encountered in parent where that's expected. - * (Limited to readonly callers.) 
* * - That high keys of child pages matches corresponding pivot keys in parent. * From 199cec9779504c08aaa8159c6308283156547409 Mon Sep 17 00:00:00 2001 From: Etsuro Fujita Date: Fri, 7 Aug 2020 14:45:00 +0900 Subject: [PATCH 263/334] Fix yet another issue with step generation in partition pruning. Commit 13838740f fixed some issues with step generation in partition pruning, but there was yet another one: get_steps_using_prefix() assumes that clauses in the passed-in prefix list are sorted in ascending order of their partition key numbers, but the caller failed to ensure this for range partitioning, which led to an assertion failure in debug builds. Adjust the caller function to arrange the clauses in the prefix list in the required order for range partitioning. Back-patch to v11, like the previous commit. Patch by me, reviewed by Amit Langote. Discussion: https://postgr.es/m/CAPmGK16jkXiFG0YqMbU66wte-oJTfW6D1HaNvQf%3D%2B5o9%3Dm55wQ%40mail.gmail.com --- src/backend/partitioning/partprune.c | 138 ++++++++++-------- src/test/regress/expected/partition_prune.out | 10 ++ src/test/regress/sql/partition_prune.sql | 5 + 3 files changed, 96 insertions(+), 57 deletions(-) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 253c69064982..6268623d5699 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -1362,7 +1362,6 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context, List *eq_clauses = btree_clauses[BTEqualStrategyNumber]; List *le_clauses = btree_clauses[BTLessEqualStrategyNumber]; List *ge_clauses = btree_clauses[BTGreaterEqualStrategyNumber]; - bool pk_has_clauses[PARTITION_MAX_KEYS]; int strat; /* @@ -1382,10 +1381,15 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context, foreach(lc, btree_clauses[strat]) { PartClauseInfo *pc = lfirst(lc); + ListCell *eq_start; + ListCell *le_start; + ListCell *ge_start; ListCell *lc1; List *prefix = NIL; List *pc_steps; 
bool prefix_valid = true; + bool pk_has_clauses; + int keyno; /* * If this is a clause for the first partition key, @@ -1410,79 +1414,96 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context, continue; } - /* (Re-)initialize the pk_has_clauses array */ - Assert(pc->keyno > 0); - for (i = 0; i < pc->keyno; i++) - pk_has_clauses[i] = false; + eq_start = list_head(eq_clauses); + le_start = list_head(le_clauses); + ge_start = list_head(ge_clauses); /* - * Expressions from = clauses can always be in the - * prefix, provided they're from an earlier key. + * We arrange clauses into prefix in ascending order + * of their partition key numbers. */ - foreach(lc1, eq_clauses) + for (keyno = 0; keyno < pc->keyno; keyno++) { - PartClauseInfo *eqpc = lfirst(lc1); + pk_has_clauses = false; - if (eqpc->keyno == pc->keyno) - break; - if (eqpc->keyno < pc->keyno) + /* + * Expressions from = clauses can always be in the + * prefix, provided they're from an earlier key. + */ + for_each_cell(lc1, eq_clauses, eq_start) { - prefix = lappend(prefix, eqpc); - pk_has_clauses[eqpc->keyno] = true; - } - } + PartClauseInfo *eqpc = lfirst(lc1); - /* - * If we're generating steps for keyno == pc->keyno) + if (eqpc->keyno == keyno) + { + prefix = lappend(prefix, eqpc); + pk_has_clauses = true; + } + else + { + Assert(eqpc->keyno > keyno); break; - if (lepc->keyno < pc->keyno) + } + } + eq_start = lc1; + + /* + * If we're generating steps for keyno] = true; + PartClauseInfo *lepc = lfirst(lc1); + + if (lepc->keyno == keyno) + { + prefix = lappend(prefix, lepc); + pk_has_clauses = true; + } + else + { + Assert(lepc->keyno > keyno); + break; + } } + le_start = lc1; } - } - /* - * If we're generating steps for >/>= strategy, we can - * add other >= clauses to the prefix, provided - * they're from an earlier key. 
- */ - if (strat == BTGreaterStrategyNumber || - strat == BTGreaterEqualStrategyNumber) - { - foreach(lc1, ge_clauses) + /* + * If we're generating steps for >/>= strategy, we + * can add other >= clauses to the prefix, + * provided they're from an earlier key. + */ + if (strat == BTGreaterStrategyNumber || + strat == BTGreaterEqualStrategyNumber) { - PartClauseInfo *gepc = lfirst(lc1); - - if (gepc->keyno == pc->keyno) - break; - if (gepc->keyno < pc->keyno) + for_each_cell(lc1, ge_clauses, ge_start) { - prefix = lappend(prefix, gepc); - pk_has_clauses[gepc->keyno] = true; + PartClauseInfo *gepc = lfirst(lc1); + + if (gepc->keyno == keyno) + { + prefix = lappend(prefix, gepc); + pk_has_clauses = true; + } + else + { + Assert(gepc->keyno > keyno); + break; + } } + ge_start = lc1; } - } - /* - * Check whether every earlier partition key has at - * least one clause. - */ - for (i = 0; i < pc->keyno; i++) - { - if (!pk_has_clauses[i]) + /* + * If this key has no clauses, prefix is not valid + * anymore. + */ + if (!pk_has_clauses) { prefix_valid = false; break; @@ -2241,6 +2262,9 @@ match_clause_to_partition_key(GeneratePruningStepsContext *context, * non-NULL, but they must ensure that prefix contains at least one clause * for each of the partition keys other than those specified in step_nullkeys * and step_lastkeyno. + * + * For both cases, callers must also ensure that clauses in prefix are sorted + * in ascending order of their partition key numbers. 
*/ static List * get_steps_using_prefix(GeneratePruningStepsContext *context, diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 687cf8c5f415..50d2a7e4b975 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -3711,6 +3711,16 @@ explain (costs off) select * from rp_prefix_test3 where a >= 1 and b >= 1 and b Filter: ((a >= 1) AND (b >= 1) AND (b >= 2) AND (c >= 2) AND (d >= 0)) (2 rows) +-- Test that get_steps_using_prefix() handles a prefix that contains multiple +-- clauses for the partition key b (ie, b >= 1 and b = 2) (This also tests +-- that the caller arranges clauses in that prefix in the required order) +explain (costs off) select * from rp_prefix_test3 where a >= 1 and b >= 1 and b = 2 and c = 2 and d >= 0; + QUERY PLAN +------------------------------------------------------------------------ + Seq Scan on rp_prefix_test3_p2 rp_prefix_test3 + Filter: ((a >= 1) AND (b >= 1) AND (d >= 0) AND (b = 2) AND (c = 2)) +(2 rows) + create table hp_prefix_test (a int, b int, c int, d int) partition by hash (a part_test_int4_ops, b part_test_int4_ops, c part_test_int4_ops, d part_test_int4_ops); create table hp_prefix_test_p1 partition of hp_prefix_test for values with (modulus 2, remainder 0); create table hp_prefix_test_p2 partition of hp_prefix_test for values with (modulus 2, remainder 1); diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 93ef9dc1f340..1e904a8c5b7b 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -1080,6 +1080,11 @@ create table rp_prefix_test3_p2 partition of rp_prefix_test3 for values from (2, -- clauses for the partition key b (ie, b >= 1 and b >= 2) explain (costs off) select * from rp_prefix_test3 where a >= 1 and b >= 1 and b >= 2 and c >= 2 and d >= 0; +-- Test that get_steps_using_prefix() handles a prefix that 
contains multiple +-- clauses for the partition key b (ie, b >= 1 and b = 2) (This also tests +-- that the caller arranges clauses in that prefix in the required order) +explain (costs off) select * from rp_prefix_test3 where a >= 1 and b >= 1 and b = 2 and c = 2 and d >= 0; + create table hp_prefix_test (a int, b int, c int, d int) partition by hash (a part_test_int4_ops, b part_test_int4_ops, c part_test_int4_ops, d part_test_int4_ops); create table hp_prefix_test_p1 partition of hp_prefix_test for values with (modulus 2, remainder 0); create table hp_prefix_test_p2 partition of hp_prefix_test for values with (modulus 2, remainder 1); From 3df92bbd1dba98f72e3f005406463b0718193a0f Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 7 Aug 2020 09:53:27 -0700 Subject: [PATCH 264/334] Rename nbtree split REDO routine variables. Make the nbtree page split REDO routine variable names consistent with _bt_split() (which handles the original execution of page splits). These names make the code easier to follow by making the distinction between the original page and the left half of the split clear. (The left half of the split page is a temp page that REDO creates to replace the origpage contents.) Also reduce the elevel used when adding a new high key to the temp page from PANIC to ERROR to be consistent. We already only raise an ERROR when data item PageAddItem() temp page calls fail. 
--- src/backend/access/nbtree/nbtxlog.c | 96 ++++++++++++++--------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index be0fa450f31d..1fd639246328 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -256,20 +256,20 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) XLogRecPtr lsn = record->EndRecPtr; xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); bool isleaf = (xlrec->level == 0); - Buffer lbuf; + Buffer buf; Buffer rbuf; Page rpage; BTPageOpaque ropaque; char *datapos; Size datalen; - BlockNumber leftsib; - BlockNumber rightsib; - BlockNumber rnext; + BlockNumber origpagenumber; + BlockNumber rightpagenumber; + BlockNumber spagenumber; - XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib); - XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib); - if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext)) - rnext = P_NONE; + XLogRecGetBlockTag(record, 0, NULL, NULL, &origpagenumber); + XLogRecGetBlockTag(record, 1, NULL, NULL, &rightpagenumber); + if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &spagenumber)) + spagenumber = P_NONE; /* * Clear the incomplete split flag on the left sibling of the child page @@ -287,8 +287,8 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) _bt_pageinit(rpage, BufferGetPageSize(rbuf)); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); - ropaque->btpo_prev = leftsib; - ropaque->btpo_next = rnext; + ropaque->btpo_prev = origpagenumber; + ropaque->btpo_next = spagenumber; ropaque->btpo.level = xlrec->level; ropaque->btpo_flags = isleaf ? 
BTP_LEAF : 0; ropaque->btpo_cycleid = 0; @@ -298,8 +298,8 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) PageSetLSN(rpage, lsn); MarkBufferDirty(rbuf); - /* Now reconstruct left (original) sibling page */ - if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO) + /* Now reconstruct original page (left half of split) */ + if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) { /* * To retain the same physical order of the tuples that they had, we @@ -309,15 +309,15 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) * checking possible. See also _bt_restore_page(), which does the * same for the right page. */ - Page lpage = (Page) BufferGetPage(lbuf); - BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); + Page origpage = (Page) BufferGetPage(buf); + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); OffsetNumber off; IndexTuple newitem = NULL, left_hikey = NULL, nposting = NULL; Size newitemsz = 0, left_hikeysz = 0; - Page newlpage; + Page leftpage; OffsetNumber leftoff, replacepostingoff = InvalidOffsetNumber; @@ -340,8 +340,8 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) /* Use mutable, aligned newitem copy in _bt_swap_posting() */ newitem = CopyIndexTuple(newitem); - itemid = PageGetItemId(lpage, replacepostingoff); - oposting = (IndexTuple) PageGetItem(lpage, itemid); + itemid = PageGetItemId(origpage, replacepostingoff); + oposting = (IndexTuple) PageGetItem(origpage, itemid); nposting = _bt_swap_posting(newitem, oposting, xlrec->postingoff); } @@ -359,16 +359,16 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) Assert(datalen == 0); - newlpage = PageGetTempPageCopySpecial(lpage); + leftpage = PageGetTempPageCopySpecial(origpage); - /* Set high key */ + /* Add high key tuple from WAL record to temp page */ leftoff = P_HIKEY; - if (PageAddItem(newlpage, (Item) left_hikey, left_hikeysz, - P_HIKEY, false, false) == InvalidOffsetNumber) - 
elog(PANIC, "failed to add high key to left page after split"); + if (PageAddItem(leftpage, (Item) left_hikey, left_hikeysz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add high key to left page after split"); leftoff = OffsetNumberNext(leftoff); - for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstrightoff; off++) + for (off = P_FIRSTDATAKEY(oopaque); off < xlrec->firstrightoff; off++) { ItemId itemid; Size itemsz; @@ -379,7 +379,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) { Assert(newitemonleft || xlrec->firstrightoff == xlrec->newitemoff); - if (PageAddItem(newlpage, (Item) nposting, + if (PageAddItem(leftpage, (Item) nposting, MAXALIGN(IndexTupleSize(nposting)), leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new posting list item to left page after split"); @@ -390,16 +390,16 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) /* add the new item if it was inserted on left page */ else if (newitemonleft && off == xlrec->newitemoff) { - if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, + if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } - itemid = PageGetItemId(lpage, off); + itemid = PageGetItemId(origpage, off); itemsz = ItemIdGetLength(itemid); - item = (IndexTuple) PageGetItem(lpage, itemid); - if (PageAddItem(newlpage, (Item) item, itemsz, leftoff, + item = (IndexTuple) PageGetItem(origpage, itemid); + if (PageAddItem(leftpage, (Item) item, itemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add old item to left page after split"); leftoff = OffsetNumberNext(leftoff); @@ -408,31 +408,31 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) /* cope with possibility that newitem goes at the end */ if (newitemonleft && off == xlrec->newitemoff) { - if 
(PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, + if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } - PageRestoreTempPage(newlpage, lpage); + PageRestoreTempPage(leftpage, origpage); /* Fix opaque fields */ - lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; + oopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; if (isleaf) - lopaque->btpo_flags |= BTP_LEAF; - lopaque->btpo_next = rightsib; - lopaque->btpo_cycleid = 0; + oopaque->btpo_flags |= BTP_LEAF; + oopaque->btpo_next = rightpagenumber; + oopaque->btpo_cycleid = 0; - PageSetLSN(lpage, lsn); - MarkBufferDirty(lbuf); + PageSetLSN(origpage, lsn); + MarkBufferDirty(buf); } /* * We no longer need the buffers. They must be released together, so that * readers cannot observe two inconsistent halves. */ - if (BufferIsValid(lbuf)) - UnlockReleaseBuffer(lbuf); + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); UnlockReleaseBuffer(rbuf); /* @@ -443,22 +443,22 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) * replay, because no other index update can be in progress, and readers * will cope properly when following an obsolete left-link. 
*/ - if (rnext != P_NONE) + if (spagenumber != P_NONE) { - Buffer buffer; + Buffer sbuf; - if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 2, &sbuf) == BLK_NEEDS_REDO) { - Page page = (Page) BufferGetPage(buffer); - BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); + Page spage = (Page) BufferGetPage(sbuf); + BTPageOpaque spageop = (BTPageOpaque) PageGetSpecialPointer(spage); - pageop->btpo_prev = rightsib; + spageop->btpo_prev = rightpagenumber; - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); + PageSetLSN(spage, lsn); + MarkBufferDirty(sbuf); } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); + if (BufferIsValid(sbuf)) + UnlockReleaseBuffer(sbuf); } } From 6f0b632f083ba08fabb6c496caf733802cee9d2e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 7 Aug 2020 14:30:41 -0400 Subject: [PATCH 265/334] Support testing of cases where table schemas change after planning. We have various cases where we allow DDL on tables to be performed with less than full AccessExclusiveLock. This requires concurrent queries to be able to cope with the DDL change mid-flight, but up to now we had no repeatable way to test such cases. To improve that, invent a test module that allows halting a backend after planning and then resuming execution once we've done desired actions in another session. (The same approach could be used to inject delays in other places, if there's a suitable hook available.) This commit includes a single test case, which is meant to exercise the previously-untestable ExecCreatePartitionPruneState code repaired by commit 7a980dfc6. We'd probably not bother with this if that were the only foreseen benefit, but I expect additional test cases will use this infrastructure in the future. Test module by Andy Fan, partition-addition test case by me. 
Discussion: https://postgr.es/m/20200802181131.GA27754@telsasoft.com --- src/test/modules/Makefile | 1 + src/test/modules/delay_execution/.gitignore | 3 + src/test/modules/delay_execution/Makefile | 21 ++++ .../modules/delay_execution/delay_execution.c | 104 ++++++++++++++++++ .../expected/partition-addition.out | 21 ++++ .../specs/partition-addition.spec | 38 +++++++ 6 files changed, 188 insertions(+) create mode 100644 src/test/modules/delay_execution/.gitignore create mode 100644 src/test/modules/delay_execution/Makefile create mode 100644 src/test/modules/delay_execution/delay_execution.c create mode 100644 src/test/modules/delay_execution/expected/partition-addition.out create mode 100644 src/test/modules/delay_execution/specs/partition-addition.spec diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 29de73c06062..1428529b041a 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global SUBDIRS = \ brin \ commit_ts \ + delay_execution \ dummy_index_am \ dummy_seclabel \ snapshot_too_old \ diff --git a/src/test/modules/delay_execution/.gitignore b/src/test/modules/delay_execution/.gitignore new file mode 100644 index 000000000000..ba2160b66ceb --- /dev/null +++ b/src/test/modules/delay_execution/.gitignore @@ -0,0 +1,3 @@ +# Generated subdirectories +/output_iso/ +/tmp_check_iso/ diff --git a/src/test/modules/delay_execution/Makefile b/src/test/modules/delay_execution/Makefile new file mode 100644 index 000000000000..f270aebf3a55 --- /dev/null +++ b/src/test/modules/delay_execution/Makefile @@ -0,0 +1,21 @@ +# src/test/modules/delay_execution/Makefile + +PGFILEDESC = "delay_execution - allow delay between parsing and execution" + +MODULE_big = delay_execution +OBJS = \ + $(WIN32RES) \ + delay_execution.o + +ISOLATION = partition-addition + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = 
src/test/modules/delay_execution +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/delay_execution/delay_execution.c b/src/test/modules/delay_execution/delay_execution.c new file mode 100644 index 000000000000..03ea23d0f266 --- /dev/null +++ b/src/test/modules/delay_execution/delay_execution.c @@ -0,0 +1,104 @@ +/*------------------------------------------------------------------------- + * + * delay_execution.c + * Test module to allow delay between parsing and execution of a query. + * + * The delay is implemented by taking and immediately releasing a specified + * advisory lock. If another process has previously taken that lock, the + * current process will be blocked until the lock is released; otherwise, + * there's no effect. This allows an isolationtester script to reliably + * test behaviors where some specified action happens in another backend + * between parsing and execution of any desired query. + * + * Copyright (c) 2020, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/delay_execution/delay_execution.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "optimizer/planner.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/inval.h" + + +PG_MODULE_MAGIC; + +/* GUC: advisory lock ID to use. Zero disables the feature. 
*/ +static int post_planning_lock_id = 0; + +/* Save previous planner hook user to be a good citizen */ +static planner_hook_type prev_planner_hook = NULL; + +/* Module load/unload functions */ +void _PG_init(void); +void _PG_fini(void); + + +/* planner_hook function to provide the desired delay */ +static PlannedStmt * +delay_execution_planner(Query *parse, const char *query_string, + int cursorOptions, ParamListInfo boundParams) +{ + PlannedStmt *result; + + /* Invoke the planner, possibly via a previous hook user */ + if (prev_planner_hook) + result = prev_planner_hook(parse, query_string, cursorOptions, + boundParams); + else + result = standard_planner(parse, query_string, cursorOptions, + boundParams); + + /* If enabled, delay by taking and releasing the specified lock */ + if (post_planning_lock_id != 0) + { + DirectFunctionCall1(pg_advisory_lock_int8, + Int64GetDatum((int64) post_planning_lock_id)); + DirectFunctionCall1(pg_advisory_unlock_int8, + Int64GetDatum((int64) post_planning_lock_id)); + + /* + * Ensure that we notice any pending invalidations, since the advisory + * lock functions don't do this. 
+ */ + AcceptInvalidationMessages(); + } + + return result; +} + +/* Module load function */ +void +_PG_init(void) +{ + /* Set up the GUC to control which lock is used */ + DefineCustomIntVariable("delay_execution.post_planning_lock_id", + "Sets the advisory lock ID to be locked/unlocked after planning.", + "Zero disables the delay.", + &post_planning_lock_id, + 0, + 0, INT_MAX, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + /* Install our hook */ + prev_planner_hook = planner_hook; + planner_hook = delay_execution_planner; +} + +/* Module unload function (pro forma, not used currently) */ +void +_PG_fini(void) +{ + planner_hook = prev_planner_hook; +} diff --git a/src/test/modules/delay_execution/expected/partition-addition.out b/src/test/modules/delay_execution/expected/partition-addition.out new file mode 100644 index 000000000000..7c91090eeff8 --- /dev/null +++ b/src/test/modules/delay_execution/expected/partition-addition.out @@ -0,0 +1,21 @@ +Parsed test spec with 2 sessions + +starting permutation: s2lock s1exec s2addp s2unlock +step s2lock: SELECT pg_advisory_lock(12345); +pg_advisory_lock + + +step s1exec: LOAD 'delay_execution'; + SET delay_execution.post_planning_lock_id = 12345; + SELECT * FROM foo WHERE a <> 1 AND a <> (SELECT 3); +step s2addp: CREATE TABLE foo2 (LIKE foo); + ALTER TABLE foo ATTACH PARTITION foo2 FOR VALUES IN (2); + INSERT INTO foo VALUES (2, 'ADD2'); +step s2unlock: SELECT pg_advisory_unlock(12345); +pg_advisory_unlock + +t +step s1exec: <... completed> +a b + +4 GHI diff --git a/src/test/modules/delay_execution/specs/partition-addition.spec b/src/test/modules/delay_execution/specs/partition-addition.spec new file mode 100644 index 000000000000..2a0948247e32 --- /dev/null +++ b/src/test/modules/delay_execution/specs/partition-addition.spec @@ -0,0 +1,38 @@ +# Test addition of a partition with less-than-exclusive locking. 
+ +setup +{ + CREATE TABLE foo (a int, b text) PARTITION BY LIST(a); + CREATE TABLE foo1 PARTITION OF foo FOR VALUES IN (1); + CREATE TABLE foo3 PARTITION OF foo FOR VALUES IN (3); + CREATE TABLE foo4 PARTITION OF foo FOR VALUES IN (4); + INSERT INTO foo VALUES (1, 'ABC'); + INSERT INTO foo VALUES (3, 'DEF'); + INSERT INTO foo VALUES (4, 'GHI'); +} + +teardown +{ + DROP TABLE foo; +} + +# The SELECT will be planned with just the three partitions shown above, +# of which we expect foo1 to be pruned at planning and foo3 at execution. +# Then we'll block, and by the time the query is actually executed, +# partition foo2 will also exist. We expect that not to be scanned. +# This test is specifically designed to check ExecCreatePartitionPruneState's +# code for matching up the partition lists in such cases. + +session "s1" +step "s1exec" { LOAD 'delay_execution'; + SET delay_execution.post_planning_lock_id = 12345; + SELECT * FROM foo WHERE a <> 1 AND a <> (SELECT 3); } + +session "s2" +step "s2lock" { SELECT pg_advisory_lock(12345); } +step "s2unlock" { SELECT pg_advisory_unlock(12345); } +step "s2addp" { CREATE TABLE foo2 (LIKE foo); + ALTER TABLE foo ATTACH PARTITION foo2 FOR VALUES IN (2); + INSERT INTO foo VALUES (2, 'ADD2'); } + +permutation "s2lock" "s1exec" "s2addp" "s2unlock" From cea3d55898655582e3a3835a7bed2c3a1b002fef Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Fri, 7 Aug 2020 17:24:40 -0400 Subject: [PATCH 266/334] Remove PROC_IN_ANALYZE and derived flags These flags are unused and always have been. 
Discussion: https://postgr.es/m/20200805235549.GA8118@alvherre.pgsql --- src/backend/commands/analyze.c | 13 +------------ src/include/storage/proc.h | 3 +-- src/include/storage/procarray.h | 7 ------- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 924ef37c8163..e0fa73ba7909 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -247,11 +247,8 @@ analyze_rel(Oid relid, RangeVar *relation, } /* - * OK, let's do it. First let other backends know I'm in ANALYZE. + * OK, let's do it. First, initialize progress reporting. */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyPgXact->vacuumFlags |= PROC_IN_ANALYZE; - LWLockRelease(ProcArrayLock); pgstat_progress_start_command(PROGRESS_COMMAND_ANALYZE, RelationGetRelid(onerel)); @@ -279,14 +276,6 @@ analyze_rel(Oid relid, RangeVar *relation, relation_close(onerel, NoLock); pgstat_progress_end_command(); - - /* - * Reset my PGXACT flag. Note: we need this here, and not in vacuum_rel, - * because the vacuum flag is cleared by the end-of-xact code. - */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyPgXact->vacuumFlags &= ~PROC_IN_ANALYZE; - LWLockRelease(ProcArrayLock); } /* diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index b20e2ad4f6aa..5ceb2494bae7 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -52,7 +52,6 @@ struct XidCache */ #define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? 
*/ #define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */ -#define PROC_IN_ANALYZE 0x04 /* currently running analyze */ #define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */ #define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical * decoding outside xact */ @@ -60,7 +59,7 @@ struct XidCache /* flags reset at EOXact */ #define PROC_VACUUM_STATE_MASK \ - (PROC_IN_VACUUM | PROC_IN_ANALYZE | PROC_VACUUM_FOR_WRAPAROUND) + (PROC_IN_VACUUM | PROC_VACUUM_FOR_WRAPAROUND) /* * We allow a small number of "weak" relation locks (AccessShareLock, diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index a5c7d0c0644a..01040d76e122 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -29,8 +29,6 @@ */ #define PROCARRAY_VACUUM_FLAG 0x02 /* currently running lazy * vacuum */ -#define PROCARRAY_ANALYZE_FLAG 0x04 /* currently running - * analyze */ #define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing logical * decoding outside xact */ @@ -42,7 +40,6 @@ * have no corresponding PROC flag equivalent. 
*/ #define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \ - PROCARRAY_ANALYZE_FLAG | \ PROCARRAY_LOGICAL_DECODING_FLAG) /* Use the following flags as an input "flags" to GetOldestXmin function */ @@ -50,10 +47,6 @@ #define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG /* Ignore vacuum backends */ #define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG -/* Ignore analyze backends */ -#define PROCARRAY_FLAGS_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_ANALYZE_FLAG -/* Ignore both vacuum and analyze backends */ -#define PROCARRAY_FLAGS_VACUUM_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG | PROCARRAY_ANALYZE_FLAG extern Size ProcArrayShmemSize(void); extern void CreateSharedProcArray(void); From 0a7d771f0f63eb120e7f0a60aecd543ab25ba197 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 7 Aug 2020 15:27:56 -0700 Subject: [PATCH 267/334] Make nbtree split REDO locking match original execution. Make the nbtree page split REDO routine consistent with original execution in its approach to acquiring and releasing buffer locks (at least for pages on the tree level of the page being split). This brings btree_xlog_split() in line with btree_xlog_unlink_page(), which was taught to couple buffer locks by commit 9a9db08a. Note that the precise order in which we both acquire and release sibling buffer locks in btree_xlog_split() now matches original execution exactly (the precise order in which the locks are released probably doesn't matter much, but we might as well be consistent about it). The rule for nbtree REDO routines from here on is that same-level locks should be acquired in an order that's consistent with original execution. It's not practical to have a similar rule for cross-level page locks, since for the most part original execution holds those locks for a period that spans multiple atomic actions/WAL records. 
It's also not necessary, because clearly the cross-level lock coupling is only truly needed during original execution because of the presence of concurrent inserters. This is not a bug fix (unlike the similar aforementioned commit, commit 9a9db08a). The immediate reason to tighten things up in this area is to enable an upcoming enhancement to contrib/amcheck that allows it to verify that sibling links are in agreement with only an AccessShareLock (this check produced false positives when run on a replica server on account of the inconsistency fixed by this commit). But that's not the only reason to be stricter here. It is generally useful to make locking on replicas be as close to what happens during original execution as practically possible. It makes it less likely that hard to catch bugs will slip in in the future. The previous state of affairs seems to be a holdover from before the introduction of Hot Standby, when buffer lock acquisitions during recovery were totally unnecessary. See also: commit 3bbf668d, which tightened things up in this area a few years after the introduction of Hot Standby. Discussion: https://postgr.es/m/CAH2-Wz=465cJj11YXD9RKH8z=nhQa2dofOZ_23h67EXUGOJ00Q@mail.gmail.com --- src/backend/access/nbtree/README | 23 +++--------- src/backend/access/nbtree/nbtxlog.c | 58 ++++++++++++++--------------- 2 files changed, 35 insertions(+), 46 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 9d5fc424a574..abce31a5a96b 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -572,23 +572,12 @@ replay of page deletion records does not hold a write lock on the target leaf page throughout; only the primary needs to block out concurrent writers that insert on to the page being deleted.) -There are also locking differences between the primary and WAL replay -for the first stage of a page split (i.e. same-level differences in -locking). 
Replay of the first phase of a page split can get away with -locking and updating the original right sibling page (which is also the -new right sibling page's right sibling) after locks on the original page -and its new right sibling have been released. Again, this is okay -because there are no writers. Page deletion WAL replay cannot get away -with being lax about same-level locking during replay, though -- doing -so risks confusing concurrent backwards scans. - -Page deletion's second phase locks the left sibling page, target page, -and right page in order on the standby, just like on the primary. This -allows backwards scans running on a standby to reason about page -deletion on the leaf level; a page cannot appear deleted without that -being reflected in the sibling pages. It's probably possible to be more -lax about how locks are acquired on the standby during the second phase -of page deletion, but that hardly seems worth it. +WAL replay holds same-level locks in a way that matches the approach +taken during original execution, though. This prevent readers from +observing same-level inconsistencies. It's probably possible to be more +lax about how same-level locks are acquired during recovery (most kinds +of readers could still move right to recover if we didn't couple +same-level locks), but we prefer to be conservative here. During recovery all index scans start with ignore_killed_tuples = false and we never set kill_prior_tuple. We do this because the oldest xmin diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 1fd639246328..dbec58d5249c 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -172,10 +172,10 @@ btree_xlog_insert(bool isleaf, bool ismeta, bool posting, * Insertion to an internal page finishes an incomplete split at the child * level. Clear the incomplete-split flag in the child. 
Note: during * normal operation, the child and parent pages are locked at the same - * time, so that clearing the flag and inserting the downlink appear - * atomic to other backends. We don't bother with that during replay, - * because readers don't care about the incomplete-split flag and there - * cannot be updates happening. + * time (the locks are coupled), so that clearing the flag and inserting + * the downlink appear atomic to other backends. We don't bother with + * that during replay, because readers don't care about the + * incomplete-split flag and there cannot be updates happening. */ if (!isleaf) _bt_clear_incomplete_split(record, 1); @@ -272,9 +272,17 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) spagenumber = P_NONE; /* - * Clear the incomplete split flag on the left sibling of the child page - * this is a downlink for. (Like in btree_xlog_insert, this can be done - * before locking the other pages) + * Clear the incomplete split flag on the appropriate child page one level + * down when origpage/buf is an internal page (there must have been + * cascading page splits during original execution in the event of an + * internal page split). This is like the corresponding btree_xlog_insert + * call for internal pages. We're not clearing the incomplete split flag + * for the current page split here (you can think of this as part of the + * insert of newitem that the page split action needs to perform in + * passing). + * + * Like in btree_xlog_insert, this can be done before locking other pages. + * We never need to couple cross-level locks in REDO routines. */ if (!isleaf) _bt_clear_incomplete_split(record, 3); @@ -427,22 +435,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) MarkBufferDirty(buf); } - /* - * We no longer need the buffers. They must be released together, so that - * readers cannot observe two inconsistent halves. 
- */ - if (BufferIsValid(buf)) - UnlockReleaseBuffer(buf); - UnlockReleaseBuffer(rbuf); - - /* - * Fix left-link of the page to the right of the new right sibling. - * - * Note: in normal operation, we do this while still holding lock on the - * two split pages. However, that's not necessary for correctness in WAL - * replay, because no other index update can be in progress, and readers - * will cope properly when following an obsolete left-link. - */ + /* Fix left-link of the page to the right of the new right sibling */ if (spagenumber != P_NONE) { Buffer sbuf; @@ -460,6 +453,14 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) if (BufferIsValid(sbuf)) UnlockReleaseBuffer(sbuf); } + + /* + * Finally, release the remaining buffers. sbuf, rbuf, and buf must be + * released together, so that readers cannot observe inconsistencies. + */ + UnlockReleaseBuffer(rbuf); + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); } static void @@ -733,6 +734,11 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) PageSetLSN(page, lsn); MarkBufferDirty(buffer); } + + /* + * Don't need to couple cross-level locks in REDO routines, so release + * lock on internal page immediately + */ if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); @@ -789,12 +795,6 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) * the pages in the same standard left-to-right order (leftsib, target, * rightsib), and don't release the sibling locks until the target is * marked deleted. - * - * btree_xlog_split() can get away with fixing its right sibling page's - * left link last of all, after dropping all other locks. We prefer to - * avoid dropping locks on same-level pages early compared to normal - * operation. This keeps things simple for backwards scans. See - * nbtree/README. 
*/ /* Fix right-link of left sibling, if any */ From 7259736a6e5b7c7588fff9578370736a6648acbb Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Sat, 8 Aug 2020 07:34:39 +0530 Subject: [PATCH 268/334] Implement streaming mode in ReorderBuffer. Instead of serializing the transaction to disk after reaching the logical_decoding_work_mem limit in memory, we consume the changes we have in memory and invoke stream API methods added by commit 45fdc9738b. However, sometimes if we have incomplete toast or speculative insert we spill to the disk because we can't generate the complete tuple and stream. And, as soon as we get the complete tuple we stream the transaction including the serialized changes. We can do this incremental processing thanks to having assignments (associating subxact with toplevel xacts) in WAL right away, and thanks to logging the invalidation messages at each command end. These features are added by commits 0bead9af48 and c55040ccd0 respectively. Now that we can stream in-progress transactions, the concurrent aborts may cause failures when the output plugin consults catalogs (both system and user-defined). We handle such failures by returning ERRCODE_TRANSACTION_ROLLBACK sqlerrcode from system table scan APIs to the backend or WALSender decoding a specific uncommitted transaction. The decoding logic on the receipt of such a sqlerrcode aborts the decoding of the current transaction and continue with the decoding of other transactions. We have ReorderBufferTXN pointer in each ReorderBufferChange by which we know which xact it belongs to. The output plugin can use this to decide which changes to discard in case of stream_abort_cb (e.g. when a subxact gets discarded). We also provide a new option via SQL APIs to fetch the changes being streamed. 
Author: Dilip Kumar, Tomas Vondra, Amit Kapila, Nikhil Sontakke Reviewed-by: Amit Kapila, Kuntal Ghosh, Ajin Cherian Tested-by: Neha Sharma, Mahendra Singh Thalor and Ajin Cherian Discussion: https://postgr.es/m/688b0b7f-2f6c-d827-c27b-216a8e3ea700@2ndquadrant.com --- contrib/test_decoding/Makefile | 2 +- contrib/test_decoding/expected/stream.out | 94 ++ contrib/test_decoding/expected/truncate.out | 6 + contrib/test_decoding/sql/stream.sql | 30 + contrib/test_decoding/sql/truncate.sql | 1 + contrib/test_decoding/test_decoding.c | 13 + doc/src/sgml/logicaldecoding.sgml | 9 +- doc/src/sgml/test-decoding.sgml | 22 + src/backend/access/heap/heapam.c | 13 + src/backend/access/heap/heapam_visibility.c | 42 +- src/backend/access/index/genam.c | 53 + src/backend/access/table/tableam.c | 8 + src/backend/access/transam/xact.c | 19 + src/backend/replication/logical/decode.c | 17 +- src/backend/replication/logical/logical.c | 10 + .../replication/logical/reorderbuffer.c | 981 ++++++++++++++++-- src/include/access/heapam_xlog.h | 1 + src/include/access/tableam.h | 55 + src/include/access/xact.h | 4 + src/include/replication/logical.h | 1 + src/include/replication/reorderbuffer.h | 56 +- 21 files changed, 1331 insertions(+), 106 deletions(-) create mode 100644 contrib/test_decoding/expected/stream.out create mode 100644 contrib/test_decoding/sql/stream.sql diff --git a/contrib/test_decoding/Makefile b/contrib/test_decoding/Makefile index f439c582a5f9..ed9a3d6c0ede 100644 --- a/contrib/test_decoding/Makefile +++ b/contrib/test_decoding/Makefile @@ -5,7 +5,7 @@ PGFILEDESC = "test_decoding - example of a logical decoding output plugin" REGRESS = ddl xact rewrite toast permissions decoding_in_xact \ decoding_into_rel binary prepared replorigin time messages \ - spill slot truncate + spill slot truncate stream ISOLATION = mxact delayed_startup ondisk_startup concurrent_ddl_dml \ oldest_xmin snapshot_transfer subxact_without_top diff --git a/contrib/test_decoding/expected/stream.out 
b/contrib/test_decoding/expected/stream.out new file mode 100644 index 000000000000..9a5d7e7c4399 --- /dev/null +++ b/contrib/test_decoding/expected/stream.out @@ -0,0 +1,94 @@ +SET synchronous_commit = on; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +CREATE TABLE stream_test(data text); +-- consume DDL +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + data +------ +(0 rows) + +-- streaming test with sub-transaction +BEGIN; +savepoint s1; +SELECT 'msg5' FROM pg_logical_emit_message(true, 'test', repeat('a', 50)); + ?column? +---------- + msg5 +(1 row) + +INSERT INTO stream_test SELECT repeat('a', 2000) || g.i FROM generate_series(1, 35) g(i); +TRUNCATE table stream_test; +rollback to s1; +INSERT INTO stream_test SELECT repeat('a', 10) || g.i FROM generate_series(1, 20) g(i); +COMMIT; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); + data +---------------------------------------------------------- + opening a streamed block for transaction + streaming message: transactional: 1 prefix: test, sz: 50 + closing a streamed block for transaction + aborting streamed (sub)transaction + opening a streamed block for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for 
transaction + streaming change for transaction + streaming change for transaction + closing a streamed block for transaction + committing streamed transaction +(27 rows) + +-- streaming test for toast changes +ALTER TABLE stream_test ALTER COLUMN data set storage external; +-- consume DDL +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + data +------ +(0 rows) + +INSERT INTO stream_test SELECT repeat('a', 6000) || g.i FROM generate_series(1, 10) g(i); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); + data +------------------------------------------ + opening a streamed block for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + closing a streamed block for transaction + committing streamed transaction +(13 rows) + +DROP TABLE stream_test; +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + diff --git a/contrib/test_decoding/expected/truncate.out b/contrib/test_decoding/expected/truncate.out index 1cf2ae835c84..e64d377214ab 100644 --- a/contrib/test_decoding/expected/truncate.out +++ b/contrib/test_decoding/expected/truncate.out @@ -25,3 +25,9 @@ SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'inc COMMIT (9 rows) +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + diff --git a/contrib/test_decoding/sql/stream.sql b/contrib/test_decoding/sql/stream.sql new file mode 100644 index 000000000000..8abc30de0afc --- /dev/null +++ 
b/contrib/test_decoding/sql/stream.sql @@ -0,0 +1,30 @@ +SET synchronous_commit = on; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + +CREATE TABLE stream_test(data text); + +-- consume DDL +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + +-- streaming test with sub-transaction +BEGIN; +savepoint s1; +SELECT 'msg5' FROM pg_logical_emit_message(true, 'test', repeat('a', 50)); +INSERT INTO stream_test SELECT repeat('a', 2000) || g.i FROM generate_series(1, 35) g(i); +TRUNCATE table stream_test; +rollback to s1; +INSERT INTO stream_test SELECT repeat('a', 10) || g.i FROM generate_series(1, 20) g(i); +COMMIT; + +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); + +-- streaming test for toast changes +ALTER TABLE stream_test ALTER COLUMN data set storage external; +-- consume DDL +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + +INSERT INTO stream_test SELECT repeat('a', 6000) || g.i FROM generate_series(1, 10) g(i); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); + +DROP TABLE stream_test; +SELECT pg_drop_replication_slot('regression_slot'); diff --git a/contrib/test_decoding/sql/truncate.sql b/contrib/test_decoding/sql/truncate.sql index 5aecdf0881f5..5633854e0dfc 100644 --- a/contrib/test_decoding/sql/truncate.sql +++ b/contrib/test_decoding/sql/truncate.sql @@ -11,3 +11,4 @@ TRUNCATE tab1, tab1 RESTART IDENTITY CASCADE; TRUNCATE tab1, tab2; SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); +SELECT pg_drop_replication_slot('regression_slot'); diff --git a/contrib/test_decoding/test_decoding.c b/contrib/test_decoding/test_decoding.c index 
dbef52a3af47..34745150e9ba 100644 --- a/contrib/test_decoding/test_decoding.c +++ b/contrib/test_decoding/test_decoding.c @@ -122,6 +122,7 @@ pg_decode_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, { ListCell *option; TestDecodingData *data; + bool enable_streaming = false; data = palloc0(sizeof(TestDecodingData)); data->context = AllocSetContextCreate(ctx->context, @@ -212,6 +213,16 @@ pg_decode_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, errmsg("could not parse value \"%s\" for parameter \"%s\"", strVal(elem->arg), elem->defname))); } + else if (strcmp(elem->defname, "stream-changes") == 0) + { + if (elem->arg == NULL) + continue; + else if (!parse_bool(strVal(elem->arg), &enable_streaming)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not parse value \"%s\" for parameter \"%s\"", + strVal(elem->arg), elem->defname))); + } else { ereport(ERROR, @@ -221,6 +232,8 @@ pg_decode_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, elem->arg ? strVal(elem->arg) : "(null)"))); } } + + ctx->streaming &= enable_streaming; } /* cleanup this plugin's resources */ diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml index 791a62b57c9b..1571d71a5b6c 100644 --- a/doc/src/sgml/logicaldecoding.sgml +++ b/doc/src/sgml/logicaldecoding.sgml @@ -433,9 +433,12 @@ typedef void (*LogicalOutputPluginInit) (struct OutputPluginCallbacks *cb); ALTER TABLE user_catalog_table SET (user_catalog_table = true); CREATE TABLE another_catalog_table(data text) WITH (user_catalog_table = true); - Any actions leading to transaction ID assignment are prohibited. That, among others, - includes writing to tables, performing DDL changes, and - calling pg_current_xact_id(). + Note that access to user catalog tables or regular system catalog tables + in the output plugins has to be done via the systable_* + scan APIs only. Access via the heap_* scan APIs will + error out. 
Additionally, any actions leading to transaction ID assignment + are prohibited. That, among others, includes writing to tables, performing + DDL changes, and calling pg_current_xact_id(). diff --git a/doc/src/sgml/test-decoding.sgml b/doc/src/sgml/test-decoding.sgml index 8356a3d67b31..fe7c9783facd 100644 --- a/doc/src/sgml/test-decoding.sgml +++ b/doc/src/sgml/test-decoding.sgml @@ -39,4 +39,26 @@ postgres=# SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'i + + We can also get the changes of the in-progress transaction and the typical + output, might be: + + +postgres[33712]=#* SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'stream-changes', '1'); + lsn | xid | data +-----------+-----+-------------------------------------------------- + 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503 + 0/16B21F8 | 503 | streaming change for TXN 503 + 0/16B2300 | 503 | streaming change for TXN 503 + 0/16B2408 | 503 | streaming change for TXN 503 + 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503 + 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503 + 0/16BECA8 | 503 | streaming change for TXN 503 + 0/16BEDB0 | 503 | streaming change for TXN 503 + 0/16BEEB8 | 503 | streaming change for TXN 503 + 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503 +(10 rows) + + + diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 5eef225f5c79..00169006fb1f 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1299,6 +1299,16 @@ heap_getnext(TableScanDesc sscan, ScanDirection direction) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg_internal("only heap AM is supported"))); + /* + * We don't expect direct calls to heap_getnext with valid CheckXidAlive + * for catalog or regular tables. See detailed comments in xact.c where + * these variables are declared. 
Normally we have such a check at tableam + * level API but this is called from many places so we need to ensure it + * here. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected heap_getnext call during logical decoding"); + /* Note: no locking manipulations needed */ if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE) @@ -1956,6 +1966,9 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, { xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; bufflags |= REGBUF_KEEP_DATA; + + if (IsToastRelation(relation)) + xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION; } XLogBeginInsert(); diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index dba10890aabe..c77128087cf7 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -1571,8 +1571,25 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, htup, buffer, &cmin, &cmax); + /* + * If we haven't resolved the combocid to cmin/cmax, that means we + * have not decoded the combocid yet. That means the cmin is + * definitely in the future, and we're not supposed to see the tuple + * yet. + * + * XXX This only applies to decoding of in-progress transactions. In + * regular logical decoding we only execute this code at commit time, + * at which point we should have seen all relevant combocids. So + * ideally, we should error out in this case but in practice, this + * won't happen. If we are too worried about this then we can add an + * elog inside ResolveCminCmaxDuringDecoding. + * + * XXX For the streaming case, we can track the largest combocid + * assigned, and error out based on this (when unable to resolve + * combocid below that observed maximum value). 
+ */ if (!resolved) - elog(ERROR, "could not resolve cmin/cmax of catalog tuple"); + return false; Assert(cmin != InvalidCommandId); @@ -1642,10 +1659,25 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, htup, buffer, &cmin, &cmax); - if (!resolved) - elog(ERROR, "could not resolve combocid to cmax"); - - Assert(cmax != InvalidCommandId); + /* + * If we haven't resolved the combocid to cmin/cmax, that means we + * have not decoded the combocid yet. That means the cmax is + * definitely in the future, and we're still supposed to see the + * tuple. + * + * XXX This only applies to decoding of in-progress transactions. In + * regular logical decoding we only execute this code at commit time, + * at which point we should have seen all relevant combocids. So + * ideally, we should error out in this case but in practice, this + * won't happen. If we are too worried about this then we can add an + * elog inside ResolveCminCmaxDuringDecoding. + * + * XXX For the streaming case, we can track the largest combocid + * assigned, and error out based on this (when unable to resolve + * combocid below that observed maximum value). + */ + if (!resolved || cmax == InvalidCommandId) + return true; if (cmax >= snapshot->curcid) return true; /* deleted after scan started */ diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index dfba5ae39ae9..e3164e674a7b 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -28,6 +28,7 @@ #include "lib/stringinfo.h" #include "miscadmin.h" #include "storage/bufmgr.h" +#include "storage/procarray.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/lsyscache.h" @@ -429,9 +430,36 @@ systable_beginscan(Relation heapRelation, sysscan->iscan = NULL; } + /* + * If CheckXidAlive is set then set a flag to indicate that system table + * scan is in-progress. See detailed comments in xact.c where these + * variables are declared. 
+ */ + if (TransactionIdIsValid(CheckXidAlive)) + bsysscan = true; + return sysscan; } +/* + * HandleConcurrentAbort - Handle concurrent abort of the CheckXidAlive. + * + * Error out, if CheckXidAlive is aborted. We can't directly use + * TransactionIdDidAbort as after crash such transaction might not have been + * marked as aborted. See detailed comments in xact.c where the variable + * is declared. + */ +static inline void +HandleConcurrentAbort() +{ + if (TransactionIdIsValid(CheckXidAlive) && + !TransactionIdIsInProgress(CheckXidAlive) && + !TransactionIdDidCommit(CheckXidAlive)) + ereport(ERROR, + (errcode(ERRCODE_TRANSACTION_ROLLBACK), + errmsg("transaction aborted during system catalog scan"))); +} + /* * systable_getnext --- get next tuple in a heap-or-index scan * @@ -481,6 +509,12 @@ systable_getnext(SysScanDesc sysscan) } } + /* + * Handle the concurrent abort while fetching the catalog tuple during + * logical streaming of a transaction. + */ + HandleConcurrentAbort(); + return htup; } @@ -517,6 +551,12 @@ systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup) sysscan->slot, freshsnap); + /* + * Handle the concurrent abort while fetching the catalog tuple during + * logical streaming of a transaction. + */ + HandleConcurrentAbort(); + return result; } @@ -545,6 +585,13 @@ systable_endscan(SysScanDesc sysscan) if (sysscan->snapshot) UnregisterSnapshot(sysscan->snapshot); + /* + * Reset the bsysscan flag at the end of the systable scan. See + * detailed comments in xact.c where these variables are declared. + */ + if (TransactionIdIsValid(CheckXidAlive)) + bsysscan = false; + pfree(sysscan); } @@ -643,6 +690,12 @@ systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction) if (htup && sysscan->iscan->xs_recheck) elog(ERROR, "system catalog scans with lossy index conditions are not implemented"); + /* + * Handle the concurrent abort while fetching the catalog tuple during + * logical streaming of a transaction. 
+ */ + HandleConcurrentAbort(); + return htup; } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 3afb63b1fe4d..c63831976575 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -248,6 +248,14 @@ table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid) Relation rel = scan->rs_rd; const TableAmRoutine *tableam = rel->rd_tableam; + /* + * We don't expect direct calls to table_tuple_get_latest_tid with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding"); + /* * Since this can be called with user-supplied TID, don't trust the input * too much. diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index d4f7c29847f4..727d61603593 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -82,6 +82,19 @@ bool XactDeferrable; int synchronous_commit = SYNCHRONOUS_COMMIT_ON; +/* + * CheckXidAlive is a xid value pointing to a possibly ongoing (sub) + * transaction. Currently, it is used in logical decoding. It's possible + * that such transactions can get aborted while the decoding is ongoing in + * which case we skip decoding that particular transaction. To ensure that we + * check whether the CheckXidAlive is aborted after fetching the tuple from + * system tables. We also ensure that during logical decoding we never + * directly access the tableam or heap APIs because we are checking for the + * concurrent aborts only in systable_* APIs. 
+ */ +TransactionId CheckXidAlive = InvalidTransactionId; +bool bsysscan = false; + /* * When running as a parallel worker, we place only a single * TransactionStateData on the parallel worker's state stack, and the XID @@ -2680,6 +2693,9 @@ AbortTransaction(void) /* Forget about any active REINDEX. */ ResetReindexState(s->nestingLevel); + /* Reset logical streaming state. */ + ResetLogicalStreamingState(); + /* If in parallel mode, clean up workers and exit parallel mode. */ if (IsInParallelMode()) { @@ -4982,6 +4998,9 @@ AbortSubTransaction(void) /* Forget about any active REINDEX. */ ResetReindexState(s->nestingLevel); + /* Reset logical streaming state. */ + ResetLogicalStreamingState(); + /* Exit from parallel mode, if necessary. */ if (IsInParallelMode()) { diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index f3a1c31a2921..f21f61d5e10b 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -724,7 +724,9 @@ DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); } /* @@ -791,7 +793,8 @@ DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); } /* @@ -848,7 +851,8 @@ DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); } /* @@ -884,7 +888,7 @@ 
DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) memcpy(change->data.truncate.relids, xlrec->relids, xlrec->nrelids * sizeof(Oid)); ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), - buf->origptr, change); + buf->origptr, change, false); } /* @@ -984,7 +988,7 @@ DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = false; ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), - buf->origptr, change); + buf->origptr, change, false); /* move to the next xl_multi_insert_tuple entry */ data += datalen; @@ -1022,7 +1026,8 @@ DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); } diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 05d24b93da02..42f284b33f6b 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -1442,3 +1442,13 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn) SpinLockRelease(&MyReplicationSlot->mutex); } } + +/* + * Clear logical streaming state during (sub)transaction abort. 
+ */ +void +ResetLogicalStreamingState(void) +{ + CheckXidAlive = InvalidTransactionId; + bsysscan = false; +} diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index ce6e62152f03..5b7afe6d9e9c 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -178,6 +178,21 @@ typedef struct ReorderBufferDiskChange /* data follows */ } ReorderBufferDiskChange; +#define IsSpecInsert(action) \ +( \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \ +) +#define IsSpecConfirm(action) \ +( \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) \ +) +#define IsInsertOrUpdate(action) \ +( \ + (((action) == REORDER_BUFFER_CHANGE_INSERT) || \ + ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \ +) + /* * Maximum number of changes kept in memory, per transaction. After that, * changes are spooled to disk. @@ -236,6 +251,7 @@ static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, char *change); static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); static void ReorderBufferCleanupSerializedTXNs(const char *slotname); static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid, XLogSegNo segno); @@ -244,6 +260,16 @@ static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap); static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, ReorderBufferTXN *txn, CommandId cid); +/* + * --------------------------------------- + * Streaming support functions + * --------------------------------------- + */ +static inline bool ReorderBufferCanStream(ReorderBuffer *rb); +static inline bool 
ReorderBufferCanStartStreaming(ReorderBuffer *rb); +static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn); + /* --------------------------------------- * toast reassembly support * --------------------------------------- @@ -367,6 +393,9 @@ ReorderBufferGetTXN(ReorderBuffer *rb) dlist_init(&txn->tuplecids); dlist_init(&txn->subtxns); + /* InvalidCommandId is not zero, so set it explicitly */ + txn->command_id = InvalidCommandId; + return txn; } @@ -416,13 +445,15 @@ ReorderBufferGetChange(ReorderBuffer *rb) } /* - * Free an ReorderBufferChange. + * Free a ReorderBufferChange and update memory accounting, if requested. */ void -ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change) +ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change, + bool upd_mem) { /* update memory accounting info */ - ReorderBufferChangeMemoryUpdate(rb, change, false); + if (upd_mem) + ReorderBufferChangeMemoryUpdate(rb, change, false); /* free contained data */ switch (change->action) @@ -624,16 +655,102 @@ ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create, } /* - * Queue a change into a transaction so it can be replayed upon commit. + * Record the partial change for the streaming of in-progress transactions. We + * can stream only complete changes so if we have a partial change like toast + * table insert or speculative insert then we mark such a 'txn' so that it + * can't be streamed. We also ensure that if the changes in such a 'txn' are + * above logical_decoding_work_mem threshold then we stream them as soon as we + * have a complete change. + */ +static void +ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, + bool toast_insert) +{ + ReorderBufferTXN *toptxn; + + /* + * The partial changes need to be processed only while streaming + * in-progress transactions. 
+ */ + if (!ReorderBufferCanStream(rb)) + return; + + /* Get the top transaction. */ + if (txn->toptxn != NULL) + toptxn = txn->toptxn; + else + toptxn = txn; + + /* + * Set the toast insert bit whenever we get toast insert to indicate a + * partial change and clear it when we get the insert or update on main + * table (Both update and insert will do the insert in the toast table). + */ + if (toast_insert) + toptxn->txn_flags |= RBTXN_HAS_TOAST_INSERT; + else if (rbtxn_has_toast_insert(toptxn) && + IsInsertOrUpdate(change->action)) + toptxn->txn_flags &= ~RBTXN_HAS_TOAST_INSERT; + + /* + * Set the spec insert bit whenever we get the speculative insert to + * indicate the partial change and clear the same on speculative confirm. + */ + if (IsSpecInsert(change->action)) + toptxn->txn_flags |= RBTXN_HAS_SPEC_INSERT; + else if (IsSpecConfirm(change->action)) + { + /* + * Speculative confirm change must be preceded by speculative + * insertion. + */ + Assert(rbtxn_has_spec_insert(toptxn)); + toptxn->txn_flags &= ~RBTXN_HAS_SPEC_INSERT; + } + + /* + * Stream the transaction if it is serialized before and the changes are + * now complete in the top-level transaction. + * + * The reason for doing the streaming of such a transaction as soon as we + * get the complete change for it is that previously it would have reached + * the memory threshold and wouldn't get streamed because of incomplete + * changes. Delaying such transactions would increase apply lag for them. + */ + if (ReorderBufferCanStartStreaming(rb) && + !(rbtxn_has_incomplete_tuple(toptxn)) && + rbtxn_is_serialized(txn)) + ReorderBufferStreamTXN(rb, toptxn); +} + +/* + * Queue a change into a transaction so it can be replayed upon commit or will be + * streamed when we reach logical_decoding_work_mem threshold. 
*/ void ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, - ReorderBufferChange *change) + ReorderBufferChange *change, bool toast_insert) { ReorderBufferTXN *txn; txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + /* + * While streaming the previous changes we have detected that the + * transaction is aborted. So there is no point in collecting further + * changes for it. + */ + if (txn->concurrent_abort) + { + /* + * We don't need to update memory accounting for this change as we + * have not added it to the queue yet. + */ + ReorderBufferReturnChange(rb, change, false); + return; + } + change->lsn = lsn; change->txn = txn; @@ -645,6 +762,9 @@ ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, /* update memory accounting information */ ReorderBufferChangeMemoryUpdate(rb, change, true); + /* process partial change */ + ReorderBufferProcessPartialChange(rb, txn, change, toast_insert); + /* check the memory limits and evict something if needed */ ReorderBufferCheckMemoryLimit(rb); } @@ -674,7 +794,7 @@ ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, change->data.msg.message = palloc(message_size); memcpy(change->data.msg.message, message, message_size); - ReorderBufferQueueChange(rb, xid, lsn, change); + ReorderBufferQueueChange(rb, xid, lsn, change, false); MemoryContextSwitchTo(oldcontext); } @@ -763,6 +883,38 @@ AssertTXNLsnOrder(ReorderBuffer *rb) #endif } +/* + * AssertChangeLsnOrder + * + * Check ordering of changes in the (sub)transaction. 
+ */ +static void +AssertChangeLsnOrder(ReorderBufferTXN *txn) +{ +#ifdef USE_ASSERT_CHECKING + dlist_iter iter; + XLogRecPtr prev_lsn = txn->first_lsn; + + dlist_foreach(iter, &txn->changes) + { + ReorderBufferChange *cur_change; + + cur_change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(cur_change->lsn != InvalidXLogRecPtr); + Assert(txn->first_lsn <= cur_change->lsn); + + if (txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_change->lsn <= txn->end_lsn); + + Assert(prev_lsn <= cur_change->lsn); + + prev_lsn = cur_change->lsn; + } +#endif +} + /* * ReorderBufferGetOldestTXN * Return oldest transaction in reorderbuffer @@ -1018,6 +1170,9 @@ ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, *iter_state = NULL; + /* Check ordering of changes in the toplevel transaction. */ + AssertChangeLsnOrder(txn); + /* * Calculate the size of our heap: one element for every transaction that * contains changes. (Besides the transactions already in the reorder @@ -1032,6 +1187,9 @@ ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + /* Check ordering of changes in this subtransaction. 
*/ + AssertChangeLsnOrder(cur_txn); + if (cur_txn->nentries > 0) nr_txns++; } @@ -1148,7 +1306,7 @@ ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state) { change = dlist_container(ReorderBufferChange, node, dlist_pop_head_node(&state->old_change)); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); Assert(dlist_is_empty(&state->old_change)); } @@ -1234,7 +1392,7 @@ ReorderBufferIterTXNFinish(ReorderBuffer *rb, change = dlist_container(ReorderBufferChange, node, dlist_pop_head_node(&state->old_change)); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); Assert(dlist_is_empty(&state->old_change)); } @@ -1280,7 +1438,7 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) /* Check we're not mixing changes from different transactions. */ Assert(change->txn == txn); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); } /* @@ -1297,7 +1455,7 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) Assert(change->txn == txn); Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); } /* @@ -1309,6 +1467,15 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) dlist_delete(&txn->base_snapshot_node); } + /* + * Cleanup the snapshot for the last streamed run. + */ + if (txn->snapshot_now != NULL) + { + Assert(rbtxn_is_streamed(txn)); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + } + /* * Remove TXN from its containing list. * @@ -1334,6 +1501,91 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) ReorderBufferReturnTXN(rb, txn); } +/* + * Discard changes from a transaction (and subtransactions), after streaming + * them. Keep the remaining info - transactions, tuplecids, invalidations and + * snapshots. 
+ */ +static void +ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_mutable_iter iter; + + /* cleanup subtransactions & their changes */ + dlist_foreach_modify(iter, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* + * Subtransactions are always associated to the toplevel TXN, even if + * they originally were happening inside another subtxn, so we won't + * ever recurse more than one level deep here. + */ + Assert(rbtxn_is_known_subxact(subtxn)); + Assert(subtxn->nsubtxns == 0); + + ReorderBufferTruncateTXN(rb, subtxn); + } + + /* cleanup changes in the toplevel txn */ + dlist_foreach_modify(iter, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + + /* remove the change from it's containing list */ + dlist_delete(&change->node); + + ReorderBufferReturnChange(rb, change, true); + } + + /* + * Mark the transaction as streamed. + * + * The toplevel transaction, identified by (toptxn==NULL), is marked as + * streamed always, even if it does not contain any changes (that is, when + * all the changes are in subtransactions). + * + * For subtransactions, we only mark them as streamed when there are + * changes in them. + * + * We do it this way because of aborts - we don't want to send aborts for + * XIDs the downstream is not aware of. And of course, it always knows + * about the toplevel xact (we send the XID in all messages), but we never + * stream XIDs of empty subxacts. + */ + if ((!txn->toptxn) || (txn->nentries_mem != 0)) + txn->txn_flags |= RBTXN_IS_STREAMED; + + /* + * Destroy the (relfilenode, ctid) hashtable, so that we don't leak any + * memory. We could also keep the hash table and update it with new ctid + * values, but this seems simpler and good enough for now. 
+ */ + if (txn->tuplecid_hash != NULL) + { + hash_destroy(txn->tuplecid_hash); + txn->tuplecid_hash = NULL; + } + + /* If this txn is serialized then clean the disk space. */ + if (rbtxn_is_serialized(txn)) + { + ReorderBufferRestoreCleanup(rb, txn); + txn->txn_flags &= ~RBTXN_IS_SERIALIZED; + } + + /* also reset the number of entries in the transaction */ + txn->nentries_mem = 0; + txn->nentries = 0; +} + /* * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by * HeapTupleSatisfiesHistoricMVCC. @@ -1485,57 +1737,191 @@ ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap) } /* - * Perform the replay of a transaction and its non-aborted subtransactions. - * - * Subtransactions previously have to be processed by - * ReorderBufferCommitChild(), even if previously assigned to the toplevel - * transaction with ReorderBufferAssignChild. - * - * We currently can only decode a transaction's contents when its commit - * record is read because that's the only place where we know about cache - * invalidations. Thus, once a toplevel commit is read, we iterate over the top - * and subtransactions (using a k-way merge) and replay the changes in lsn - * order. + * If the transaction was (partially) streamed, we need to commit it in a + * 'streamed' way. That is, we first stream the remaining part of the + * transaction, and then invoke stream_commit message. 
*/ -void -ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, - XLogRecPtr commit_lsn, XLogRecPtr end_lsn, - TimestampTz commit_time, - RepOriginId origin_id, XLogRecPtr origin_lsn) +static void +ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn) { - ReorderBufferTXN *txn; - volatile Snapshot snapshot_now; - volatile CommandId command_id = FirstCommandId; - bool using_subtxn; - ReorderBufferIterTXNState *volatile iterstate = NULL; + /* we should only call this for previously streamed transactions */ + Assert(rbtxn_is_streamed(txn)); - txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, - false); + ReorderBufferStreamTXN(rb, txn); - /* unknown transaction, nothing to replay */ - if (txn == NULL) - return; + rb->stream_commit(rb, txn, txn->final_lsn); - txn->final_lsn = commit_lsn; - txn->end_lsn = end_lsn; - txn->commit_time = commit_time; - txn->origin_id = origin_id; - txn->origin_lsn = origin_lsn; + ReorderBufferCleanupTXN(rb, txn); +} +/* + * Set xid to detect concurrent aborts. + * + * While streaming an in-progress transaction there is a possibility that the + * (sub)transaction might get aborted concurrently. In such case if the + * (sub)transaction has catalog update then we might decode the tuple using + * wrong catalog version. For example, suppose there is one catalog tuple with + * (xmin: 500, xmax: 0). Now, the transaction 501 updates the catalog tuple + * and after that we will have two tuples (xmin: 500, xmax: 501) and + * (xmin: 501, xmax: 0). Now, if 501 is aborted and some other transaction + * say 502 updates the same catalog tuple then the first tuple will be changed + * to (xmin: 500, xmax: 502). So, the problem is that when we try to decode + * the tuple inserted/updated in 501 after the catalog update, we will see the + * catalog tuple with (xmin: 500, xmax: 502) as visible because it will + * consider that the tuple is deleted by xid 502 which is not visible to our + * snapshot. 
And when we will try to decode with that catalog tuple, it can + * lead to a wrong result or a crash. So, it is necessary to detect + * concurrent aborts to allow streaming of in-progress transactions. + * + * For detecting the concurrent abort we set CheckXidAlive to the current + * (sub)transaction's xid for which this change belongs to. And, during + * catalog scan we can check the status of the xid and if it is aborted we will + * report a specific error so that we can stop streaming current transaction + * and discard the already streamed changes on such an error. We might have + * already streamed some of the changes for the aborted (sub)transaction, but + * that is fine because when we decode the abort we will stream abort message + * to truncate the changes in the subscriber. + */ +static inline void +SetupCheckXidLive(TransactionId xid) +{ /* - * If this transaction has no snapshot, it didn't make any changes to the - * database, so there's nothing to decode. Note that - * ReorderBufferCommitChild will have transferred any snapshots from - * subtransactions if there were any. + * If the input transaction id is already set as a CheckXidAlive then + * nothing to do. */ - if (txn->base_snapshot == NULL) - { - Assert(txn->ninvalidations == 0); - ReorderBufferCleanupTXN(rb, txn); + if (TransactionIdEquals(CheckXidAlive, xid)) return; + + /* + * setup CheckXidAlive if it's not committed yet. We don't check if the + * xid is aborted. That will happen during catalog access. + */ + if (!TransactionIdDidCommit(xid)) + CheckXidAlive = xid; + else + CheckXidAlive = InvalidTransactionId; +} + +/* + * Helper function for ReorderBufferProcessTXN for applying change. 
+ */ +static inline void +ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change, + bool streaming) +{ + if (streaming) + rb->stream_change(rb, txn, relation, change); + else + rb->apply_change(rb, txn, relation, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the truncate. + */ +static inline void +ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn, + int nrelations, Relation *relations, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_truncate(rb, txn, nrelations, relations, change); + else + rb->apply_truncate(rb, txn, nrelations, relations, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the message. + */ +static inline void +ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); + else + rb->message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); +} + +/* + * Function to store the command id and snapshot at the end of the current + * stream so that we can reuse the same while sending the next stream. + */ +static inline void +ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, CommandId command_id) +{ + txn->command_id = command_id; + + /* Avoid copying if it's already copied. */ + if (snapshot_now->copied) + txn->snapshot_now = snapshot_now; + else + txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); +} + +/* + * Helper function for ReorderBufferProcessTXN to handle the concurrent + * abort of the streaming transaction. This resets the TXN such that it + * can be used to stream the remaining data of transaction being processed. 
+ */ +static void +ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, + CommandId command_id, + XLogRecPtr last_lsn, + ReorderBufferChange *specinsert) +{ + /* Discard the changes that we just streamed */ + ReorderBufferTruncateTXN(rb, txn); + + /* Free all resources allocated for toast reconstruction */ + ReorderBufferToastReset(rb, txn); + + /* Return the spec insert change if it is not NULL */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; } - snapshot_now = txn->base_snapshot; + /* Stop the stream. */ + rb->stream_stop(rb, txn, last_lsn); + + /* Remember the command ID and snapshot for the streaming run */ + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); +} + +/* + * Helper function for ReorderBufferCommit and ReorderBufferStreamTXN. + * + * Send data of a transaction (and its subtransactions) to the + * output plugin. We iterate over the top and subtransactions (using a k-way + * merge) and replay the changes in lsn order. + * + * If streaming is true then data will be sent using stream API. 
+ */ +static void +ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn, + volatile Snapshot snapshot_now, + volatile CommandId command_id, + bool streaming) +{ + bool using_subtxn; + MemoryContext ccxt = CurrentMemoryContext; + ReorderBufferIterTXNState *volatile iterstate = NULL; + volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr; + ReorderBufferChange *volatile specinsert = NULL; + volatile bool stream_started = false; + ReorderBufferTXN *volatile curtxn = NULL; /* build data to be able to lookup the CommandIds of catalog tuples */ ReorderBufferBuildTupleCidHash(rb, txn); @@ -1558,14 +1944,15 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, PG_TRY(); { ReorderBufferChange *change; - ReorderBufferChange *specinsert = NULL; if (using_subtxn) - BeginInternalSubTransaction("replay"); + BeginInternalSubTransaction(streaming ? "stream" : "replay"); else StartTransactionCommand(); - rb->begin(rb, txn); + /* We only need to send begin/commit for non-streamed transactions. */ + if (!streaming) + rb->begin(rb, txn); ReorderBufferIterTXNInit(rb, txn, &iterstate); while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL) @@ -1573,6 +1960,36 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, Relation relation = NULL; Oid reloid; + /* + * We can't call start stream callback before processing first + * change. + */ + if (prev_lsn == InvalidXLogRecPtr) + { + if (streaming) + { + txn->origin_id = change->origin_id; + rb->stream_start(rb, txn, change->lsn); + stream_started = true; + } + } + + /* + * Enforce correct ordering of changes, merged from multiple + * subtransactions. The changes may have the same LSN due to + * MULTI_INSERT xlog records. + */ + Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn); + + prev_lsn = change->lsn; + + /* Set the current xid to detect concurrent aborts. 
*/ + if (streaming) + { + curtxn = change->txn; + SetupCheckXidLive(curtxn->xid); + } + switch (change->action) { case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: @@ -1649,7 +2066,8 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, if (!IsToastRelation(relation)) { ReorderBufferToastReplace(rb, txn, relation, change); - rb->apply_change(rb, txn, relation, change); + ReorderBufferApplyChange(rb, txn, relation, change, + streaming); /* * Only clear reassembled toast chunks if we're sure @@ -1685,11 +2103,11 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, */ if (specinsert != NULL) { - ReorderBufferReturnChange(rb, specinsert); + ReorderBufferReturnChange(rb, specinsert, true); specinsert = NULL; } - if (relation != NULL) + if (RelationIsValid(relation)) { RelationClose(relation); relation = NULL; @@ -1714,7 +2132,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, /* clear out a pending (and thus failed) speculation */ if (specinsert != NULL) { - ReorderBufferReturnChange(rb, specinsert); + ReorderBufferReturnChange(rb, specinsert, true); specinsert = NULL; } @@ -1747,7 +2165,10 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, relations[nrelations++] = relation; } - rb->apply_truncate(rb, txn, nrelations, relations, change); + /* Apply the truncate. 
*/ + ReorderBufferApplyTruncate(rb, txn, nrelations, + relations, change, + streaming); for (i = 0; i < nrelations; i++) RelationClose(relations[i]); @@ -1756,10 +2177,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, } case REORDER_BUFFER_CHANGE_MESSAGE: - rb->message(rb, txn, change->lsn, true, - change->data.msg.prefix, - change->data.msg.message_size, - change->data.msg.message); + ReorderBufferApplyMessage(rb, txn, change, streaming); break; case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: @@ -1790,7 +2208,6 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, snapshot_now = change->data.snapshot; } - /* and continue with the new one */ SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); break; @@ -1837,7 +2254,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, */ if (specinsert) { - ReorderBufferReturnChange(rb, specinsert); + ReorderBufferReturnChange(rb, specinsert, true); specinsert = NULL; } @@ -1845,14 +2262,35 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, ReorderBufferIterTXNFinish(rb, iterstate); iterstate = NULL; - /* call commit callback */ - rb->commit(rb, txn, commit_lsn); + /* + * Done with current changes, send the last message for this set of + * changes depending upon streaming mode. + */ + if (streaming) + { + if (stream_started) + { + rb->stream_stop(rb, txn, prev_lsn); + stream_started = false; + } + } + else + rb->commit(rb, txn, commit_lsn); /* this is just a sanity check against bad output plugin behaviour */ if (GetCurrentTransactionIdIfAny() != InvalidTransactionId) elog(ERROR, "output plugin used XID %u", GetCurrentTransactionId()); + /* + * Remember the command ID and snapshot for the next set of changes in + * streaming mode. 
+ */ + if (streaming) + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); + else if (snapshot_now->copied) + ReorderBufferFreeSnap(rb, snapshot_now); + /* cleanup */ TeardownHistoricSnapshot(false); @@ -1870,14 +2308,27 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); - if (snapshot_now->copied) - ReorderBufferFreeSnap(rb, snapshot_now); + /* + * If we are streaming the in-progress transaction then discard the + * changes that we just streamed, and mark the transactions as + * streamed (if they contained changes). Otherwise, remove all the + * changes and deallocate the ReorderBufferTXN. + */ + if (streaming) + { + ReorderBufferTruncateTXN(rb, txn); - /* remove potential on-disk data, and deallocate */ - ReorderBufferCleanupTXN(rb, txn); + /* Reset the CheckXidAlive */ + CheckXidAlive = InvalidTransactionId; + } + else + ReorderBufferCleanupTXN(rb, txn); } PG_CATCH(); { + MemoryContext ecxt = MemoryContextSwitchTo(ccxt); + ErrorData *errdata = CopyErrorData(); + /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */ if (iterstate) ReorderBufferIterTXNFinish(rb, iterstate); @@ -1896,15 +2347,106 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); - if (snapshot_now->copied) - ReorderBufferFreeSnap(rb, snapshot_now); + /* + * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent + * abort of the (sub)transaction we are streaming. We need to do the + * cleanup and return gracefully on this error, see SetupCheckXidLive. + */ + if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK) + { + /* + * This error can only occur when we are sending the data in + * streaming mode and the streaming is not finished yet. + */ + Assert(streaming); + Assert(stream_started); + + /* Cleanup the temporary error state. 
*/ + FlushErrorState(); + FreeErrorData(errdata); + errdata = NULL; + curtxn->concurrent_abort = true; + + /* Reset the TXN so that it is allowed to stream remaining data. */ + ReorderBufferResetTXN(rb, txn, snapshot_now, + command_id, prev_lsn, + specinsert); + } + else + { + ReorderBufferCleanupTXN(rb, txn); + MemoryContextSwitchTo(ecxt); + PG_RE_THROW(); + } + } + PG_END_TRY(); +} - /* remove potential on-disk data, and deallocate */ - ReorderBufferCleanupTXN(rb, txn); +/* + * Perform the replay of a transaction and its non-aborted subtransactions. + * + * Subtransactions previously have to be processed by + * ReorderBufferCommitChild(), even if previously assigned to the toplevel + * transaction with ReorderBufferAssignChild. + * + * This interface is called once a toplevel commit is read for both streamed + * as well as non-streamed transactions. + */ +void +ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + ReorderBufferTXN *txn; + Snapshot snapshot_now; + CommandId command_id = FirstCommandId; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); - PG_RE_THROW(); + /* unknown transaction, nothing to replay */ + if (txn == NULL) + return; + + txn->final_lsn = commit_lsn; + txn->end_lsn = end_lsn; + txn->commit_time = commit_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + /* + * If the transaction was (partially) streamed, we need to commit it in a + * 'streamed' way. That is, we first stream the remaining part of the + * transaction, and then invoke stream_commit message. + * + * Called after everything (origin ID, LSN, ...) is stored in the + * transaction to avoid passing that information directly. 
+ */ + if (rbtxn_is_streamed(txn)) + { + ReorderBufferStreamCommit(rb, txn); + return; } - PG_END_TRY(); + + /* + * If this transaction has no snapshot, it didn't make any changes to the + * database, so there's nothing to decode. Note that + * ReorderBufferCommitChild will have transferred any snapshots from + * subtransactions if there were any. + */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + ReorderBufferCleanupTXN(rb, txn); + return; + } + + snapshot_now = txn->base_snapshot; + + /* Process and send the changes to output plugin. */ + ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now, + command_id, false); } /* @@ -1931,6 +2473,22 @@ ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) if (txn == NULL) return; + /* For streamed transactions notify the remote node about the abort. */ + if (rbtxn_is_streamed(txn)) + { + rb->stream_abort(rb, txn, lsn); + + /* + * We might have decoded changes for this transaction that could load + * the cache as per the current transaction's view (consider DDL's + * happened in this transaction). We don't want the decoding of future + * transactions to use those cache entries so execute invalidations. + */ + if (txn->ninvalidations > 0) + ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, + txn->invalidations); + } + /* cosmetic... */ txn->final_lsn = lsn; @@ -2000,6 +2558,10 @@ ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) if (txn == NULL) return; + /* For streamed transactions notify the remote node about the abort. */ + if (rbtxn_is_streamed(txn)) + rb->stream_abort(rb, txn, lsn); + /* cosmetic... 
*/ txn->final_lsn = lsn; @@ -2082,7 +2644,7 @@ ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid, change->data.snapshot = snap; change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT; - ReorderBufferQueueChange(rb, xid, lsn, change); + ReorderBufferQueueChange(rb, xid, lsn, change, false); } /* @@ -2131,12 +2693,21 @@ ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, change->data.command_id = cid; change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID; - ReorderBufferQueueChange(rb, xid, lsn, change); + ReorderBufferQueueChange(rb, xid, lsn, change, false); } /* - * Update the memory accounting info. We track memory used by the whole - * reorder buffer and the transaction containing the change. + * Update memory counters to account for the new or removed change. + * + * We update two counters - in the reorder buffer, and in the transaction + * containing the change. The reorder buffer counter allows us to quickly + * decide if we reached the memory limit, the transaction counter allows + * us to quickly pick the largest transaction for eviction. + * + * When streaming is enabled, we need to update the toplevel transaction + * counters instead - we don't really care about subtransactions as we + * can't stream them individually anyway, and we only pick toplevel + * transactions for eviction. So only toplevel transactions matter. */ static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, @@ -2144,6 +2715,8 @@ ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, bool addition) { Size sz; + ReorderBufferTXN *txn; + ReorderBufferTXN *toptxn = NULL; Assert(change->txn); @@ -2155,19 +2728,41 @@ ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, if (change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID) return; + txn = change->txn; + + /* If streaming supported, update the total size in top level as well. 
*/ + if (ReorderBufferCanStream(rb)) + { + if (txn->toptxn != NULL) + toptxn = txn->toptxn; + else + toptxn = txn; + } + sz = ReorderBufferChangeSize(change); if (addition) { - change->txn->size += sz; + txn->size += sz; rb->size += sz; + + /* Update the total size in the top transaction. */ + if (toptxn) + toptxn->total_size += sz; } else { - Assert((rb->size >= sz) && (change->txn->size >= sz)); - change->txn->size -= sz; + Assert((rb->size >= sz) && (txn->size >= sz)); + txn->size -= sz; rb->size -= sz; + + /* Update the total size in the top transaction. */ + if (toptxn) + toptxn->total_size -= sz; } + + Assert(txn->size <= rb->size); + Assert((txn->size >= 0) && (rb->size >= 0)); } /* @@ -2387,6 +2982,51 @@ ReorderBufferLargestTXN(ReorderBuffer *rb) return largest; } +/* + * Find the largest toplevel transaction to evict (by streaming). + * + * This can be seen as an optimized version of ReorderBufferLargestTXN, which + * should give us the same transaction (because we don't update memory account + * for subtransaction with streaming, so it's always 0). But we can simply + * iterate over the limited number of toplevel transactions. + * + * Note that, we skip transactions that contains incomplete changes. There + * is a scope of optimization here such that we can select the largest transaction + * which has complete changes. But that will make the code and design quite complex + * and that might not be worth the benefit. If we plan to stream the transactions + * that contains incomplete changes then we need to find a way to partially + * stream/truncate the transaction changes in-memory and build a mechanism to + * partially truncate the spilled files. Additionally, whenever we partially + * stream the transaction we need to maintain the last streamed lsn and next time + * we need to restore from that segment and the offset in WAL. 
As we stream the + * changes from the top transaction and restore them subtransaction wise, we need + * to even remember the subxact from where we streamed the last change. + */ +static ReorderBufferTXN * +ReorderBufferLargestTopTXN(ReorderBuffer *rb) +{ + dlist_iter iter; + Size largest_size = 0; + ReorderBufferTXN *largest = NULL; + + /* Find the largest top-level transaction. */ + dlist_foreach(iter, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *txn; + + txn = dlist_container(ReorderBufferTXN, node, iter.cur); + + if ((largest != NULL || txn->total_size > largest_size) && + (txn->total_size > 0) && !(rbtxn_has_incomplete_tuple(txn))) + { + largest = txn; + largest_size = txn->total_size; + } + } + + return largest; +} + /* * Check whether the logical_decoding_work_mem limit was reached, and if yes * pick the largest (sub)transaction at-a-time to evict and spill its changes to @@ -2419,11 +3059,33 @@ ReorderBufferCheckMemoryLimit(ReorderBuffer *rb) { /* * Pick the largest transaction (or subtransaction) and evict it from - * memory by serializing it to disk. + * memory by streaming, if possible. Otherwise, spill to disk. */ - txn = ReorderBufferLargestTXN(rb); + if (ReorderBufferCanStartStreaming(rb) && + (txn = ReorderBufferLargestTopTXN(rb)) != NULL) + { + /* we know there has to be one, because the size is not zero */ + Assert(txn && !txn->toptxn); + Assert(txn->total_size > 0); + Assert(rb->size >= txn->total_size); - ReorderBufferSerializeTXN(rb, txn); + ReorderBufferStreamTXN(rb, txn); + } + else + { + /* + * Pick the largest transaction (or subtransaction) and evict it + * from memory by serializing it to disk. 
+ */ + txn = ReorderBufferLargestTXN(rb); + + /* we know there has to be one, because the size is not zero */ + Assert(txn); + Assert(txn->size > 0); + Assert(rb->size >= txn->size); + + ReorderBufferSerializeTXN(rb, txn); + } /* * After eviction, the transaction should have no entries in memory, @@ -2501,7 +3163,7 @@ ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) ReorderBufferSerializeChange(rb, txn, fd, change); dlist_delete(&change->node); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); spilled++; } @@ -2713,6 +3375,136 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, Assert(ondisk->change.action == change->action); } +/* Returns true, if the output plugin supports streaming, false, otherwise. */ +static inline bool +ReorderBufferCanStream(ReorderBuffer *rb) +{ + LogicalDecodingContext *ctx = rb->private_data; + + return ctx->streaming; +} + +/* Returns true, if the streaming can be started now, false, otherwise. */ +static inline bool +ReorderBufferCanStartStreaming(ReorderBuffer *rb) +{ + LogicalDecodingContext *ctx = rb->private_data; + SnapBuild *builder = ctx->snapshot_builder; + + /* + * We can't start streaming immediately even if the streaming is enabled + * because we previously decoded this transaction and now just are + * restarting. + */ + if (ReorderBufferCanStream(rb) && + !SnapBuildXactNeedsSkip(builder, ctx->reader->EndRecPtr)) + { + /* We must have a consistent snapshot by this time */ + Assert(SnapBuildCurrentState(builder) == SNAPBUILD_CONSISTENT); + return true; + } + + return false; +} + +/* + * Send data of a large transaction (and its subtransactions) to the + * output plugin, but using the stream API. + */ +static void +ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + Snapshot snapshot_now; + CommandId command_id; + + /* We can never reach here for a subtransaction. 
*/ + Assert(txn->toptxn == NULL); + + /* + * We can't make any assumptions about base snapshot here, similar to what + * ReorderBufferCommit() does. That relies on base_snapshot getting + * transferred from subxact in ReorderBufferCommitChild(), but that was + * not yet called as the transaction is in-progress. + * + * So just walk the subxacts and use the same logic here. But we only need + * to do that once, when the transaction is streamed for the first time. + * After that we need to reuse the snapshot from the previous run. + * + * Unlike DecodeCommit which adds xids of all the subtransactions in + * snapshot's xip array via SnapBuildCommittedTxn, we can't do that here + * but we do add them to subxip array instead via ReorderBufferCopySnap. + * This allows the catalog changes made in subtransactions decoded till + * now to be visible. + */ + if (txn->snapshot_now == NULL) + { + dlist_iter subxact_i; + + /* make sure this transaction is streamed for the first time */ + Assert(!rbtxn_is_streamed(txn)); + + /* at the beginning we should have invalid command ID */ + Assert(txn->command_id == InvalidCommandId); + + dlist_foreach(subxact_i, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur); + ReorderBufferTransferSnapToParent(txn, subtxn); + } + + /* + * If this transaction has no snapshot, it didn't make any changes to + * the database till now, so there's nothing to decode. + */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + return; + } + + command_id = FirstCommandId; + snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot, + txn, command_id); + } + else + { + /* the transaction must have been already streamed */ + Assert(rbtxn_is_streamed(txn)); + + /* + * Nah, we already have snapshot from the previous streaming run. 
We + * assume new subxacts can't move the LSN backwards, and so can't beat + * the LSN condition in the previous branch (so no need to walk + * through subxacts again). In fact, we must not do that as we may be + * using snapshot half-way through the subxact. + */ + command_id = txn->command_id; + + /* + * We can't use txn->snapshot_now directly because after the last + * streaming run, we might have got some new sub-transactions. So we + * need to add them to the snapshot. + */ + snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now, + txn, command_id); + + /* Free the previously copied snapshot. */ + Assert(txn->snapshot_now->copied); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + txn->snapshot_now = NULL; + } + + /* Process and send the changes to output plugin. */ + ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now, + command_id, true); + + Assert(dlist_is_empty(&txn->changes)); + Assert(txn->nentries == 0); + Assert(txn->nentries_mem == 0); +} + /* * Size of a change in memory. */ @@ -2813,7 +3605,7 @@ ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, dlist_container(ReorderBufferChange, node, cleanup_iter.cur); dlist_delete(&cleanup->node); - ReorderBufferReturnChange(rb, cleanup); + ReorderBufferReturnChange(rb, cleanup, true); } txn->nentries_mem = 0; Assert(dlist_is_empty(&txn->changes)); @@ -3522,7 +4314,7 @@ ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn) dlist_container(ReorderBufferChange, node, it.cur); dlist_delete(&change->node); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); } } @@ -3812,6 +4604,17 @@ ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data, BlockNumber blockno; bool updated_mapping = false; + /* + * Return unresolved if tuplecid_data is not valid. That's because when + * streaming in-progress transactions we may run into tuples with the CID + * before actually decoding them. Think e.g. 
about INSERT followed by + * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the + * INSERT. So in such cases, we assume the CID is from the future + * command. + */ + if (tuplecid_data == NULL) + return false; + /* be careful about padding */ memset(&key, 0, sizeof(key)); diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 95d18cdb12e7..aa17f7df84d4 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -67,6 +67,7 @@ #define XLH_INSERT_LAST_IN_MULTI (1<<1) #define XLH_INSERT_IS_SPECULATIVE (1<<2) #define XLH_INSERT_CONTAINS_NEW_TUPLE (1<<3) +#define XLH_INSERT_ON_TOAST_RELATION (1<<4) /* * xl_heap_update flag values, 8 bits are available. diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 7ba72c84e021..387eb34a61a3 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -19,6 +19,7 @@ #include "access/relscan.h" #include "access/sdir.h" +#include "access/xact.h" #include "utils/guc.h" #include "utils/rel.h" #include "utils/snapshot.h" @@ -903,6 +904,15 @@ static inline bool table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot) { slot->tts_tableOid = RelationGetRelid(sscan->rs_rd); + + /* + * We don't expect direct calls to table_scan_getnextslot with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_getnextslot call during logical decoding"); + return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot); } @@ -1017,6 +1027,13 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan, TupleTableSlot *slot, bool *call_again, bool *all_dead) { + /* + * We don't expect direct calls to table_index_fetch_tuple with valid + * CheckXidAlive for catalog or regular tables. 
See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding"); return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot, slot, call_again, @@ -1056,6 +1073,14 @@ table_tuple_fetch_row_version(Relation rel, Snapshot snapshot, TupleTableSlot *slot) { + /* + * We don't expect direct calls to table_tuple_fetch_row_version with + * valid CheckXidAlive for catalog or regular tables. See detailed + * comments in xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_tuple_fetch_row_version call during logical decoding"); + return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot); } @@ -1713,6 +1738,14 @@ static inline bool table_scan_bitmap_next_block(TableScanDesc scan, struct TBMIterateResult *tbmres) { + /* + * We don't expect direct calls to table_scan_bitmap_next_block with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding"); + return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, tbmres); } @@ -1730,6 +1763,14 @@ table_scan_bitmap_next_tuple(TableScanDesc scan, struct TBMIterateResult *tbmres, TupleTableSlot *slot) { + /* + * We don't expect direct calls to table_scan_bitmap_next_tuple with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. 
+ */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding"); + return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan, tbmres, slot); @@ -1748,6 +1789,13 @@ static inline bool table_scan_sample_next_block(TableScanDesc scan, struct SampleScanState *scanstate) { + /* + * We don't expect direct calls to table_scan_sample_next_block with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_sample_next_block call during logical decoding"); return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate); } @@ -1764,6 +1812,13 @@ table_scan_sample_next_tuple(TableScanDesc scan, struct SampleScanState *scanstate, TupleTableSlot *slot) { + /* + * We don't expect direct calls to table_scan_sample_next_tuple with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_sample_next_tuple call during logical decoding"); return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate, slot); } diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 53480116a462..c18554bae2c2 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -81,6 +81,10 @@ typedef enum /* Synchronous commit level */ extern int synchronous_commit; +/* used during logical streaming of a transaction */ +extern TransactionId CheckXidAlive; +extern bool bsysscan; + /* * Miscellaneous flag bits to record events which occur on the top level * transaction. 
These flags are only persisted in MyXactFlags and are intended diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index deef31825d6e..b0fae9808bf6 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -121,5 +121,6 @@ extern void LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, extern void LogicalConfirmReceivedLocation(XLogRecPtr lsn); extern bool filter_by_origin_cb_wrapper(LogicalDecodingContext *ctx, RepOriginId origin_id); +extern void ResetLogicalStreamingState(void); #endif diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h index 42bc81764873..1ae17d5f11fd 100644 --- a/src/include/replication/reorderbuffer.h +++ b/src/include/replication/reorderbuffer.h @@ -162,6 +162,9 @@ typedef struct ReorderBufferChange #define RBTXN_HAS_CATALOG_CHANGES 0x0001 #define RBTXN_IS_SUBXACT 0x0002 #define RBTXN_IS_SERIALIZED 0x0004 +#define RBTXN_IS_STREAMED 0x0008 +#define RBTXN_HAS_TOAST_INSERT 0x0010 +#define RBTXN_HAS_SPEC_INSERT 0x0020 /* Does the transaction have catalog changes? */ #define rbtxn_has_catalog_changes(txn) \ @@ -181,6 +184,40 @@ typedef struct ReorderBufferChange ((txn)->txn_flags & RBTXN_IS_SERIALIZED) != 0 \ ) +/* This transaction's changes has toast insert, without main table insert. */ +#define rbtxn_has_toast_insert(txn) \ +( \ + ((txn)->txn_flags & RBTXN_HAS_TOAST_INSERT) != 0 \ +) +/* + * This transaction's changes has speculative insert, without speculative + * confirm. + */ +#define rbtxn_has_spec_insert(txn) \ +( \ + ((txn)->txn_flags & RBTXN_HAS_SPEC_INSERT) != 0 \ +) + +/* Check whether this transaction has an incomplete change. */ +#define rbtxn_has_incomplete_tuple(txn) \ +( \ + rbtxn_has_toast_insert(txn) || rbtxn_has_spec_insert(txn) \ +) + +/* + * Has this transaction been streamed to downstream? + * + * (It's not possible to deduce this from nentries and nentries_mem for + * various reasons. 
For example, all changes may be in subtransactions in + * which case we'd have nentries==0 for the toplevel one, which would say + * nothing about the streaming. So we maintain this flag, but only for the + * toplevel transaction.) + */ +#define rbtxn_is_streamed(txn) \ +( \ + ((txn)->txn_flags & RBTXN_IS_STREAMED) != 0 \ +) + typedef struct ReorderBufferTXN { /* See above */ @@ -248,6 +285,13 @@ typedef struct ReorderBufferTXN XLogRecPtr base_snapshot_lsn; dlist_node base_snapshot_node; /* link in txns_by_base_snapshot_lsn */ + /* + * Snapshot/CID from the previous streaming run. Only valid for already + * streamed transactions (NULL/InvalidCommandId otherwise). + */ + Snapshot snapshot_now; + CommandId command_id; + /* * How many ReorderBufferChange's do we have in this txn. * @@ -313,6 +357,12 @@ typedef struct ReorderBufferTXN * Size of this transaction (changes currently in memory, in bytes). */ Size size; + + /* Size of top-transaction including sub-transactions. */ + Size total_size; + + /* If we have detected concurrent abort then ignore future changes. 
*/ + bool concurrent_abort; } ReorderBufferTXN; /* so we can define the callbacks used inside struct ReorderBuffer itself */ @@ -484,12 +534,14 @@ void ReorderBufferFree(ReorderBuffer *); ReorderBufferTupleBuf *ReorderBufferGetTupleBuf(ReorderBuffer *, Size tuple_len); void ReorderBufferReturnTupleBuf(ReorderBuffer *, ReorderBufferTupleBuf *tuple); ReorderBufferChange *ReorderBufferGetChange(ReorderBuffer *); -void ReorderBufferReturnChange(ReorderBuffer *, ReorderBufferChange *); +void ReorderBufferReturnChange(ReorderBuffer *, ReorderBufferChange *, bool); Oid *ReorderBufferGetRelids(ReorderBuffer *, int nrelids); void ReorderBufferReturnRelids(ReorderBuffer *, Oid *relids); -void ReorderBufferQueueChange(ReorderBuffer *, TransactionId, XLogRecPtr lsn, ReorderBufferChange *); +void ReorderBufferQueueChange(ReorderBuffer *, TransactionId, + XLogRecPtr lsn, ReorderBufferChange *, + bool toast_insert); void ReorderBufferQueueMessage(ReorderBuffer *, TransactionId, Snapshot snapshot, XLogRecPtr lsn, bool transactional, const char *prefix, Size message_size, const char *message); From a13421c96c0e8ffa34310f92d03d0e6f3bfa27f8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sat, 8 Aug 2020 07:31:52 +0200 Subject: [PATCH 269/334] Add some const decorations --- src/backend/replication/logical/logical.c | 6 +++--- src/include/replication/logical.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 42f284b33f6b..f5eb6bc3aff2 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -82,7 +82,7 @@ static void stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *tx static void stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, int nrelations, Relation relations[], ReorderBufferChange *change); -static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin); +static 
void LoadOutputPlugin(OutputPluginCallbacks *callbacks, const char *plugin); /* * Make sure the current settings & environment are capable of doing logical @@ -277,7 +277,7 @@ StartupDecodingContext(List *output_plugin_options, * startup function. */ LogicalDecodingContext * -CreateInitDecodingContext(char *plugin, +CreateInitDecodingContext(const char *plugin, List *output_plugin_options, bool need_full_snapshot, XLogRecPtr restart_lsn, @@ -612,7 +612,7 @@ OutputPluginUpdateProgress(struct LogicalDecodingContext *ctx) * that it provides the required callbacks. */ static void -LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin) +LoadOutputPlugin(OutputPluginCallbacks *callbacks, const char *plugin) { LogicalOutputPluginInit plugin_init; diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index b0fae9808bf6..45abc444b7a5 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -96,7 +96,7 @@ typedef struct LogicalDecodingContext extern void CheckLogicalDecodingRequirements(void); -extern LogicalDecodingContext *CreateInitDecodingContext(char *plugin, +extern LogicalDecodingContext *CreateInitDecodingContext(const char *plugin, List *output_plugin_options, bool need_full_snapshot, XLogRecPtr restart_lsn, From 82a0ba7707e010a29f5fe1a0020d963c82b8f1cb Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Sat, 8 Aug 2020 12:13:18 +0530 Subject: [PATCH 270/334] Fix the logical streaming test. Commit 7259736a6e added the capability to stream changes in ReorderBuffer which has some tests to test the streaming mode. It is quite possible that while this test is running a parallel transaction could be logged by autovacuum. Such a transaction won't perform any insert/update/delete to non-catalog tables so will be shown as an empty transaction. Fix it by skipping the empty transactions during this test. Per report by buildfarm. 
--- contrib/test_decoding/expected/stream.out | 4 ++-- contrib/test_decoding/sql/stream.sql | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/test_decoding/expected/stream.out b/contrib/test_decoding/expected/stream.out index 9a5d7e7c4399..d7e32f818546 100644 --- a/contrib/test_decoding/expected/stream.out +++ b/contrib/test_decoding/expected/stream.out @@ -26,7 +26,7 @@ TRUNCATE table stream_test; rollback to s1; INSERT INTO stream_test SELECT repeat('a', 10) || g.i FROM generate_series(1, 20) g(i); COMMIT; -SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'skip-empty-xacts', '1', 'stream-changes', '1'); data ---------------------------------------------------------- opening a streamed block for transaction @@ -67,7 +67,7 @@ SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'inc (0 rows) INSERT INTO stream_test SELECT repeat('a', 6000) || g.i FROM generate_series(1, 10) g(i); -SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'skip-empty-xacts', '1', 'stream-changes', '1'); data ------------------------------------------ opening a streamed block for transaction diff --git a/contrib/test_decoding/sql/stream.sql b/contrib/test_decoding/sql/stream.sql index 8abc30de0afc..ce86c816d11f 100644 --- a/contrib/test_decoding/sql/stream.sql +++ b/contrib/test_decoding/sql/stream.sql @@ -16,7 +16,7 @@ rollback to s1; INSERT INTO stream_test SELECT repeat('a', 10) || g.i FROM generate_series(1, 20) g(i); COMMIT; -SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', 
NULL,NULL, 'include-xids', '0', 'skip-empty-xacts', '1', 'stream-changes', '1'); -- streaming test for toast changes ALTER TABLE stream_test ALTER COLUMN data set storage external; @@ -24,7 +24,7 @@ ALTER TABLE stream_test ALTER COLUMN data set storage external; SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); INSERT INTO stream_test SELECT repeat('a', 6000) || g.i FROM generate_series(1, 10) g(i); -SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'skip-empty-xacts', '1', 'stream-changes', '1'); DROP TABLE stream_test; SELECT pg_drop_replication_slot('regression_slot'); From 470687b4a5bb3b9f2b5bf7c9235680b3c91bd050 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Sat, 8 Aug 2020 12:31:55 -0400 Subject: [PATCH 271/334] walsnd: Don't set waiting_for_ping_response spuriously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ashutosh Bapat noticed that when logical walsender needs to wait for WAL, and it realizes that it must send a keepalive message to walreceiver to update the sent-LSN, which *does not* request a reply from walreceiver, it wrongly sets the flag that it's going to wait for that reply. That means that any future would-be sender of feedback messages ends up not sending a feedback message, because they all believe that a reply is expected. With built-in logical replication there's not much harm in this, because WalReceiverMain will send a ping-back every wal_receiver_timeout/2 anyway; but with other logical replication systems (e.g. pglogical) it can cause significant pain. 
This problem was introduced in commit 41d5f8ad734, where the request-reply flag was changed from true to false to WalSndKeepalive, without at the same time removing the line that sets waiting_for_ping_response. Just removing that line would be a sufficient fix, but it seems better to shift the responsibility of setting the flag to WalSndKeepalive itself instead of requiring caller to do it; this is clearly less error-prone. Author: Álvaro Herrera Reported-by: Ashutosh Bapat Backpatch: 9.5 and up Discussion: https://postgr.es/m/20200806225558.GA22401@alvherre.pgsql --- src/backend/replication/walsender.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 5e2210dd7bdc..d13220c14008 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -151,7 +151,7 @@ static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr; * How far have we sent WAL already? This is also advertised in * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.) */ -static XLogRecPtr sentPtr = 0; +static XLogRecPtr sentPtr = InvalidXLogRecPtr; /* Buffers for constructing outgoing messages and processing reply messages. */ static StringInfoData output_message; @@ -1451,10 +1451,7 @@ WalSndWaitForWal(XLogRecPtr loc) if (MyWalSnd->flush < sentPtr && MyWalSnd->write < sentPtr && !waiting_for_ping_response) - { WalSndKeepalive(false); - waiting_for_ping_response = true; - } /* check whether we're done */ if (loc <= RecentFlushPtr) @@ -2932,10 +2929,7 @@ WalSndDone(WalSndSendDataCallback send_data) proc_exit(0); } if (!waiting_for_ping_response) - { WalSndKeepalive(true); - waiting_for_ping_response = true; - } } /* @@ -3432,10 +3426,13 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS) } /* - * This function is used to send a keepalive message to standby. 
- * If requestReply is set, sets a flag in the message requesting the standby - * to send a message back to us, for heartbeat purposes. - */ + * Send a keepalive message to standby. + * + * If requestReply is set, the message requests the other party to send + * a message back to us, for heartbeat purposes. We also set a flag to + * let nearby code know that we're waiting for that response, to avoid + * repeated requests. + */ static void WalSndKeepalive(bool requestReply) { @@ -3450,6 +3447,10 @@ WalSndKeepalive(bool requestReply) /* ... and send it wrapped in CopyData */ pq_putmessage_noblock('d', output_message.data, output_message.len); + + /* Set local flag */ + if (requestReply) + waiting_for_ping_response = true; } /* @@ -3480,7 +3481,6 @@ WalSndKeepaliveIfNecessary(void) if (last_processing >= ping_time) { WalSndKeepalive(true); - waiting_for_ping_response = true; /* Try to flush pending output to the client */ if (pq_flush_if_writable() != 0) From 39132b784aeaaacf5ddfb5c35b6e29a6926f4345 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 8 Aug 2020 11:12:01 -0700 Subject: [PATCH 272/334] Teach amcheck to verify sibling links in all cases. Teach contrib/amcheck's bt_index_check() function to check agreement between siblings links. The left sibling's right link should point to a right sibling page whose left link points back to the same original left sibling. This extends a check that bt_index_parent_check() always performed to bt_index_check(). This is the first time amcheck has been taught to perform buffer lock coupling, which we have explicitly avoided up until now. The sibling link check tends to catch a lot of real world index corruption with little overhead, so it seems worth accepting the complexity. Note that the new lock coupling logic would not work correctly on replica servers without the changes made by commits 0a7d771f and 9a9db08a (there could be false positives without those changes). 
Author: Andrey Borodin, Peter Geoghegan Discussion: https://postgr.es/m/0EB0CFA8-CBD8-4296-8049-A2C0F28FAE8C@yandex-team.ru --- contrib/amcheck/verify_nbtree.c | 173 +++++++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 23 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index b87a3cb4717c..635ece73b354 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -145,6 +145,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel, bool rootdescend); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); +static void bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent); static void bt_target_page_check(BtreeCheckState *state); static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, @@ -787,17 +790,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) */ } - /* - * readonly mode can only ever land on live pages and half-dead pages, - * so sibling pointers should always be in mutual agreement - */ - if (state->readonly && opaque->btpo_prev != leftcurrent) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("left link/right link pair in index \"%s\" not in agreement", - RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u left block=%u left link from block=%u.", - current, leftcurrent, opaque->btpo_prev))); + /* Sibling links should be in mutual agreement */ + if (opaque->btpo_prev != leftcurrent) + bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent); /* Check level, which must be valid for non-ignorable page */ if (level.level != opaque->btpo.level) @@ -877,6 +872,140 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) return nextleveldown; } +/* + * Raise an error when target page's left link does not point 
back to the + * previous target page, called leftcurrent here. The leftcurrent page's + * right link was followed to get to the current target page, and we expect + * mutual agreement among leftcurrent and the current target page. Make sure + * that this condition has definitely been violated in the !readonly case, + * where concurrent page splits are something that we need to deal with. + * + * Cross-page inconsistencies involving pages that don't agree about being + * siblings are known to be a particularly good indicator of corruption + * involving partial writes/lost updates. The bt_right_page_check_scankey + * check also provides a way of detecting cross-page inconsistencies for + * !readonly callers, but it can only detect sibling pages that have an + * out-of-order keyspace, which can't catch many of the problems that we + * expect to catch here. + * + * The classic example of the kind of inconsistency that we can only catch + * with this check (when in !readonly mode) involves three sibling pages that + * were affected by a faulty page split at some point in the past. The + * effects of the split are reflected in the original page and its new right + * sibling page, with a lack of any accompanying changes for the _original_ + * right sibling page. The original right sibling page's left link fails to + * point to the new right sibling page (its left link still points to the + * original page), even though the first phase of a page split is supposed to + * work as a single atomic action. This subtle inconsistency will probably + * only break backwards scans in practice. + * + * Note that this is the only place where amcheck will "couple" buffer locks + * (and only for !readonly callers). In general we prefer to avoid more + * thorough cross-page checks in !readonly mode, but it seems worth the + * complexity here. Also, the performance overhead of performing lock + * coupling here is negligible in practice. 
Control only reaches here with a + * non-corrupt index when there is a concurrent page split at the instant + * caller crossed over to target page from leftcurrent page. + */ +static void +bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent) +{ + if (!state->readonly) + { + Buffer lbuf; + Buffer newtargetbuf; + Page page; + BTPageOpaque opaque; + BlockNumber newtargetblock; + + /* Couple locks in the usual order for nbtree: Left to right */ + lbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, leftcurrent, + RBM_NORMAL, state->checkstrategy); + LockBuffer(lbuf, BT_READ); + _bt_checkpage(state->rel, lbuf); + page = BufferGetPage(lbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISDELETED(opaque)) + { + /* + * Cannot reason about concurrently deleted page -- the left link + * in the page to the right is expected to point to some other + * page to the left (not leftcurrent page). + * + * Note that we deliberately don't give up with a half-dead page. + */ + UnlockReleaseBuffer(lbuf); + return; + } + + newtargetblock = opaque->btpo_next; + /* Avoid self-deadlock when newtargetblock == leftcurrent */ + if (newtargetblock != leftcurrent) + { + newtargetbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, + newtargetblock, RBM_NORMAL, + state->checkstrategy); + LockBuffer(newtargetbuf, BT_READ); + _bt_checkpage(state->rel, newtargetbuf); + page = BufferGetPage(newtargetbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* btpo_prev_from_target may have changed; update it */ + btpo_prev_from_target = opaque->btpo_prev; + } + else + { + /* + * leftcurrent right sibling points back to leftcurrent block. + * Index is corrupt. Easiest way to handle this is to pretend + * that we actually read from a distinct page that has an invalid + * block number in its btpo_prev. 
+ */ + newtargetbuf = InvalidBuffer; + btpo_prev_from_target = InvalidBlockNumber; + } + + /* + * No need to check P_ISDELETED here, since new target block cannot be + * marked deleted as long as we hold a lock on lbuf + */ + if (BufferIsValid(newtargetbuf)) + UnlockReleaseBuffer(newtargetbuf); + UnlockReleaseBuffer(lbuf); + + if (btpo_prev_from_target == leftcurrent) + { + /* Report split in left sibling, not target (or new target) */ + ereport(DEBUG1, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("harmless concurrent page split detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u new right sibling=%u original right sibling=%u.", + leftcurrent, newtargetblock, + state->targetblock))); + return; + } + + /* + * Index is corrupt. Make sure that we report correct target page. + * + * This could have changed in cases where there was a concurrent page + * split, as well as index corruption (at least in theory). Note that + * btpo_prev_from_target was already updated above. + */ + state->targetblock = newtargetblock; + } + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("left link/right link pair in index \"%s\" not in agreement", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u left block=%u left link from block=%u.", + state->targetblock, leftcurrent, + btpo_prev_from_target))); +} + /* * Function performs the following checks on target page, or pages ancillary to * target page: @@ -1965,18 +2094,14 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, * downlink, which was concurrently physically removed in target/parent as * part of deletion's first phase.) * - * Note that while the cross-page-same-level last item check uses a trick - * that allows it to perform verification for !readonly callers, a similar - * trick seems difficult here. 
The trick that that other check uses is, - * in essence, to lock down race conditions to those that occur due to - * concurrent page deletion of the target; that's a race that can be - * reliably detected before actually reporting corruption. - * - * On the other hand, we'd need to lock down race conditions involving - * deletion of child's left page, for long enough to read the child page - * into memory (in other words, a scheme with concurrently held buffer - * locks on both child and left-of-child pages). That's unacceptable for - * amcheck functions on general principle, though. + * While we use various techniques elsewhere to perform cross-page + * verification for !readonly callers, a similar trick seems difficult + * here. The tricks used by bt_recheck_sibling_links and by + * bt_right_page_check_scankey both involve verification of a same-level, + * cross-sibling invariant. Cross-level invariants are far more squishy, + * though. The nbtree REDO routines do not actually couple buffer locks + * across levels during page splits, so making any cross-level check work + * reliably in !readonly mode may be impossible. */ Assert(state->readonly); @@ -2785,6 +2910,8 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, * There is never an attempt to get a consistent view of multiple pages using * multiple concurrent buffer locks; in general, we only acquire a single pin * and buffer lock at a time, which is often all that the nbtree code requires. + * (Actually, bt_recheck_sibling_links couples buffer locks, which is the only + * exception to this general rule.) * * Operating on a copy of the page is useful because it prevents control * getting stuck in an uninterruptible state when an underlying operator class From 20e7e1fe316467720d8d062e1a1429f798fc31bf Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 8 Aug 2020 17:26:29 -0400 Subject: [PATCH 273/334] Remove <@ from contrib/intarray's GiST operator classes. 
Since commit efc77cf5f, an indexed query using <@ has required a full-index scan, so that it actually performs worse than a plain seqscan would do. As I noted at the time, we'd be better off to not treat <@ as being indexable by such indexes at all; and that's what this patch does. It would have been difficult to remove these opclass members without dropping the whole opclass before commit 9f9682783 fixed GiST opclass member dependency rules, but now it's quite simple, so let's do it. I left the existing support code in place for the time being, with comments noting it's now unreachable. At some point, perhaps we should remove that code in favor of throwing an error telling people to upgrade the extension version. Discussion: https://postgr.es/m/2176979.1596389859@sss.pgh.pa.us Discussion: https://postgr.es/m/458.1565114141@sss.pgh.pa.us --- contrib/intarray/Makefile | 3 ++- contrib/intarray/_int_gist.c | 6 ++++++ contrib/intarray/_intbig_gist.c | 6 ++++++ contrib/intarray/intarray--1.3--1.4.sql | 21 +++++++++++++++++++++ contrib/intarray/intarray.control | 2 +- doc/src/sgml/intarray.sgml | 5 +++-- 6 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 contrib/intarray/intarray--1.3--1.4.sql diff --git a/contrib/intarray/Makefile b/contrib/intarray/Makefile index b68959ebd64d..01faa36b1073 100644 --- a/contrib/intarray/Makefile +++ b/contrib/intarray/Makefile @@ -12,7 +12,8 @@ OBJS = \ _intbig_gist.o EXTENSION = intarray -DATA = intarray--1.2--1.3.sql intarray--1.2.sql intarray--1.1--1.2.sql \ +DATA = intarray--1.3--1.4.sql intarray--1.2--1.3.sql \ + intarray--1.2.sql intarray--1.1--1.2.sql \ intarray--1.0--1.1.sql PGFILEDESC = "intarray - functions and operators for arrays of integers" diff --git a/contrib/intarray/_int_gist.c b/contrib/intarray/_int_gist.c index fb05b06af9eb..f1817a6cce3b 100644 --- a/contrib/intarray/_int_gist.c +++ b/contrib/intarray/_int_gist.c @@ -93,6 +93,12 @@ g_int_consistent(PG_FUNCTION_ARGS) break; case 
RTContainedByStrategyNumber: case RTOldContainedByStrategyNumber: + + /* + * This code is unreachable as of intarray 1.4, because the <@ + * operator has been removed from the opclass. We keep it for now + * to support older versions of the SQL definitions. + */ if (GIST_LEAF(entry)) retval = inner_int_contains(query, (ArrayType *) DatumGetPointer(entry->key)); diff --git a/contrib/intarray/_intbig_gist.c b/contrib/intarray/_intbig_gist.c index 67c44e99a9a7..18ecd8cda6b1 100644 --- a/contrib/intarray/_intbig_gist.c +++ b/contrib/intarray/_intbig_gist.c @@ -533,6 +533,12 @@ g_intbig_consistent(PG_FUNCTION_ARGS) break; case RTContainedByStrategyNumber: case RTOldContainedByStrategyNumber: + + /* + * This code is unreachable as of intarray 1.4, because the <@ + * operator has been removed from the opclass. We keep it for now + * to support older versions of the SQL definitions. + */ if (GIST_LEAF(entry)) { int i, diff --git a/contrib/intarray/intarray--1.3--1.4.sql b/contrib/intarray/intarray--1.3--1.4.sql new file mode 100644 index 000000000000..3fbebb541737 --- /dev/null +++ b/contrib/intarray/intarray--1.3--1.4.sql @@ -0,0 +1,21 @@ +/* contrib/intarray/intarray--1.3--1.4.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION intarray UPDATE TO '1.4'" to load this file. \quit + +-- Remove <@ from the GiST opclasses, as it's not usefully indexable +-- due to mishandling of empty arrays. (It's OK in GIN.) + +ALTER OPERATOR FAMILY gist__int_ops USING gist +DROP OPERATOR 8 (_int4, _int4); + +ALTER OPERATOR FAMILY gist__intbig_ops USING gist +DROP OPERATOR 8 (_int4, _int4); + +-- Likewise for the old spelling ~. 
+ +ALTER OPERATOR FAMILY gist__int_ops USING gist +DROP OPERATOR 14 (_int4, _int4); + +ALTER OPERATOR FAMILY gist__intbig_ops USING gist +DROP OPERATOR 14 (_int4, _int4); diff --git a/contrib/intarray/intarray.control b/contrib/intarray/intarray.control index db7746b6c7a0..bbc837c5732e 100644 --- a/contrib/intarray/intarray.control +++ b/contrib/intarray/intarray.control @@ -1,6 +1,6 @@ # intarray extension comment = 'functions, operators, and index support for 1-D arrays of integers' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/_int' relocatable = true trusted = true diff --git a/doc/src/sgml/intarray.sgml b/doc/src/sgml/intarray.sgml index 9d2eb52eeb4f..c8db87e97df9 100644 --- a/doc/src/sgml/intarray.sgml +++ b/doc/src/sgml/intarray.sgml @@ -399,7 +399,7 @@ intarray provides index support for the - &&, @>, <@, + &&, @>, and @@ operators, as well as regular array equality. @@ -436,7 +436,8 @@ There is also a non-default GIN operator class - gin__int_ops supporting the same operators. + gin__int_ops, which supports these operators as well + as <@. From 1c164ef3d28dfab445a885a03e80cfd0d552f64a Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 9 Aug 2020 11:32:31 -0400 Subject: [PATCH 274/334] Remove useless Assert. Testing that an unsigned variable is >= 0 is pretty pointless, as noted by Coverity and numerous buildfarm members. In passing, add comment about new uses of "volatile" --- Coverity doesn't much like that either, but it seems probably necessary. 
--- src/backend/replication/logical/reorderbuffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 5b7afe6d9e9c..1975d629a6e2 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -1907,6 +1907,9 @@ ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, * merge) and replay the changes in lsn order. * * If streaming is true then data will be sent using stream API. + * + * Note: "volatile" markers on some parameters are to avoid trouble with + * PG_TRY inside the function. */ static void ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, @@ -2762,7 +2765,6 @@ ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, } Assert(txn->size <= rb->size); - Assert((txn->size >= 0) && (rb->size >= 0)); } /* From 1b9cde51246c7773eac119b84cc18095118735de Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 9 Aug 2020 12:39:07 -0400 Subject: [PATCH 275/334] Check for fseeko() failure in pg_dump's _tarAddFile(). Coverity pointed out, not unreasonably, that we checked fseeko's result at every other call site but these. Failure to seek in the temp file (note this is NOT pg_dump's output file) seems quite unlikely, and even if it did happen the file length cross-check further down would probably detect the problem. Still, that's a poor excuse for not checking the result of a system call. --- src/bin/pg_dump/pg_backup_tar.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index b4f594295927..c601ec07012a 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -1082,11 +1082,13 @@ _tarAddFile(ArchiveHandle *AH, TAR_MEMBER *th) /* * Find file len & go back to start. 
*/ - fseeko(tmp, 0, SEEK_END); + if (fseeko(tmp, 0, SEEK_END) != 0) + fatal("error during file seek: %m"); th->fileLen = ftello(tmp); if (th->fileLen < 0) fatal("could not determine seek position in archive file: %m"); - fseeko(tmp, 0, SEEK_SET); + if (fseeko(tmp, 0, SEEK_SET) != 0) + fatal("error during file seek: %m"); _tarWriteHeader(th); From d129c07499dbf0d5960115173515e3ce384c662a Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sun, 9 Aug 2020 12:01:15 -0700 Subject: [PATCH 276/334] Correct nbtree page split lock coupling comment. There is no reason to distinguish between readers and writers here. --- src/backend/access/nbtree/nbtinsert.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index e3a44bc09e02..d36f7557c87c 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1861,11 +1861,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, } /* - * We have to grab the right sibling (if any) and fix the prev pointer - * there. We are guaranteed that this is deadlock-free since no other - * writer will be holding a lock on that page and trying to move left, and - * all readers release locks on a page before trying to fetch its - * neighbors. + * We have to grab the original right sibling (if any) and update its prev + * link. We are guaranteed that this is deadlock-free, since we couple + * the locks in the standard order: left to right. */ if (!isrightmost) { From 7eeb1d9861b0a3f453f8b31c7648396cdd7f1e59 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 10 Aug 2020 10:44:42 -0400 Subject: [PATCH 277/334] Make contrib modules' installation scripts more secure. Hostile objects located within the installation-time search_path could capture references in an extension's installation or upgrade script. 
If the extension is being installed with superuser privileges, this opens the door to privilege escalation. While such hazards have existed all along, their urgency increases with the v13 "trusted extensions" feature, because that lets a non-superuser control the installation path for a superuser-privileged script. Therefore, make a number of changes to make such situations more secure: * Tweak the construction of the installation-time search_path to ensure that references to objects in pg_catalog can't be subverted; and explicitly add pg_temp to the end of the path to prevent attacks using temporary objects. * Disable check_function_bodies within installation/upgrade scripts, so that any security gaps in SQL-language or PL-language function bodies cannot create a risk of unwanted installation-time code execution. * Adjust lookup of type input/receive functions and join estimator functions to complain if there are multiple candidate functions. This prevents capture of references to functions whose signature is not the first one checked; and it's arguably more user-friendly anyway. * Modify various contrib upgrade scripts to ensure that catalog modification queries are executed with secure search paths. (These are in-place modifications with no extension version changes, since it is the update process itself that is at issue, not the end result.) Extensions that depend on other extensions cannot be made fully secure by these methods alone; therefore, revert the "trusted" marking that commit eb67623c9 applied to earthdistance and hstore_plperl, pending some better solution to that set of issues. Also add documentation around these issues, to help extension authors write secure installation scripts. Patch by me, following an observation by Andres Freund; thanks to Noah Misch for review. 
Security: CVE-2020-14350 --- contrib/btree_gist/btree_gist--1.1--1.2.sql | 56 +++-- contrib/citext/citext--1.1--1.2.sql | 26 ++- contrib/citext/citext--1.2--1.3.sql | 18 +- contrib/cube/cube--1.1--1.2.sql | 25 ++- contrib/cube/cube--1.3--1.4.sql | 25 ++- contrib/earthdistance/earthdistance--1.1.sql | 2 +- contrib/earthdistance/earthdistance.control | 1 - contrib/hstore/hstore--1.1--1.2.sql | 9 +- contrib/hstore/hstore--1.3--1.4.sql | 35 +++- contrib/hstore_plperl/hstore_plperl.control | 1 - contrib/intagg/intagg--1.0--1.1.sql | 14 +- contrib/intarray/intarray--1.1--1.2.sql | 27 ++- contrib/ltree/ltree--1.0--1.1.sql | 37 +++- contrib/pg_trgm/pg_trgm--1.2--1.3.sql | 25 ++- contrib/seg/seg--1.0--1.1.sql | 23 ++- contrib/seg/seg--1.2--1.3.sql | 25 ++- doc/src/sgml/earthdistance.sgml | 27 ++- doc/src/sgml/extend.sgml | 203 +++++++++++++++---- doc/src/sgml/hstore.sgml | 12 +- doc/src/sgml/ltree.sgml | 9 + doc/src/sgml/ref/create_extension.sgml | 37 +++- src/backend/commands/extension.c | 21 +- src/backend/commands/operatorcmds.c | 26 ++- src/backend/commands/typecmds.c | 50 +++-- 24 files changed, 575 insertions(+), 159 deletions(-) diff --git a/contrib/btree_gist/btree_gist--1.1--1.2.sql b/contrib/btree_gist/btree_gist--1.1--1.2.sql index 8487f9bfc88a..d5a8c6cf90e9 100644 --- a/contrib/btree_gist/btree_gist--1.1--1.2.sql +++ b/contrib/btree_gist/btree_gist--1.1--1.2.sql @@ -8,56 +8,72 @@ -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. 
+DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); + UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types ('gbt_oid_distance(internal,oid,int2,oid)', '{internal,oid,int2,oid,internal}'), ('gbt_oid_union(bytea,internal)', '{internal,internal}'), -('gbt_oid_same(internal,internal,internal)', '{gbtreekey8,gbtreekey8,internal}'), +('gbt_oid_same(internal,internal,internal)', '{SCH.gbtreekey8,SCH.gbtreekey8,internal}'), ('gbt_int2_distance(internal,int2,int2,oid)', '{internal,int2,int2,oid,internal}'), ('gbt_int2_union(bytea,internal)', '{internal,internal}'), -('gbt_int2_same(internal,internal,internal)', '{gbtreekey4,gbtreekey4,internal}'), +('gbt_int2_same(internal,internal,internal)', '{SCH.gbtreekey4,SCH.gbtreekey4,internal}'), ('gbt_int4_distance(internal,int4,int2,oid)', '{internal,int4,int2,oid,internal}'), ('gbt_int4_union(bytea,internal)', '{internal,internal}'), -('gbt_int4_same(internal,internal,internal)', '{gbtreekey8,gbtreekey8,internal}'), +('gbt_int4_same(internal,internal,internal)', '{SCH.gbtreekey8,SCH.gbtreekey8,internal}'), ('gbt_int8_distance(internal,int8,int2,oid)', '{internal,int8,int2,oid,internal}'), ('gbt_int8_union(bytea,internal)', '{internal,internal}'), -('gbt_int8_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_int8_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), 
('gbt_float4_distance(internal,float4,int2,oid)', '{internal,float4,int2,oid,internal}'), ('gbt_float4_union(bytea,internal)', '{internal,internal}'), -('gbt_float4_same(internal,internal,internal)', '{gbtreekey8,gbtreekey8,internal}'), +('gbt_float4_same(internal,internal,internal)', '{SCH.gbtreekey8,SCH.gbtreekey8,internal}'), ('gbt_float8_distance(internal,float8,int2,oid)', '{internal,float8,int2,oid,internal}'), ('gbt_float8_union(bytea,internal)', '{internal,internal}'), -('gbt_float8_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_float8_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), ('gbt_ts_distance(internal,timestamp,int2,oid)', '{internal,timestamp,int2,oid,internal}'), ('gbt_tstz_distance(internal,timestamptz,int2,oid)', '{internal,timestamptz,int2,oid,internal}'), ('gbt_ts_union(bytea,internal)', '{internal,internal}'), -('gbt_ts_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_ts_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), ('gbt_time_distance(internal,time,int2,oid)', '{internal,time,int2,oid,internal}'), ('gbt_time_union(bytea,internal)', '{internal,internal}'), -('gbt_time_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_time_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), ('gbt_date_distance(internal,date,int2,oid)', '{internal,date,int2,oid,internal}'), ('gbt_date_union(bytea,internal)', '{internal,internal}'), -('gbt_date_same(internal,internal,internal)', '{gbtreekey8,gbtreekey8,internal}'), +('gbt_date_same(internal,internal,internal)', '{SCH.gbtreekey8,SCH.gbtreekey8,internal}'), ('gbt_intv_distance(internal,interval,int2,oid)', '{internal,interval,int2,oid,internal}'), ('gbt_intv_union(bytea,internal)', '{internal,internal}'), -('gbt_intv_same(internal,internal,internal)', '{gbtreekey32,gbtreekey32,internal}'), 
+('gbt_intv_same(internal,internal,internal)', '{SCH.gbtreekey32,SCH.gbtreekey32,internal}'), ('gbt_cash_distance(internal,money,int2,oid)', '{internal,money,int2,oid,internal}'), ('gbt_cash_union(bytea,internal)', '{internal,internal}'), -('gbt_cash_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_cash_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), ('gbt_macad_union(bytea,internal)', '{internal,internal}'), -('gbt_macad_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_macad_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), ('gbt_text_union(bytea,internal)', '{internal,internal}'), -('gbt_text_same(internal,internal,internal)', '{gbtreekey_var,gbtreekey_var,internal}'), +('gbt_text_same(internal,internal,internal)', '{SCH.gbtreekey_var,SCH.gbtreekey_var,internal}'), ('gbt_bytea_union(bytea,internal)', '{internal,internal}'), -('gbt_bytea_same(internal,internal,internal)', '{gbtreekey_var,gbtreekey_var,internal}'), +('gbt_bytea_same(internal,internal,internal)', '{SCH.gbtreekey_var,SCH.gbtreekey_var,internal}'), ('gbt_numeric_union(bytea,internal)', '{internal,internal}'), -('gbt_numeric_same(internal,internal,internal)', '{gbtreekey_var,gbtreekey_var,internal}'), +('gbt_numeric_same(internal,internal,internal)', '{SCH.gbtreekey_var,SCH.gbtreekey_var,internal}'), ('gbt_bit_union(bytea,internal)', '{internal,internal}'), -('gbt_bit_same(internal,internal,internal)', '{gbtreekey_var,gbtreekey_var,internal}'), +('gbt_bit_same(internal,internal,internal)', '{SCH.gbtreekey_var,SCH.gbtreekey_var,internal}'), ('gbt_inet_union(bytea,internal)', '{internal,internal}'), -('gbt_inet_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +('gbt_inet_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}') +) AS 
update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' || replace(oldproc, 'SCH', my_schema)); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; diff --git a/contrib/citext/citext--1.1--1.2.sql b/contrib/citext/citext--1.1--1.2.sql index 4f0e4bc7195b..a8bba860a1d4 100644 --- a/contrib/citext/citext--1.1--1.2.sql +++ b/contrib/citext/citext--1.1--1.2.sql @@ -41,14 +41,28 @@ ALTER FUNCTION replace(citext, citext, citext) PARALLEL SAFE; ALTER FUNCTION split_part(citext, citext, int) PARALLEL SAFE; ALTER FUNCTION translate(citext, citext, text) PARALLEL SAFE; +-- We have to update aggregates the hard way for lack of ALTER support +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); + UPDATE pg_proc SET proparallel = 's' -WHERE oid = 'min(citext)'::pg_catalog.regprocedure; +WHERE oid = (my_schema || '.min(' || my_schema || '.citext)')::pg_catalog.regprocedure; UPDATE pg_proc SET proparallel = 's' -WHERE oid = 'max(citext)'::pg_catalog.regprocedure; +WHERE oid = (my_schema || '.max(' || my_schema || '.citext)')::pg_catalog.regprocedure; + +UPDATE pg_aggregate SET aggcombinefn = (my_schema || '.citext_smaller')::regproc +WHERE aggfnoid = (my_schema || '.max(' || my_schema || '.citext)')::pg_catalog.regprocedure; -UPDATE pg_aggregate SET aggcombinefn = 'citext_smaller' -WHERE aggfnoid = 'max(citext)'::pg_catalog.regprocedure; +UPDATE pg_aggregate SET aggcombinefn = (my_schema || '.citext_larger')::regproc +WHERE aggfnoid = (my_schema || '.max(' || my_schema || '.citext)')::pg_catalog.regprocedure; -UPDATE pg_aggregate 
SET aggcombinefn = 'citext_larger' -WHERE aggfnoid = 'max(citext)'::pg_catalog.regprocedure; +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; diff --git a/contrib/citext/citext--1.2--1.3.sql b/contrib/citext/citext--1.2--1.3.sql index 4ab867915c73..24a71452c624 100644 --- a/contrib/citext/citext--1.2--1.3.sql +++ b/contrib/citext/citext--1.2--1.3.sql @@ -3,5 +3,19 @@ -- complain if script is sourced in psql, rather than via ALTER EXTENSION \echo Use "ALTER EXTENSION citext UPDATE TO '1.3'" to load this file. \quit -UPDATE pg_aggregate SET aggcombinefn = 'citext_smaller' -WHERE aggfnoid = 'min(citext)'::pg_catalog.regprocedure; +-- We have to update aggregates the hard way for lack of ALTER support +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); + +UPDATE pg_aggregate SET aggcombinefn = (my_schema || '.citext_smaller')::regproc +WHERE aggfnoid = (my_schema || '.min(' || my_schema || '.citext)')::pg_catalog.regprocedure; + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; diff --git a/contrib/cube/cube--1.1--1.2.sql b/contrib/cube/cube--1.1--1.2.sql index 64a531e8b433..76aba239e5bc 100644 --- a/contrib/cube/cube--1.1--1.2.sql +++ b/contrib/cube/cube--1.1--1.2.sql @@ -7,16 +7,31 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. 
+DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types -('g_cube_consistent(internal,cube,int4,oid,internal)', '{internal,cube,int2,oid,internal}'), -('g_cube_distance(internal,cube,smallint,oid)', '{internal,cube,smallint,oid,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types +('g_cube_consistent(internal,SCH.cube,int4,oid,internal)', '{internal,SCH.cube,int2,oid,internal}'), +('g_cube_distance(internal,SCH.cube,smallint,oid)', '{internal,SCH.cube,smallint,oid,internal}') +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' || replace(oldproc, 'SCH', my_schema)); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION cube_in(cstring) PARALLEL SAFE; ALTER FUNCTION cube(float8[], float8[]) PARALLEL SAFE; diff --git a/contrib/cube/cube--1.3--1.4.sql b/contrib/cube/cube--1.3--1.4.sql index 869820c0c834..41629395df27 100644 --- a/contrib/cube/cube--1.3--1.4.sql +++ b/contrib/cube/cube--1.3--1.4.sql @@ -12,6 +12,15 @@ -- bound into a particular opclass. There's no SQL command for that, -- so fake it with a manual update on pg_depend. 
-- +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); + UPDATE pg_catalog.pg_depend SET deptype = 'a' WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass @@ -20,14 +29,10 @@ WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass FROM pg_catalog.pg_depend WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass AND refclassid = 'pg_catalog.pg_proc'::pg_catalog.regclass - AND (refobjid = 'g_cube_compress(pg_catalog.internal)'::pg_catalog.regprocedure)) + AND (refobjid = (my_schema || '.g_cube_compress(pg_catalog.internal)')::pg_catalog.regprocedure)) AND refclassid = 'pg_catalog.pg_opclass'::pg_catalog.regclass AND deptype = 'i'; -ALTER OPERATOR FAMILY gist_cube_ops USING gist drop function 3 (cube); -ALTER EXTENSION cube DROP function g_cube_compress(pg_catalog.internal); -DROP FUNCTION g_cube_compress(pg_catalog.internal); - UPDATE pg_catalog.pg_depend SET deptype = 'a' WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass @@ -36,10 +41,18 @@ WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass FROM pg_catalog.pg_depend WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass AND refclassid = 'pg_catalog.pg_proc'::pg_catalog.regclass - AND (refobjid = 'g_cube_decompress(pg_catalog.internal)'::pg_catalog.regprocedure)) + AND (refobjid = (my_schema || '.g_cube_decompress(pg_catalog.internal)')::pg_catalog.regprocedure)) AND refclassid = 'pg_catalog.pg_opclass'::pg_catalog.regclass AND deptype = 'i'; +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; + +ALTER OPERATOR FAMILY gist_cube_ops USING gist drop function 3 (cube); +ALTER EXTENSION cube DROP function g_cube_compress(pg_catalog.internal); +DROP FUNCTION 
g_cube_compress(pg_catalog.internal); + ALTER OPERATOR FAMILY gist_cube_ops USING gist drop function 4 (cube); ALTER EXTENSION cube DROP function g_cube_decompress(pg_catalog.internal); DROP FUNCTION g_cube_decompress(pg_catalog.internal); diff --git a/contrib/earthdistance/earthdistance--1.1.sql b/contrib/earthdistance/earthdistance--1.1.sql index 9136a54a7b34..9ef20ab848c5 100644 --- a/contrib/earthdistance/earthdistance--1.1.sql +++ b/contrib/earthdistance/earthdistance--1.1.sql @@ -31,7 +31,7 @@ CREATE DOMAIN earth AS cube CONSTRAINT not_point check(cube_is_point(value)) CONSTRAINT not_3d check(cube_dim(value) <= 3) CONSTRAINT on_surface check(abs(cube_distance(value, '(0)'::cube) / - earth() - 1) < '10e-7'::float8); + earth() - '1'::float8) < '10e-7'::float8); CREATE FUNCTION sec_to_gc(float8) RETURNS float8 diff --git a/contrib/earthdistance/earthdistance.control b/contrib/earthdistance/earthdistance.control index 3df666dfc1bb..5816d22cdd98 100644 --- a/contrib/earthdistance/earthdistance.control +++ b/contrib/earthdistance/earthdistance.control @@ -3,5 +3,4 @@ comment = 'calculate great-circle distances on the surface of the Earth' default_version = '1.1' module_pathname = '$libdir/earthdistance' relocatable = true -trusted = true requires = 'cube' diff --git a/contrib/hstore/hstore--1.1--1.2.sql b/contrib/hstore/hstore--1.1--1.2.sql index a868ffe48e1a..cc69fc7f802e 100644 --- a/contrib/hstore/hstore--1.1--1.2.sql +++ b/contrib/hstore/hstore--1.1--1.2.sql @@ -9,10 +9,13 @@ -- dependent on the extension. 
DO LANGUAGE plpgsql - $$ - +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); PERFORM 1 FROM pg_proc p @@ -27,6 +30,7 @@ BEGIN IF NOT FOUND THEN + PERFORM pg_catalog.set_config('search_path', old_path, true); CREATE FUNCTION hstore_to_json(hstore) RETURNS json @@ -43,6 +47,7 @@ BEGIN END IF; +PERFORM pg_catalog.set_config('search_path', old_path, true); END; $$; diff --git a/contrib/hstore/hstore--1.3--1.4.sql b/contrib/hstore/hstore--1.3--1.4.sql index d68956bb9495..53f26f9fb847 100644 --- a/contrib/hstore/hstore--1.3--1.4.sql +++ b/contrib/hstore/hstore--1.3--1.4.sql @@ -7,23 +7,38 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. 
+DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types -('ghstore_same(internal,internal,internal)', '{ghstore,ghstore,internal}'), -('ghstore_consistent(internal,internal,int4,oid,internal)', '{internal,hstore,int2,oid,internal}'), -('gin_extract_hstore(internal,internal)', '{hstore,internal}'), -('gin_extract_hstore_query(internal,internal,int2,internal,internal)', '{hstore,internal,int2,internal,internal}'), -('gin_consistent_hstore(internal,int2,internal,int4,internal,internal)', '{internal,int2,hstore,int4,internal,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types +('ghstore_same(internal,internal,internal)', '{SCH.ghstore,SCH.ghstore,internal}'), +('ghstore_consistent(internal,internal,int4,oid,internal)', '{internal,SCH.hstore,int2,oid,internal}'), +('gin_extract_hstore(internal,internal)', '{SCH.hstore,internal}'), +('gin_extract_hstore_query(internal,internal,int2,internal,internal)', '{SCH.hstore,internal,int2,internal,internal}'), +('gin_consistent_hstore(internal,int2,internal,int4,internal,internal)', '{internal,int2,SCH.hstore,int4,internal,internal}') +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' 
|| replace(oldproc, 'SCH', my_schema)); UPDATE pg_catalog.pg_proc SET - prorettype = 'ghstore'::pg_catalog.regtype -WHERE oid = pg_catalog.to_regprocedure('ghstore_union(internal,internal)'); + prorettype = (my_schema || '.ghstore')::pg_catalog.regtype +WHERE oid = pg_catalog.to_regprocedure((my_schema || '.ghstore_union(internal,internal)')); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION hstore_in(cstring) PARALLEL SAFE; ALTER FUNCTION hstore_out(hstore) PARALLEL SAFE; diff --git a/contrib/hstore_plperl/hstore_plperl.control b/contrib/hstore_plperl/hstore_plperl.control index 4b9fd13d04fc..16277f68c1cc 100644 --- a/contrib/hstore_plperl/hstore_plperl.control +++ b/contrib/hstore_plperl/hstore_plperl.control @@ -3,5 +3,4 @@ comment = 'transform between hstore and plperl' default_version = '1.0' module_pathname = '$libdir/hstore_plperl' relocatable = true -trusted = true requires = 'hstore,plperl' diff --git a/contrib/intagg/intagg--1.0--1.1.sql b/contrib/intagg/intagg--1.0--1.1.sql index b2a2820b0cac..c0cc17a033bd 100644 --- a/contrib/intagg/intagg--1.0--1.1.sql +++ b/contrib/intagg/intagg--1.0--1.1.sql @@ -6,6 +6,18 @@ ALTER FUNCTION int_agg_state(internal, int4) PARALLEL SAFE; ALTER FUNCTION int_agg_final_array(internal) PARALLEL SAFE; ALTER FUNCTION int_array_enum(int4[]) PARALLEL SAFE; +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_proc SET proparallel = 's' -WHERE oid = 'int_array_aggregate(int4)'::pg_catalog.regprocedure; +WHERE oid = (my_schema || '.int_array_aggregate(int4)')::pg_catalog.regprocedure; + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; diff --git 
a/contrib/intarray/intarray--1.1--1.2.sql b/contrib/intarray/intarray--1.1--1.2.sql index 468f245ecec9..919340ef01ef 100644 --- a/contrib/intarray/intarray--1.1--1.2.sql +++ b/contrib/intarray/intarray--1.1--1.2.sql @@ -7,23 +7,38 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types ('g_int_consistent(internal,_int4,int4,oid,internal)', '{internal,_int4,int2,oid,internal}'), ('g_intbig_consistent(internal,internal,int4,oid,internal)', '{internal,_int4,int2,oid,internal}'), -('g_intbig_same(internal,internal,internal)', '{intbig_gkey,intbig_gkey,internal}'), +('g_intbig_same(internal,internal,internal)', '{SCH.intbig_gkey,SCH.intbig_gkey,internal}'), ('ginint4_queryextract(internal,internal,int2,internal,internal,internal,internal)', '{_int4,internal,int2,internal,internal,internal,internal}'), ('ginint4_consistent(internal,int2,internal,int4,internal,internal,internal,internal)', '{internal,int2,_int4,int4,internal,internal,internal,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 
'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' || replace(oldproc, 'SCH', my_schema)); UPDATE pg_catalog.pg_proc SET - prorettype = 'intbig_gkey'::pg_catalog.regtype -WHERE oid = pg_catalog.to_regprocedure('g_intbig_union(internal,internal)'); + prorettype = (my_schema || '.intbig_gkey')::pg_catalog.regtype +WHERE oid = pg_catalog.to_regprocedure(my_schema || '.g_intbig_union(internal,internal)'); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION bqarr_in(cstring) PARALLEL SAFE; ALTER FUNCTION bqarr_out(query_int) PARALLEL SAFE; diff --git a/contrib/ltree/ltree--1.0--1.1.sql b/contrib/ltree/ltree--1.0--1.1.sql index 155751aa3a87..2ce6f5adbc21 100644 --- a/contrib/ltree/ltree--1.0--1.1.sql +++ b/contrib/ltree/ltree--1.0--1.1.sql @@ -7,26 +7,41 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. 
+DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types -('ltree_consistent(internal,internal,int2,oid,internal)', '{internal,ltree,int2,oid,internal}'), -('ltree_same(internal,internal,internal)', '{ltree_gist,ltree_gist,internal}'), -('_ltree_consistent(internal,internal,int2,oid,internal)', '{internal,_ltree,int2,oid,internal}'), -('_ltree_same(internal,internal,internal)', '{ltree_gist,ltree_gist,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types +('ltree_consistent(internal,internal,int2,oid,internal)', '{internal,SCH.ltree,int2,oid,internal}'), +('ltree_same(internal,internal,internal)', '{SCH.ltree_gist,SCH.ltree_gist,internal}'), +('_ltree_consistent(internal,internal,int2,oid,internal)', '{internal,SCH._ltree,int2,oid,internal}'), +('_ltree_same(internal,internal,internal)', '{SCH.ltree_gist,SCH.ltree_gist,internal}') +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' 
|| replace(oldproc, 'SCH', my_schema)); UPDATE pg_catalog.pg_proc SET - prorettype = 'ltree_gist'::pg_catalog.regtype -WHERE oid = pg_catalog.to_regprocedure('ltree_union(internal,internal)'); + prorettype = (my_schema || '.ltree_gist')::pg_catalog.regtype +WHERE oid = pg_catalog.to_regprocedure(my_schema || '.ltree_union(internal,internal)'); UPDATE pg_catalog.pg_proc SET - prorettype = 'ltree_gist'::pg_catalog.regtype -WHERE oid = pg_catalog.to_regprocedure('_ltree_union(internal,internal)'); + prorettype = (my_schema || '.ltree_gist')::pg_catalog.regtype +WHERE oid = pg_catalog.to_regprocedure(my_schema || '._ltree_union(internal,internal)'); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION ltree_in(cstring) PARALLEL SAFE; ALTER FUNCTION ltree_out(ltree) PARALLEL SAFE; diff --git a/contrib/pg_trgm/pg_trgm--1.2--1.3.sql b/contrib/pg_trgm/pg_trgm--1.2--1.3.sql index b082dcd8d841..8dc772c40727 100644 --- a/contrib/pg_trgm/pg_trgm--1.2--1.3.sql +++ b/contrib/pg_trgm/pg_trgm--1.2--1.3.sql @@ -7,21 +7,36 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. 
+DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types ('gtrgm_consistent(internal,text,int4,oid,internal)', '{internal,text,int2,oid,internal}'), ('gtrgm_distance(internal,text,int4,oid)', '{internal,text,int2,oid,internal}'), ('gtrgm_union(bytea,internal)', '{internal,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' 
|| replace(oldproc, 'SCH', my_schema)); UPDATE pg_catalog.pg_proc SET - prorettype = 'gtrgm'::pg_catalog.regtype -WHERE oid = pg_catalog.to_regprocedure('gtrgm_union(internal,internal)'); + prorettype = (my_schema || '.gtrgm')::pg_catalog.regtype +WHERE oid = pg_catalog.to_regprocedure(my_schema || '.gtrgm_union(internal,internal)'); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION set_limit(float4) PARALLEL UNSAFE; ALTER FUNCTION show_limit() PARALLEL SAFE; diff --git a/contrib/seg/seg--1.0--1.1.sql b/contrib/seg/seg--1.0--1.1.sql index 2dcd4d428003..ae6cb2fba889 100644 --- a/contrib/seg/seg--1.0--1.1.sql +++ b/contrib/seg/seg--1.0--1.1.sql @@ -7,15 +7,30 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types -('gseg_consistent(internal,seg,int4,oid,internal)', '{internal,seg,int2,oid,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types +('gseg_consistent(internal,SCH.seg,int4,oid,internal)', '{internal,SCH.seg,int2,oid,internal}') +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', 
my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' || replace(oldproc, 'SCH', my_schema)); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION seg_in(cstring) PARALLEL SAFE; ALTER FUNCTION seg_out(seg) PARALLEL SAFE; diff --git a/contrib/seg/seg--1.2--1.3.sql b/contrib/seg/seg--1.2--1.3.sql index cd71a300f6df..578e98953ca3 100644 --- a/contrib/seg/seg--1.2--1.3.sql +++ b/contrib/seg/seg--1.2--1.3.sql @@ -12,6 +12,15 @@ -- bound into a particular opclass. There's no SQL command for that, -- so fake it with a manual update on pg_depend. -- +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); + UPDATE pg_catalog.pg_depend SET deptype = 'a' WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass @@ -20,14 +29,10 @@ WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass FROM pg_catalog.pg_depend WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass AND refclassid = 'pg_catalog.pg_proc'::pg_catalog.regclass - AND (refobjid = 'gseg_compress(pg_catalog.internal)'::pg_catalog.regprocedure)) + AND (refobjid = (my_schema || '.gseg_compress(internal)')::pg_catalog.regprocedure)) AND refclassid = 'pg_catalog.pg_opclass'::pg_catalog.regclass AND deptype = 'i'; -ALTER OPERATOR FAMILY gist_seg_ops USING gist drop function 3 (seg); -ALTER EXTENSION seg DROP function gseg_compress(pg_catalog.internal); -DROP function gseg_compress(pg_catalog.internal); - UPDATE pg_catalog.pg_depend SET deptype = 'a' WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass @@ -36,10 +41,18 @@ WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass FROM pg_catalog.pg_depend WHERE classid = 
'pg_catalog.pg_amproc'::pg_catalog.regclass AND refclassid = 'pg_catalog.pg_proc'::pg_catalog.regclass - AND (refobjid = 'gseg_decompress(pg_catalog.internal)'::pg_catalog.regprocedure)) + AND (refobjid = (my_schema || '.gseg_decompress(internal)')::pg_catalog.regprocedure)) AND refclassid = 'pg_catalog.pg_opclass'::pg_catalog.regclass AND deptype = 'i'; +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; + +ALTER OPERATOR FAMILY gist_seg_ops USING gist drop function 3 (seg); +ALTER EXTENSION seg DROP function gseg_compress(pg_catalog.internal); +DROP function gseg_compress(pg_catalog.internal); + ALTER OPERATOR FAMILY gist_seg_ops USING gist drop function 4 (seg); ALTER EXTENSION seg DROP function gseg_decompress(pg_catalog.internal); DROP function gseg_decompress(pg_catalog.internal); diff --git a/doc/src/sgml/earthdistance.sgml b/doc/src/sgml/earthdistance.sgml index 4ac52cb191cb..641e69c5e984 100644 --- a/doc/src/sgml/earthdistance.sgml +++ b/doc/src/sgml/earthdistance.sgml @@ -10,9 +10,8 @@ The earthdistance module provides two different approaches to calculating great circle distances on the surface of the Earth. The one - described first depends on the cube module (which - must be installed before earthdistance can be - installed). The second one is based on the built-in point data type, + described first depends on the cube module. + The second one is based on the built-in point data type, using longitude and latitude for the coordinates. @@ -24,11 +23,27 @@ - This module is considered trusted, that is, it can be - installed by non-superusers who have CREATE privilege - on the current database. + The cube module must be installed + before earthdistance can be installed + (although you can use the CASCADE option + of CREATE EXTENSION to install both in one command). 
+ + + It is strongly recommended that earthdistance + and cube be installed in the same schema, and that + that schema be one for which CREATE privilege has not been and will not + be granted to any untrusted users. + Otherwise there are installation-time security hazards + if earthdistance's schema contains objects defined + by a hostile user. + Furthermore, when using earthdistance's functions + after installation, the entire search path should contain only trusted + schemas. + + + Cube-Based Earth Distances diff --git a/doc/src/sgml/extend.sgml b/doc/src/sgml/extend.sgml index 890ff97b7aef..641c9ce3c9ba 100644 --- a/doc/src/sgml/extend.sgml +++ b/doc/src/sgml/extend.sgml @@ -540,7 +540,7 @@ RETURNS anycompatible AS ... The extension script may set privileges on objects that are part of the - extension via GRANT and REVOKE + extension, using GRANT and REVOKE statements. The final set of privileges for each object (if any are set) will be stored in the pg_init_privs @@ -597,32 +597,6 @@ RETURNS anycompatible AS ... dropping the whole extension. - - Defining Extension Objects - - - - Widely-distributed extensions should assume little about the database - they occupy. In particular, unless you issued SET search_path = - pg_temp, assume each unqualified name could resolve to an - object that a malicious user has defined. Beware of constructs that - depend on search_path implicitly: IN - and CASE expression WHEN - always select an operator using the search path. In their place, use - OPERATOR(schema.=) ANY - and CASE WHEN expression. - - - - Extension Files @@ -740,7 +714,8 @@ RETURNS anycompatible AS ... If this parameter is true (which is the default), only superusers can create the extension or update it to a new - version. If it is set to false, just the privileges + version (but see also trusted, below). + If it is set to false, just the privileges required to execute the commands in the installation or update script are required. 
This should normally be set to true if any of the @@ -768,6 +743,9 @@ RETURNS anycompatible AS ... Generally, this should not be set true for extensions that could allow access to otherwise-superuser-only abilities, such as file system access. + Also, marking an extension trusted requires significant extra effort + to write the extension's installation and update script(s) securely; + see . @@ -921,7 +899,7 @@ RETURNS anycompatible AS ... schema; that is, CREATE EXTENSION does the equivalent of this: -SET LOCAL search_path TO @extschema@; +SET LOCAL search_path TO @extschema@, pg_temp; This allows the objects created by the script file to go into the target schema. The script file can change search_path if it wishes, @@ -941,9 +919,15 @@ SET LOCAL search_path TO @extschema@; If any prerequisite extensions are listed in requires - in the control file, their target schemas are appended to the initial - setting of search_path. This allows their objects to be - visible to the new extension's script file. + in the control file, their target schemas are added to the initial + setting of search_path, following the new + extension's target schema. This allows their objects to be visible to + the new extension's script file. + + + + For security, pg_temp is automatically appended to + the end of search_path in all cases. @@ -1170,6 +1154,154 @@ SELECT * FROM pg_extension_update_paths('extension_name + + Security Considerations for Extensions + + + Widely-distributed extensions should assume little about the database + they occupy. Therefore, it's appropriate to write functions provided + by an extension in a secure style that cannot be compromised by + search-path-based attacks. + + + + An extension that has the superuser property set to + true must also consider security hazards for the actions taken within + its installation and update scripts. 
It is not terribly difficult for + a malicious user to create trojan-horse objects that will compromise + later execution of a carelessly-written extension script, allowing that + user to acquire superuser privileges. + + + + If an extension is marked trusted, then its + installation schema can be selected by the installing user, who might + intentionally use an insecure schema in hopes of gaining superuser + privileges. Therefore, a trusted extension is extremely exposed from a + security standpoint, and all its script commands must be carefully + examined to ensure that no compromise is possible. + + + + Advice about writing functions securely is provided in + below, and advice + about writing installation scripts securely is provided in + . + + + + Security Considerations for Extension Functions + + + SQL-language and PL-language functions provided by extensions are at + risk of search-path-based attacks when they are executed, since + parsing of these functions occurs at execution time not creation time. + + + + The CREATE + FUNCTION reference page contains advice about + writing SECURITY DEFINER functions safely. It's + good practice to apply those techniques for any function provided by + an extension, since the function might be called by a high-privilege + user. + + + + + If you cannot set the search_path to contain only + secure schemas, assume that each unqualified name could resolve to an + object that a malicious user has defined. Beware of constructs that + depend on search_path implicitly; for + example, IN + and CASE expression WHEN + always select an operator using the search path. In their place, use + OPERATOR(schema.=) ANY + and CASE WHEN expression. + + + + A general-purpose extension usually should not assume that it's been + installed into a secure schema, which means that even schema-qualified + references to its own objects are not entirely risk-free. 
For + example, if the extension has defined a + function myschema.myfunc(bigint) then a call such + as myschema.myfunc(42) could be captured by a + hostile function myschema.myfunc(integer). Be + careful that the data types of function and operator parameters exactly + match the declared argument types, using explicit casts where necessary. + + + + + Security Considerations for Extension Scripts + + + An extension installation or update script should be written to guard + against search-path-based attacks occurring when the script executes. + If an object reference in the script can be made to resolve to some + other object than the script author intended, then a compromise might + occur immediately, or later when the mis-defined extension object is + used. + + + + DDL commands such as CREATE FUNCTION + and CREATE OPERATOR CLASS are generally secure, + but beware of any command having a general-purpose expression as a + component. For example, CREATE VIEW needs to be + vetted, as does a DEFAULT expression + in CREATE FUNCTION. + + + + Sometimes an extension script might need to execute general-purpose + SQL, for example to make catalog adjustments that aren't possible via + DDL. Be careful to execute such commands with a + secure search_path; do not + trust the path provided by CREATE/ALTER EXTENSION + to be secure. Best practice is to temporarily + set search_path to 'pg_catalog, + pg_temp' and insert references to the extension's + installation schema explicitly where needed. (This practice might + also be helpful for creating views.) Examples can be found in + the contrib modules in + the PostgreSQL source code distribution. + + + + Cross-extension references are extremely difficult to make fully + secure, partially because of uncertainty about which schema the other + extension is in. 
The hazards are reduced if both extensions are + installed in the same schema, because then a hostile object cannot be + placed ahead of the referenced extension in the installation-time + search_path. However, no mechanism currently exists + to require that. For now, best practice is to not mark an extension + trusted if it depends on another one, unless that other one is always + installed in pg_catalog. + + + + Do not use CREATE OR REPLACE + FUNCTION, except in an update script that must change the + definition of a function that is known to be an extension member + already. (Likewise for other OR REPLACE options.) + Using OR REPLACE unnecessarily not only has a risk + of accidentally overwriting someone else's function, but it creates a + security hazard since the overwritten function would still be owned by + its original owner, who could modify it. + + + + Extension Example @@ -1189,18 +1321,18 @@ SELECT * FROM pg_extension_update_paths('extension_name (LEFTARG = text, RIGHTARG = text, FUNCTION = pair); -- "SET search_path" is easy to get right, but qualified names perform better. 
-CREATE OR REPLACE FUNCTION lower(pair) +CREATE FUNCTION lower(pair) RETURNS pair LANGUAGE SQL AS 'SELECT ROW(lower($1.k), lower($1.v))::@extschema@.pair;' SET search_path = pg_temp; -CREATE OR REPLACE FUNCTION pair_concat(pair, pair) +CREATE FUNCTION pair_concat(pair, pair) RETURNS pair LANGUAGE SQL AS 'SELECT ROW($1.k OPERATOR(pg_catalog.||) $2.k, $1.v OPERATOR(pg_catalog.||) $2.v)::@extschema@.pair;'; @@ -1215,6 +1347,7 @@ AS 'SELECT ROW($1.k OPERATOR(pg_catalog.||) $2.k, # pair extension comment = 'A key/value pair data type' default_version = '1.0' +# cannot be relocatable because of use of @extschema@ relocatable = false diff --git a/doc/src/sgml/hstore.sgml b/doc/src/sgml/hstore.sgml index fd75e92790b3..8a1caa357613 100644 --- a/doc/src/sgml/hstore.sgml +++ b/doc/src/sgml/hstore.sgml @@ -918,10 +918,14 @@ ALTER TABLE tablename ALTER hstorecol TYPE hstore USING hstorecol || ''; Python dictionaries. - - Of these additional extensions, hstore_plperl is - considered trusted; the rest are not. - + + + It is strongly recommended that the transform extensions be installed in + the same schema as hstore. Otherwise there are + installation-time security hazards if a transform extension's schema + contains objects defined by a hostile user. + + diff --git a/doc/src/sgml/ltree.sgml b/doc/src/sgml/ltree.sgml index dea453fc7599..36aa2b5fad86 100644 --- a/doc/src/sgml/ltree.sgml +++ b/doc/src/sgml/ltree.sgml @@ -835,6 +835,15 @@ ltreetest=> SELECT ins_label(path,2,'Space') FROM test WHERE path <@ 'Top. creating a function, ltree values are mapped to Python lists. (The reverse is currently not supported, however.) + + + + It is strongly recommended that the transform extensions be installed in + the same schema as ltree. Otherwise there are + installation-time security hazards if a transform extension's schema + contains objects defined by a hostile user. 
+ + diff --git a/doc/src/sgml/ref/create_extension.sgml b/doc/src/sgml/ref/create_extension.sgml index 756dd193f854..efd7fc646560 100644 --- a/doc/src/sgml/ref/create_extension.sgml +++ b/doc/src/sgml/ref/create_extension.sgml @@ -177,6 +177,33 @@ CREATE EXTENSION [ IF NOT EXISTS ] extension_name system views. + + + Installing an extension as superuser requires trusting that the + extension's author wrote the extension installation script in a secure + fashion. It is not terribly difficult for a malicious user to create + trojan-horse objects that will compromise later execution of a + carelessly-written extension script, allowing that user to acquire + superuser privileges. However, trojan-horse objects are only hazardous + if they are in the search_path during script + execution, meaning that they are in the extension's installation target + schema or in the schema of some extension it depends on. Therefore, a + good rule of thumb when dealing with extensions whose scripts have not + been carefully vetted is to install them only into schemas for which + CREATE privilege has not been and will not be granted to any untrusted + users. Likewise for any extensions they depend on. + + + + The extensions supplied with PostgreSQL are + believed to be secure against installation-time attacks of this sort, + except for a few that depend on other extensions. As stated in the + documentation for those extensions, they should be installed into secure + schemas, or installed into the same schemas as the extensions they + depend on, or both. + + + For information about writing new extensions, see . 
@@ -188,10 +215,16 @@ CREATE EXTENSION [ IF NOT EXISTS ] extension_name Install the hstore extension into the - current database: + current database, placing its objects in schema addons: + +CREATE EXTENSION hstore SCHEMA addons; + + Another way to accomplish the same thing: +SET search_path = addons; CREATE EXTENSION hstore; - + + diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index c796fcd8da0a..b5630b4c8d98 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -908,9 +908,21 @@ execute_extension_script(Oid extensionOid, ExtensionControlFile *control, GUC_ACTION_SAVE, true, 0, false); /* - * Set up the search path to contain the target schema, then the schemas - * of any prerequisite extensions, and nothing else. In particular this - * makes the target schema be the default creation target namespace. + * Similarly disable check_function_bodies, to ensure that SQL functions + * won't be parsed during creation. + */ + if (check_function_bodies) + (void) set_config_option("check_function_bodies", "off", + PGC_USERSET, PGC_S_SESSION, + GUC_ACTION_SAVE, true, 0, false); + + /* + * Set up the search path to have the target schema first, making it be + * the default creation target namespace. Then add the schemas of any + * prerequisite extensions, unless they are in pg_catalog which would be + * searched anyway. (Listing pg_catalog explicitly in a non-first + * position would be bad for security.) Finally add pg_temp to ensure + * that temp objects can't take precedence over others. * * Note: it might look tempting to use PushOverrideSearchPath for this, * but we cannot do that. 
We have to actually set the search_path GUC in @@ -924,9 +936,10 @@ execute_extension_script(Oid extensionOid, ExtensionControlFile *control, Oid reqschema = lfirst_oid(lc); char *reqname = get_namespace_name(reqschema); - if (reqname) + if (reqname && strcmp(reqname, "pg_catalog") != 0) appendStringInfo(&pathbuf, ", %s", quote_identifier(reqname)); } + appendStringInfoString(&pathbuf, ", pg_temp"); (void) set_config_option("search_path", pathbuf.data, PGC_USERSET, PGC_S_SESSION, diff --git a/src/backend/commands/operatorcmds.c b/src/backend/commands/operatorcmds.c index 0a53e9b93e21..bf23937849c9 100644 --- a/src/backend/commands/operatorcmds.c +++ b/src/backend/commands/operatorcmds.c @@ -297,6 +297,7 @@ ValidateJoinEstimator(List *joinName) { Oid typeId[5]; Oid joinOid; + Oid joinOid2; AclResult aclresult; typeId[0] = INTERNALOID; /* PlannerInfo */ @@ -307,15 +308,26 @@ ValidateJoinEstimator(List *joinName) /* * As of Postgres 8.4, the preferred signature for join estimators has 5 - * arguments, but we still allow the old 4-argument form. Try the - * preferred form first. + * arguments, but we still allow the old 4-argument form. Whine about + * ambiguity if both forms exist. 
*/ joinOid = LookupFuncName(joinName, 5, typeId, true); - if (!OidIsValid(joinOid)) - joinOid = LookupFuncName(joinName, 4, typeId, true); - /* If not found, reference the 5-argument signature in error msg */ - if (!OidIsValid(joinOid)) - joinOid = LookupFuncName(joinName, 5, typeId, false); + joinOid2 = LookupFuncName(joinName, 4, typeId, true); + if (OidIsValid(joinOid)) + { + if (OidIsValid(joinOid2)) + ereport(ERROR, + (errcode(ERRCODE_AMBIGUOUS_FUNCTION), + errmsg("join estimator function %s has multiple matches", + NameListToString(joinName)))); + } + else + { + joinOid = joinOid2; + /* If not found, reference the 5-argument signature in error msg */ + if (!OidIsValid(joinOid)) + joinOid = LookupFuncName(joinName, 5, typeId, false); + } /* estimators must return float8 */ if (get_func_rettype(joinOid) != FLOAT8OID) diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 2e107ace39be..483bb65ddc89 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -1627,21 +1627,31 @@ findTypeInputFunction(List *procname, Oid typeOid) { Oid argList[3]; Oid procOid; + Oid procOid2; /* * Input functions can take a single argument of type CSTRING, or three - * arguments (string, typioparam OID, typmod). They must return the - * target type. + * arguments (string, typioparam OID, typmod). Whine about ambiguity if + * both forms exist. 
*/ argList[0] = CSTRINGOID; + argList[1] = OIDOID; + argList[2] = INT4OID; procOid = LookupFuncName(procname, 1, argList, true); - if (!OidIsValid(procOid)) + procOid2 = LookupFuncName(procname, 3, argList, true); + if (OidIsValid(procOid)) { - argList[1] = OIDOID; - argList[2] = INT4OID; - - procOid = LookupFuncName(procname, 3, argList, true); + if (OidIsValid(procOid2)) + ereport(ERROR, + (errcode(ERRCODE_AMBIGUOUS_FUNCTION), + errmsg("type input function %s has multiple matches", + NameListToString(procname)))); + } + else + { + procOid = procOid2; + /* If not found, reference the 1-argument signature in error msg */ if (!OidIsValid(procOid)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION), @@ -1649,6 +1659,7 @@ findTypeInputFunction(List *procname, Oid typeOid) func_signature_string(procname, 1, NIL, argList)))); } + /* Input functions must return the target type. */ if (get_func_rettype(procOid) != typeOid) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -1714,21 +1725,31 @@ findTypeReceiveFunction(List *procname, Oid typeOid) { Oid argList[3]; Oid procOid; + Oid procOid2; /* * Receive functions can take a single argument of type INTERNAL, or three - * arguments (internal, typioparam OID, typmod). They must return the - * target type. + * arguments (internal, typioparam OID, typmod). Whine about ambiguity if + * both forms exist. 
*/ argList[0] = INTERNALOID; + argList[1] = OIDOID; + argList[2] = INT4OID; procOid = LookupFuncName(procname, 1, argList, true); - if (!OidIsValid(procOid)) + procOid2 = LookupFuncName(procname, 3, argList, true); + if (OidIsValid(procOid)) { - argList[1] = OIDOID; - argList[2] = INT4OID; - - procOid = LookupFuncName(procname, 3, argList, true); + if (OidIsValid(procOid2)) + ereport(ERROR, + (errcode(ERRCODE_AMBIGUOUS_FUNCTION), + errmsg("type receive function %s has multiple matches", + NameListToString(procname)))); + } + else + { + procOid = procOid2; + /* If not found, reference the 1-argument signature in error msg */ if (!OidIsValid(procOid)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION), @@ -1736,6 +1757,7 @@ findTypeReceiveFunction(List *procname, Oid typeOid) func_signature_string(procname, 1, NIL, argList)))); } + /* Receive functions must return the target type. */ if (get_func_rettype(procOid) != typeOid) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), From e078fb5d4eeb23d0d09932e0b183a8e7bdfb17b4 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Mon, 10 Aug 2020 09:22:54 -0700 Subject: [PATCH 278/334] Move connect.h from fe_utils to src/include/common. Any libpq client can use the header. Clients include backend components postgres_fdw, dblink, and logical replication apply worker. Back-patch to v10, because another fix needs this. In released branches, just copy the header and keep the original. 
--- contrib/oid2name/oid2name.c | 2 +- contrib/vacuumlo/vacuumlo.c | 2 +- src/bin/pg_basebackup/streamutil.c | 2 +- src/bin/pg_dump/pg_backup_db.c | 2 +- src/bin/pg_dump/pg_dump.c | 2 +- src/bin/pg_dump/pg_dumpall.c | 2 +- src/bin/pg_rewind/libpq_fetch.c | 2 +- src/bin/pg_upgrade/server.c | 2 +- src/bin/scripts/common.c | 2 +- src/bin/scripts/reindexdb.c | 2 +- src/bin/scripts/vacuumdb.c | 2 +- src/fe_utils/cancel.c | 2 +- src/include/{fe_utils => common}/connect.h | 2 +- src/tools/findoidjoins/findoidjoins.c | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) rename src/include/{fe_utils => common}/connect.h (96%) diff --git a/contrib/oid2name/oid2name.c b/contrib/oid2name/oid2name.c index c7d0f9025a43..91b7958c48ef 100644 --- a/contrib/oid2name/oid2name.c +++ b/contrib/oid2name/oid2name.c @@ -10,8 +10,8 @@ #include "postgres_fe.h" #include "catalog/pg_class_d.h" +#include "common/connect.h" #include "common/logging.h" -#include "fe_utils/connect.h" #include "getopt_long.h" #include "libpq-fe.h" #include "pg_getopt.h" diff --git a/contrib/vacuumlo/vacuumlo.c b/contrib/vacuumlo/vacuumlo.c index 92bdf71356b1..e4019fafaa9e 100644 --- a/contrib/vacuumlo/vacuumlo.c +++ b/contrib/vacuumlo/vacuumlo.c @@ -22,8 +22,8 @@ #endif #include "catalog/pg_class_d.h" +#include "common/connect.h" #include "common/logging.h" -#include "fe_utils/connect.h" #include "getopt_long.h" #include "libpq-fe.h" #include "pg_getopt.h" diff --git a/src/bin/pg_basebackup/streamutil.c b/src/bin/pg_basebackup/streamutil.c index 410116492ea1..c08003e7f2c7 100644 --- a/src/bin/pg_basebackup/streamutil.c +++ b/src/bin/pg_basebackup/streamutil.c @@ -18,11 +18,11 @@ #include #include "access/xlog_internal.h" +#include "common/connect.h" #include "common/fe_memutils.h" #include "common/file_perm.h" #include "common/logging.h" #include "datatype/timestamp.h" -#include "fe_utils/connect.h" #include "port/pg_bswap.h" #include "pqexpbuffer.h" #include "receivelog.h" diff --git 
a/src/bin/pg_dump/pg_backup_db.c b/src/bin/pg_dump/pg_backup_db.c index 6dba7e19e433..94af11b80a39 100644 --- a/src/bin/pg_dump/pg_backup_db.c +++ b/src/bin/pg_dump/pg_backup_db.c @@ -17,8 +17,8 @@ #include #endif +#include "common/connect.h" #include "dumputils.h" -#include "fe_utils/connect.h" #include "fe_utils/string_utils.h" #include "parallel.h" #include "pg_backup_archiver.h" diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 94459b3539ad..9c8436dde6cc 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -52,8 +52,8 @@ #include "catalog/pg_proc_d.h" #include "catalog/pg_trigger_d.h" #include "catalog/pg_type_d.h" +#include "common/connect.h" #include "dumputils.h" -#include "fe_utils/connect.h" #include "fe_utils/string_utils.h" #include "getopt_long.h" #include "libpq/libpq-fs.h" diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 8d5484910231..2c82b39af0d2 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -18,10 +18,10 @@ #include #include +#include "common/connect.h" #include "common/file_utils.h" #include "common/logging.h" #include "dumputils.h" -#include "fe_utils/connect.h" #include "fe_utils/string_utils.h" #include "getopt_long.h" #include "pg_backup.h" diff --git a/src/bin/pg_rewind/libpq_fetch.c b/src/bin/pg_rewind/libpq_fetch.c index c44648f82318..bf4dfc23b963 100644 --- a/src/bin/pg_rewind/libpq_fetch.c +++ b/src/bin/pg_rewind/libpq_fetch.c @@ -15,8 +15,8 @@ #include #include "catalog/pg_type_d.h" +#include "common/connect.h" #include "datapagemap.h" -#include "fe_utils/connect.h" #include "fetch.h" #include "file_ops.h" #include "filemap.h" diff --git a/src/bin/pg_upgrade/server.c b/src/bin/pg_upgrade/server.c index 79ec3f04c0ec..7db3c1d51f2e 100644 --- a/src/bin/pg_upgrade/server.c +++ b/src/bin/pg_upgrade/server.c @@ -9,7 +9,7 @@ #include "postgres_fe.h" -#include "fe_utils/connect.h" +#include "common/connect.h" #include 
"fe_utils/string_utils.h" #include "pg_upgrade.h" diff --git a/src/bin/scripts/common.c b/src/bin/scripts/common.c index ee65cc39481a..420d0d11a5a1 100644 --- a/src/bin/scripts/common.c +++ b/src/bin/scripts/common.c @@ -18,9 +18,9 @@ #include #include "common.h" +#include "common/connect.h" #include "common/logging.h" #include "fe_utils/cancel.h" -#include "fe_utils/connect.h" #include "fe_utils/string_utils.h" #define ERRCODE_UNDEFINED_TABLE "42P01" diff --git a/src/bin/scripts/reindexdb.c b/src/bin/scripts/reindexdb.c index b7b19ccc1ca9..40dcbc928332 100644 --- a/src/bin/scripts/reindexdb.c +++ b/src/bin/scripts/reindexdb.c @@ -13,9 +13,9 @@ #include "catalog/pg_class_d.h" #include "common.h" +#include "common/connect.h" #include "common/logging.h" #include "fe_utils/cancel.h" -#include "fe_utils/connect.h" #include "fe_utils/simple_list.h" #include "fe_utils/string_utils.h" #include "scripts_parallel.h" diff --git a/src/bin/scripts/vacuumdb.c b/src/bin/scripts/vacuumdb.c index 6a3c941158fb..125ed2ff5a46 100644 --- a/src/bin/scripts/vacuumdb.c +++ b/src/bin/scripts/vacuumdb.c @@ -15,9 +15,9 @@ #include "catalog/pg_class_d.h" #include "common.h" +#include "common/connect.h" #include "common/logging.h" #include "fe_utils/cancel.h" -#include "fe_utils/connect.h" #include "fe_utils/simple_list.h" #include "fe_utils/string_utils.h" #include "scripts_parallel.h" diff --git a/src/fe_utils/cancel.c b/src/fe_utils/cancel.c index 51fb67d384ad..70042017481a 100644 --- a/src/fe_utils/cancel.c +++ b/src/fe_utils/cancel.c @@ -18,8 +18,8 @@ #include +#include "common/connect.h" #include "fe_utils/cancel.h" -#include "fe_utils/connect.h" #include "fe_utils/string_utils.h" diff --git a/src/include/fe_utils/connect.h b/src/include/common/connect.h similarity index 96% rename from src/include/fe_utils/connect.h rename to src/include/common/connect.h index 8030af9a9f8b..2cc5d7dd251b 100644 --- a/src/include/fe_utils/connect.h +++ b/src/include/common/connect.h @@ -6,7 +6,7 @@ * 
Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * src/include/fe_utils/connect.h + * src/include/common/connect.h * *------------------------------------------------------------------------- */ diff --git a/src/tools/findoidjoins/findoidjoins.c b/src/tools/findoidjoins/findoidjoins.c index 5239332ea7ee..3d9ca2623576 100644 --- a/src/tools/findoidjoins/findoidjoins.c +++ b/src/tools/findoidjoins/findoidjoins.c @@ -10,7 +10,7 @@ #include "access/transam.h" #include "catalog/pg_class_d.h" -#include "fe_utils/connect.h" +#include "common/connect.h" #include "libpq-fe.h" #include "pqexpbuffer.h" From 11da97024abbe76b8c81e3f2375b2a62e9717c67 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Mon, 10 Aug 2020 09:22:54 -0700 Subject: [PATCH 279/334] Empty search_path in logical replication apply worker and walsender. This is like CVE-2018-1058 commit 582edc369cdbd348d68441fc50fa26a84afd0c1a. Today, a malicious user of a publisher or subscriber database can invoke arbitrary SQL functions under an identity running replication, often a superuser. This fix may cause "does not exist" or "no schema has been selected to create in" errors in a replication process. After upgrading, consider watching server logs for these errors. Objects accruing schema qualification in the wake of the earlier commit are unlikely to need further correction. Back-patch to v10, which introduced logical replication. 
Security: CVE-2020-14349 --- .../libpqwalreceiver/libpqwalreceiver.c | 17 +++++++++++++++++ src/backend/replication/logical/worker.c | 6 ++++++ src/test/subscription/t/001_rep_changes.pl | 4 ++++ 3 files changed, 27 insertions(+) diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index e9057230e40c..8afa5a29b484 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -21,6 +21,7 @@ #include "access/xlog.h" #include "catalog/pg_type.h" +#include "common/connect.h" #include "funcapi.h" #include "libpq-fe.h" #include "mb/pg_wchar.h" @@ -213,6 +214,22 @@ libpqrcv_connect(const char *conninfo, bool logical, const char *appname, return NULL; } + if (logical) + { + PGresult *res; + + res = libpqrcv_PQexec(conn->streamConn, + ALWAYS_SECURE_SEARCH_PATH_SQL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + ereport(ERROR, + (errmsg("could not clear search path: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + } + PQclear(res); + } + conn->logical = logical; return conn; diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 2fcf2e61bc3e..b576e342cb7d 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -2019,6 +2019,12 @@ ApplyWorkerMain(Datum main_arg) MyLogicalRepWorker->userid, 0); + /* + * Set always-secure search path, so malicious users can't redirect user + * code (e.g. pg_index.indexprs). + */ + SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE); + /* Load the subscription into persistent memory context. 
*/ ApplyContext = AllocSetContextCreate(TopMemoryContext, "ApplyContext", diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl index 3f8318fc7cc2..0680f44a1aa5 100644 --- a/src/test/subscription/t/001_rep_changes.pl +++ b/src/test/subscription/t/001_rep_changes.pl @@ -16,6 +16,10 @@ $node_subscriber->start; # Create some preexisting content on publisher +$node_publisher->safe_psql( + 'postgres', + "CREATE FUNCTION public.pg_get_replica_identity_index(int) + RETURNS regclass LANGUAGE sql AS 'SELECT 1/0'"); # shall not call $node_publisher->safe_psql('postgres', "CREATE TABLE tab_notrep AS SELECT generate_series(1,10) AS a"); $node_publisher->safe_psql('postgres', From cec57b1a0fbcd3833086ba686897c5883e0a2afc Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Mon, 10 Aug 2020 09:22:54 -0700 Subject: [PATCH 280/334] Document clashes between logical replication and untrusted users. Back-patch to v10, which introduced logical replication. Security: CVE-2020-14349 --- doc/src/sgml/logical-replication.sgml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/logical-replication.sgml b/doc/src/sgml/logical-replication.sgml index 7c8629d74efd..3f69b7192682 100644 --- a/doc/src/sgml/logical-replication.sgml +++ b/doc/src/sgml/logical-replication.sgml @@ -513,11 +513,27 @@ Security + + A user able to modify the schema of subscriber-side tables can execute + arbitrary code as a superuser. Limit ownership + and TRIGGER privilege on such tables to roles that + superusers trust. Moreover, if untrusted users can create tables, use only + publications that list tables explicitly. That is to say, create a + subscription FOR ALL TABLES only when superusers trust + every user permitted to create a non-temp table on the publisher or the + subscriber. + + The role used for the replication connection must have - the REPLICATION attribute (or be a superuser). 
Access for the role must be - configured in pg_hba.conf and it must have the - LOGIN attribute. + the REPLICATION attribute (or be a superuser). If the + role lacks SUPERUSER and BYPASSRLS, + publisher row security policies can execute. If the role does not trust + all table owners, include options=-crow_security=off in + the connection string; if a table owner then adds a row security policy, + that setting will cause replication to halt rather than execute the policy. + Access for the role must be configured in pg_hba.conf + and it must have the LOGIN attribute. From 1784f278a63866cc144fcd0a2127cadba6a2b7f8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 10 Aug 2020 18:51:31 +0200 Subject: [PATCH 281/334] Replace remaining StrNCpy() by strlcpy() They are equivalent, except that StrNCpy() zero-fills the entire destination buffer instead of providing just one trailing zero. For all but a tiny number of callers, that's just overhead rather than being desirable. Remove StrNCpy() as it is now unused. In some cases, namestrcpy() is the more appropriate function to use. While we're here, simplify the API of namestrcpy(): Remove the return value, don't check for NULL input. Nothing was using that anyway. Also, remove a few unused name-related functions. 
Reviewed-by: Tom Lane Discussion: https://www.postgresql.org/message-id/flat/44f5e198-36f6-6cdb-7fa9-60e34784daae%402ndquadrant.com --- contrib/pgcrypto/crypt-des.c | 2 +- src/backend/access/transam/slru.c | 2 +- src/backend/access/transam/xlogarchive.c | 2 +- src/backend/catalog/pg_constraint.c | 2 +- src/backend/commands/indexcmds.c | 2 +- src/backend/commands/statscmds.c | 2 +- src/backend/commands/tablecmds.c | 2 +- src/backend/postmaster/pgstat.c | 2 +- src/backend/replication/logical/logical.c | 11 ++++- src/backend/replication/slot.c | 2 +- src/backend/utils/adt/formatting.c | 8 ++-- src/backend/utils/adt/name.c | 48 ++----------------- src/backend/utils/adt/pg_locale.c | 9 ---- src/backend/utils/adt/ruleutils.c | 2 +- src/common/exec.c | 4 +- src/include/c.h | 29 ----------- src/include/utils/builtins.h | 3 +- src/interfaces/ecpg/pgtypeslib/dt_common.c | 4 +- src/interfaces/ecpg/test/pg_regress_ecpg.c | 2 +- .../ssl_passphrase_func.c | 2 +- 20 files changed, 34 insertions(+), 106 deletions(-) diff --git a/contrib/pgcrypto/crypt-des.c b/contrib/pgcrypto/crypt-des.c index 6efaa609c9d1..98c30ea122e3 100644 --- a/contrib/pgcrypto/crypt-des.c +++ b/contrib/pgcrypto/crypt-des.c @@ -720,7 +720,7 @@ px_crypt_des(const char *key, const char *setting) if (des_setkey((char *) keybuf)) return NULL; } - StrNCpy(output, setting, 10); + strlcpy(output, setting, 10); /* * Double check that we weren't given a short setting. 
If we were, the diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 9e145f1c36ac..d1dbb43e096c 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -252,7 +252,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, */ ctl->shared = shared; ctl->do_fsync = true; /* default behavior */ - StrNCpy(ctl->Dir, subdir, sizeof(ctl->Dir)); + strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); } /* diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index cdd586fcfbae..8f8734dc1d4e 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -323,7 +323,7 @@ ExecuteRecoveryCommand(const char *command, const char *commandName, bool failOn case 'r': /* %r: filename of last restartpoint */ sp++; - StrNCpy(dp, lastRestartPointFname, endp - dp); + strlcpy(dp, lastRestartPointFname, endp - dp); dp += strlen(dp); break; case '%': diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index fdc63e7dea16..6a6b2cb8c0c8 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -484,7 +484,7 @@ ChooseConstraintName(const char *name1, const char *name2, conDesc = table_open(ConstraintRelationId, AccessShareLock); /* try the unmodified label first */ - StrNCpy(modlabel, label, sizeof(modlabel)); + strlcpy(modlabel, label, sizeof(modlabel)); for (;;) { diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 2baca12c5f47..7819266a6306 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -2246,7 +2246,7 @@ ChooseRelationName(const char *name1, const char *name2, char modlabel[NAMEDATALEN]; /* try the unmodified label first */ - StrNCpy(modlabel, label, sizeof(modlabel)); + strlcpy(modlabel, label, sizeof(modlabel)); for (;;) { diff --git a/src/backend/commands/statscmds.c 
b/src/backend/commands/statscmds.c index 974828545ca9..3057d89d50c0 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -681,7 +681,7 @@ ChooseExtendedStatisticName(const char *name1, const char *name2, char modlabel[NAMEDATALEN]; /* try the unmodified label first */ - StrNCpy(modlabel, label, sizeof(modlabel)); + strlcpy(modlabel, label, sizeof(modlabel)); for (;;) { diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index ac53f79ada2a..cd989c95e517 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -606,7 +606,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, * Truncate relname to appropriate length (probably a waste of time, as * parser should have done this already). */ - StrNCpy(relname, stmt->relation->relname, NAMEDATALEN); + strlcpy(relname, stmt->relation->relname, NAMEDATALEN); /* * Check consistency of arguments diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 15f92b66c6ba..73ce944fb1ce 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4367,7 +4367,7 @@ pgstat_send_archiver(const char *xlog, bool failed) */ pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ARCHIVER); msg.m_failed = failed; - StrNCpy(msg.m_xlog, xlog, sizeof(msg.m_xlog)); + strlcpy(msg.m_xlog, xlog, sizeof(msg.m_xlog)); msg.m_timestamp = GetCurrentTimestamp(); pgstat_send(&msg, sizeof(msg)); } diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index f5eb6bc3aff2..57c5b513ccf8 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -39,6 +39,7 @@ #include "replication/snapbuild.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "utils/builtins.h" #include "utils/memutils.h" /* data for errcontext callback */ @@ -288,6 +289,7 @@ CreateInitDecodingContext(const char *plugin, { TransactionId 
xmin_horizon = InvalidTransactionId; ReplicationSlot *slot; + NameData plugin_name; LogicalDecodingContext *ctx; MemoryContext old_context; @@ -319,9 +321,14 @@ CreateInitDecodingContext(const char *plugin, (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), errmsg("cannot create logical replication slot in transaction that has performed writes"))); - /* register output plugin name with slot */ + /* + * Register output plugin name with slot. We need the mutex to avoid + * concurrent reading of a partially copied string. But we don't want any + * complicated code while holding a spinlock, so do namestrcpy() outside. + */ + namestrcpy(&plugin_name, plugin); SpinLockAcquire(&slot->mutex); - StrNCpy(NameStr(slot->data.plugin), plugin, NAMEDATALEN); + slot->data.plugin = plugin_name; SpinLockRelease(&slot->mutex); if (XLogRecPtrIsInvalid(restart_lsn)) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 57bbb6288c68..3dc01b6df22a 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -275,7 +275,7 @@ ReplicationSlotCreate(const char *name, bool db_specific, /* first initialize persistent data */ memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData)); - StrNCpy(NameStr(slot->data.name), name, NAMEDATALEN); + namestrcpy(&slot->data.name, name); slot->data.database = db_specific ? 
MyDatabaseId : InvalidOid; slot->data.persistency = persistency; diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 662643813660..9de63686ecb5 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -3890,7 +3890,7 @@ DCH_cache_getnew(const char *str, bool std) elog(DEBUG_elog_output, "OLD: '%s' AGE: %d", old->str, old->age); #endif old->valid = false; - StrNCpy(old->str, str, DCH_CACHE_SIZE + 1); + strlcpy(old->str, str, DCH_CACHE_SIZE + 1); old->age = (++DCHCounter); /* caller is expected to fill format, then set valid */ return old; @@ -3904,7 +3904,7 @@ DCH_cache_getnew(const char *str, bool std) DCHCache[n_DCHCache] = ent = (DCHCacheEntry *) MemoryContextAllocZero(TopMemoryContext, sizeof(DCHCacheEntry)); ent->valid = false; - StrNCpy(ent->str, str, DCH_CACHE_SIZE + 1); + strlcpy(ent->str, str, DCH_CACHE_SIZE + 1); ent->std = std; ent->age = (++DCHCounter); /* caller is expected to fill format, then set valid */ @@ -4799,7 +4799,7 @@ NUM_cache_getnew(const char *str) elog(DEBUG_elog_output, "OLD: \"%s\" AGE: %d", old->str, old->age); #endif old->valid = false; - StrNCpy(old->str, str, NUM_CACHE_SIZE + 1); + strlcpy(old->str, str, NUM_CACHE_SIZE + 1); old->age = (++NUMCounter); /* caller is expected to fill format and Num, then set valid */ return old; @@ -4813,7 +4813,7 @@ NUM_cache_getnew(const char *str) NUMCache[n_NUMCache] = ent = (NUMCacheEntry *) MemoryContextAllocZero(TopMemoryContext, sizeof(NUMCacheEntry)); ent->valid = false; - StrNCpy(ent->str, str, NUM_CACHE_SIZE + 1); + strlcpy(ent->str, str, NUM_CACHE_SIZE + 1); ent->age = (++NUMCounter); /* caller is expected to fill format and Num, then set valid */ ++n_NUMCache; diff --git a/src/backend/utils/adt/name.c b/src/backend/utils/adt/name.c index 64877f67e010..a3ce3f3d1e18 100644 --- a/src/backend/utils/adt/name.c +++ b/src/backend/utils/adt/name.c @@ -229,53 +229,13 @@ btnamesortsupport(PG_FUNCTION_ARGS) * MISCELLANEOUS 
PUBLIC ROUTINES * *****************************************************************************/ -int -namecpy(Name n1, const NameData *n2) -{ - if (!n1 || !n2) - return -1; - StrNCpy(NameStr(*n1), NameStr(*n2), NAMEDATALEN); - return 0; -} - -#ifdef NOT_USED -int -namecat(Name n1, Name n2) -{ - return namestrcat(n1, NameStr(*n2)); /* n2 can't be any longer than n1 */ -} -#endif - -int +void namestrcpy(Name name, const char *str) { - if (!name || !str) - return -1; - StrNCpy(NameStr(*name), str, NAMEDATALEN); - return 0; -} - -#ifdef NOT_USED -int -namestrcat(Name name, const char *str) -{ - int i; - char *p, - *q; - - if (!name || !str) - return -1; - for (i = 0, p = NameStr(*name); i < NAMEDATALEN && *p; ++i, ++p) - ; - for (q = str; i < NAMEDATALEN; ++i, ++p, ++q) - { - *p = *q; - if (!*q) - break; - } - return 0; + /* NB: We need to zero-pad the destination. */ + strncpy(NameStr(*name), str, NAMEDATALEN); + NameStr(*name)[NAMEDATALEN-1] = '\0'; } -#endif /* * Compare a NAME to a C string diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 11d05c73accc..07299dbc0911 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -75,16 +75,7 @@ #endif #ifdef WIN32 -/* - * This Windows file defines StrNCpy. We don't need it here, so we undefine - * it to keep the compiler quiet, and undefine it again after the file is - * included, so we don't accidentally use theirs. 
- */ -#undef StrNCpy #include -#ifdef StrNCpy -#undef StrNCpy -#endif #endif #define MAX_L10N_DATA 80 diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 2cbcb4b85e3b..60dd80c23c87 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -2489,7 +2489,7 @@ pg_get_userbyid(PG_FUNCTION_ARGS) if (HeapTupleIsValid(roletup)) { role_rec = (Form_pg_authid) GETSTRUCT(roletup); - StrNCpy(NameStr(*result), NameStr(role_rec->rolname), NAMEDATALEN); + *result = role_rec->rolname; ReleaseSysCache(roletup); } else diff --git a/src/common/exec.c b/src/common/exec.c index f39b0a294bf5..78bb486f999a 100644 --- a/src/common/exec.c +++ b/src/common/exec.c @@ -144,7 +144,7 @@ find_my_exec(const char *argv0, char *retpath) if (first_dir_separator(argv0) != NULL) { if (is_absolute_path(argv0)) - StrNCpy(retpath, argv0, MAXPGPATH); + strlcpy(retpath, argv0, MAXPGPATH); else join_path_components(retpath, cwd, argv0); canonicalize_path(retpath); @@ -184,7 +184,7 @@ find_my_exec(const char *argv0, char *retpath) if (!endp) endp = startp + strlen(startp); /* point to end */ - StrNCpy(test_path, startp, Min(endp - startp + 1, MAXPGPATH)); + strlcpy(test_path, startp, Min(endp - startp + 1, MAXPGPATH)); if (is_absolute_path(test_path)) join_path_components(retpath, test_path, argv0); diff --git a/src/include/c.h b/src/include/c.h index f242e32edbe7..2c61ca8aa894 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -932,35 +932,6 @@ extern void ExceptionalCondition(const char *conditionName, */ #define Abs(x) ((x) >= 0 ? (x) : -(x)) -/* - * StrNCpy - * Like standard library function strncpy(), except that result string - * is guaranteed to be null-terminated --- that is, at most N-1 bytes - * of the source string will be kept. - * Also, the macro returns no result (too hard to do that without - * evaluating the arguments multiple times, which seems worse). 
- * - * BTW: when you need to copy a non-null-terminated string (like a text - * datum) and add a null, do not do it with StrNCpy(..., len+1). That - * might seem to work, but it fetches one byte more than there is in the - * text object. One fine day you'll have a SIGSEGV because there isn't - * another byte before the end of memory. Don't laugh, we've had real - * live bug reports from real live users over exactly this mistake. - * Do it honestly with "memcpy(dst,src,len); dst[len] = '\0';", instead. - */ -#define StrNCpy(dst,src,len) \ - do \ - { \ - char * _dst = (dst); \ - Size _len = (len); \ -\ - if (_len > 0) \ - { \ - strncpy(_dst, (src), _len); \ - _dst[_len-1] = '\0'; \ - } \ - } while (0) - /* Get a bit mask of the bits set in non-long aligned addresses */ #define LONG_ALIGN_MASK (sizeof(long) - 1) diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 3ca5e938f8f8..4db5ad3f12e5 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -39,8 +39,7 @@ extern uint64 hex_decode(const char *src, size_t len, char *dst); extern int2vector *buildint2vector(const int16 *int2s, int n); /* name.c */ -extern int namecpy(Name n1, const NameData *n2); -extern int namestrcpy(Name name, const char *str); +extern void namestrcpy(Name name, const char *str); extern int namestrcmp(Name name, const char *str); /* numutils.c */ diff --git a/src/interfaces/ecpg/pgtypeslib/dt_common.c b/src/interfaces/ecpg/pgtypeslib/dt_common.c index 14cdf2d428b5..e8a8a0f0ed3e 100644 --- a/src/interfaces/ecpg/pgtypeslib/dt_common.c +++ b/src/interfaces/ecpg/pgtypeslib/dt_common.c @@ -1015,7 +1015,7 @@ abstime2tm(AbsoluteTime _time, int *tzp, struct tm *tm, char **tzn) * Copy no more than MAXTZLEN bytes of timezone to tzn, in case it * contains an error message, which doesn't fit in the buffer */ - StrNCpy(*tzn, tm->tm_zone, MAXTZLEN + 1); + strlcpy(*tzn, tm->tm_zone, MAXTZLEN + 1); if (strlen(tm->tm_zone) > MAXTZLEN) tm->tm_isdst = -1; } @@ 
-1033,7 +1033,7 @@ abstime2tm(AbsoluteTime _time, int *tzp, struct tm *tm, char **tzn) * Copy no more than MAXTZLEN bytes of timezone to tzn, in case it * contains an error message, which doesn't fit in the buffer */ - StrNCpy(*tzn, TZNAME_GLOBAL[tm->tm_isdst], MAXTZLEN + 1); + strlcpy(*tzn, TZNAME_GLOBAL[tm->tm_isdst], MAXTZLEN + 1); if (strlen(TZNAME_GLOBAL[tm->tm_isdst]) > MAXTZLEN) tm->tm_isdst = -1; } diff --git a/src/interfaces/ecpg/test/pg_regress_ecpg.c b/src/interfaces/ecpg/test/pg_regress_ecpg.c index 956a599fcbbc..46b9e78fe59d 100644 --- a/src/interfaces/ecpg/test/pg_regress_ecpg.c +++ b/src/interfaces/ecpg/test/pg_regress_ecpg.c @@ -63,7 +63,7 @@ ecpg_filter(const char *sourcefile, const char *outfile) if (plen > 1) { n = (char *) malloc(plen); - StrNCpy(n, p + 1, plen); + strlcpy(n, p + 1, plen); replace_string(linebuf, n, ""); } } diff --git a/src/test/modules/ssl_passphrase_callback/ssl_passphrase_func.c b/src/test/modules/ssl_passphrase_callback/ssl_passphrase_func.c index 563ff144cc10..6b0a3db104c2 100644 --- a/src/test/modules/ssl_passphrase_callback/ssl_passphrase_func.c +++ b/src/test/modules/ssl_passphrase_callback/ssl_passphrase_func.c @@ -74,7 +74,7 @@ rot13_passphrase(char *buf, int size, int rwflag, void *userdata) { Assert(ssl_passphrase != NULL); - StrNCpy(buf, ssl_passphrase, size); + strlcpy(buf, ssl_passphrase, size); for (char *p = buf; *p; p++) { char c = *p; From 1f75b454134cce6a67a9bcdb01b5c018221dd359 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 11 Aug 2020 14:37:38 +0900 Subject: [PATCH 282/334] Improve tab completion of REINDEX in psql This allows the tab completion of REINDEX to handle an optional parenthesized list of options. This case is more complicated than VACUUM or ANALYZE because of CONCURRENTLY and the different object types to consider with the reindex. 
Author: Justin Pryzby Reviewed-by: Alexey Kondratov, Michael Paquier Discussion: https://postgr.es/m/20200403182712.GR14618@telsasoft.com --- src/bin/psql/tab-complete.c | 38 ++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index c4af40bfa9fa..f41785f11c12 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -3430,28 +3430,48 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH("DATA"); /* REINDEX */ - else if (Matches("REINDEX")) + else if (Matches("REINDEX") || + Matches("REINDEX", "(*)")) COMPLETE_WITH("TABLE", "INDEX", "SYSTEM", "SCHEMA", "DATABASE"); - else if (Matches("REINDEX", "TABLE")) + else if (Matches("REINDEX", "TABLE") || + Matches("REINDEX", "(*)", "TABLE")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexables, " UNION SELECT 'CONCURRENTLY'"); - else if (Matches("REINDEX", "INDEX")) + else if (Matches("REINDEX", "INDEX") || + Matches("REINDEX", "(*)", "INDEX")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexes, " UNION SELECT 'CONCURRENTLY'"); - else if (Matches("REINDEX", "SCHEMA")) + else if (Matches("REINDEX", "SCHEMA") || + Matches("REINDEX", "(*)", "SCHEMA")) COMPLETE_WITH_QUERY(Query_for_list_of_schemas " UNION SELECT 'CONCURRENTLY'"); - else if (Matches("REINDEX", "SYSTEM|DATABASE")) + else if (Matches("REINDEX", "SYSTEM|DATABASE") || + Matches("REINDEX", "(*)", "SYSTEM|DATABASE")) COMPLETE_WITH_QUERY(Query_for_list_of_databases " UNION SELECT 'CONCURRENTLY'"); - else if (Matches("REINDEX", "TABLE", "CONCURRENTLY")) + else if (Matches("REINDEX", "TABLE", "CONCURRENTLY") || + Matches("REINDEX", "(*)", "TABLE", "CONCURRENTLY")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexables, NULL); - else if (Matches("REINDEX", "INDEX", "CONCURRENTLY")) + else if (Matches("REINDEX", "INDEX", "CONCURRENTLY") || + Matches("REINDEX", "(*)", "INDEX", "CONCURRENTLY")) 
COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexes, NULL); - else if (Matches("REINDEX", "SCHEMA", "CONCURRENTLY")) + else if (Matches("REINDEX", "SCHEMA", "CONCURRENTLY") || + Matches("REINDEX", "(*)", "SCHEMA", "CONCURRENTLY")) COMPLETE_WITH_QUERY(Query_for_list_of_schemas); - else if (Matches("REINDEX", "SYSTEM|DATABASE", "CONCURRENTLY")) + else if (Matches("REINDEX", "SYSTEM|DATABASE", "CONCURRENTLY") || + Matches("REINDEX", "(*)", "SYSTEM|DATABASE", "CONCURRENTLY")) COMPLETE_WITH_QUERY(Query_for_list_of_databases); + else if (HeadMatches("REINDEX", "(*") && + !HeadMatches("REINDEX", "(*)")) + { + /* + * This fires if we're in an unfinished parenthesized option list. + * get_previous_words treats a completed parenthesized option list as + * one word, so the above test is correct. + */ + if (ends_with(prev_wd, '(') || ends_with(prev_wd, ',')) + COMPLETE_WITH("VERBOSE"); + } /* SECURITY LABEL */ else if (Matches("SECURITY")) From fea10a64340e529805609126740a540c8f9daab4 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 11 Aug 2020 11:25:23 -0700 Subject: [PATCH 283/334] Rename VariableCacheData.nextFullXid to nextXid. Including Full in variable names duplicates the type information and leads to overly long names. As FullTransactionId cannot accidentally be casted to TransactionId that does not seem necessary. 
Author: Andres Freund Discussion: https://postgr.es/m/20200724011143.jccsyvsvymuiqfxu@alap3.anarazel.de --- src/backend/access/gist/gistxlog.c | 6 ++-- src/backend/access/rmgrdesc/xlogdesc.c | 4 +-- src/backend/access/transam/clog.c | 8 ++--- src/backend/access/transam/commit_ts.c | 4 +-- src/backend/access/transam/multixact.c | 2 +- src/backend/access/transam/subtrans.c | 10 +++--- src/backend/access/transam/twophase.c | 22 ++++++------ src/backend/access/transam/varsup.c | 26 +++++++------- src/backend/access/transam/xact.c | 4 +-- src/backend/access/transam/xlog.c | 48 ++++++++++++------------- src/backend/access/transam/xlogreader.c | 4 +-- src/backend/storage/ipc/procarray.c | 14 ++++---- src/backend/storage/ipc/standby.c | 2 +- src/backend/storage/lmgr/predicate.c | 2 +- src/backend/utils/misc/pg_controldata.c | 4 +-- src/bin/pg_controldata/pg_controldata.c | 4 +-- src/bin/pg_resetwal/pg_resetwal.c | 18 +++++----- src/include/access/transam.h | 4 +-- src/include/catalog/pg_control.h | 2 +- src/include/storage/standby.h | 2 +- src/include/storage/standbydefs.h | 2 +- 21 files changed, 96 insertions(+), 96 deletions(-) diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 3f0effd5e429..7b5d1e98b70b 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -396,7 +396,7 @@ gistRedoPageReuse(XLogReaderState *record) if (InHotStandby) { FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid; - FullTransactionId nextFullXid = ReadNextFullTransactionId(); + FullTransactionId nextXid = ReadNextFullTransactionId(); uint64 diff; /* @@ -405,8 +405,8 @@ gistRedoPageReuse(XLogReaderState *record) * logged value is very old, so that XID wrap-around already happened * on it, there can't be any snapshots that still see it. 
*/ - nextFullXid = ReadNextFullTransactionId(); - diff = U64FromFullTransactionId(nextFullXid) - + nextXid = ReadNextFullTransactionId(); + diff = U64FromFullTransactionId(nextXid) - U64FromFullTransactionId(latestRemovedFullXid); if (diff < MaxTransactionId / 2) { diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 1cd97852e8f3..3200f777f5a3 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -53,8 +53,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->ThisTimeLineID, checkpoint->PrevTimeLineID, checkpoint->fullPageWrites ? "true" : "false", - EpochFromFullTransactionId(checkpoint->nextFullXid), - XidFromFullTransactionId(checkpoint->nextFullXid), + EpochFromFullTransactionId(checkpoint->nextXid), + XidFromFullTransactionId(checkpoint->nextXid), checkpoint->nextOid, checkpoint->nextMulti, checkpoint->nextMultiOffset, diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index f3da40ae017f..dd2f4d5bc7e7 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -742,12 +742,12 @@ ZeroCLOGPage(int pageno, bool writeXlog) /* * This must be called ONCE during postmaster or standalone-backend startup, - * after StartupXLOG has initialized ShmemVariableCache->nextFullXid. + * after StartupXLOG has initialized ShmemVariableCache->nextXid. 
*/ void StartupCLOG(void) { - TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); int pageno = TransactionIdToPage(xid); LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); @@ -766,7 +766,7 @@ StartupCLOG(void) void TrimCLOG(void) { - TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); int pageno = TransactionIdToPage(xid); LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); @@ -785,7 +785,7 @@ TrimCLOG(void) * but makes no WAL entry). Let's just be safe. (We need not worry about * pages beyond the current one, since those will be zeroed when first * used. For the same reason, there is no need to do anything when - * nextFullXid is exactly at a page boundary; and it's likely that the + * nextXid is exactly at a page boundary; and it's likely that the * "current" page doesn't exist yet in that case.) */ if (TransactionIdToPgIndex(xid) != 0) diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 903280ae92d0..5244b06a2b65 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -614,7 +614,7 @@ ZeroCommitTsPage(int pageno, bool writeXlog) /* * This must be called ONCE during postmaster or standalone-backend startup, - * after StartupXLOG has initialized ShmemVariableCache->nextFullXid. + * after StartupXLOG has initialized ShmemVariableCache->nextXid. 
*/ void StartupCommitTs(void) @@ -704,7 +704,7 @@ ActivateCommitTs(void) } LWLockRelease(CommitTsLock); - xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); pageno = TransactionIdToCTsPage(xid); /* diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 475f5ed86110..b8bedca04a4d 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -3265,7 +3265,7 @@ multixact_redo(XLogReaderState *record) xlrec->moff + xlrec->nmembers); /* - * Make sure nextFullXid is beyond any XID mentioned in the record. + * Make sure nextXid is beyond any XID mentioned in the record. * This should be unnecessary, since any XID found here ought to have * other evidence in the XLOG, but let's be safe. */ diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index f33ae407a609..a087a5554210 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -241,15 +241,15 @@ ZeroSUBTRANSPage(int pageno) /* * This must be called ONCE during postmaster or standalone-backend startup, - * after StartupXLOG has initialized ShmemVariableCache->nextFullXid. + * after StartupXLOG has initialized ShmemVariableCache->nextXid. * - * oldestActiveXID is the oldest XID of any prepared transaction, or nextFullXid + * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid * if there are none. 
*/ void StartupSUBTRANS(TransactionId oldestActiveXID) { - FullTransactionId nextFullXid; + FullTransactionId nextXid; int startPage; int endPage; @@ -262,8 +262,8 @@ StartupSUBTRANS(TransactionId oldestActiveXID) LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); startPage = TransactionIdToPage(oldestActiveXID); - nextFullXid = ShmemVariableCache->nextFullXid; - endPage = TransactionIdToPage(XidFromFullTransactionId(nextFullXid)); + nextXid = ShmemVariableCache->nextXid; + endPage = TransactionIdToPage(XidFromFullTransactionId(nextXid)); while (startPage != endPage) { diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 9b2e59bf0ec1..31f135f5cedc 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1813,16 +1813,16 @@ restoreTwoPhaseData(void) * * Scan the shared memory entries of TwoPhaseState and determine the range * of valid XIDs present. This is run during database startup, after we - * have completed reading WAL. ShmemVariableCache->nextFullXid has been set to + * have completed reading WAL. ShmemVariableCache->nextXid has been set to * one more than the highest XID for which evidence exists in WAL. * - * We throw away any prepared xacts with main XID beyond nextFullXid --- if any + * We throw away any prepared xacts with main XID beyond nextXid --- if any * are present, it suggests that the DBA has done a PITR recovery to an * earlier point in time without cleaning out pg_twophase. We dare not * try to recover such prepared xacts since they likely depend on database * state that doesn't exist now. * - * However, we will advance nextFullXid beyond any subxact XIDs belonging to + * However, we will advance nextXid beyond any subxact XIDs belonging to * valid prepared xacts. We need to do this since subxact commit doesn't * write a WAL entry, and so there might be no evidence in WAL of those * subxact XIDs. 
@@ -1832,7 +1832,7 @@ restoreTwoPhaseData(void) * backup should be rolled in. * * Our other responsibility is to determine and return the oldest valid XID - * among the prepared xacts (if none, return ShmemVariableCache->nextFullXid). + * among the prepared xacts (if none, return ShmemVariableCache->nextXid). * This is needed to synchronize pg_subtrans startup properly. * * If xids_p and nxids_p are not NULL, pointer to a palloc'd array of all @@ -1842,8 +1842,8 @@ restoreTwoPhaseData(void) TransactionId PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) { - FullTransactionId nextFullXid = ShmemVariableCache->nextFullXid; - TransactionId origNextXid = XidFromFullTransactionId(nextFullXid); + FullTransactionId nextXid = ShmemVariableCache->nextXid; + TransactionId origNextXid = XidFromFullTransactionId(nextXid); TransactionId result = origNextXid; TransactionId *xids = NULL; int nxids = 0; @@ -2059,7 +2059,7 @@ RecoverPreparedTransactions(void) * * If setParent is true, set up subtransaction parent linkages. * - * If setNextXid is true, set ShmemVariableCache->nextFullXid to the newest + * If setNextXid is true, set ShmemVariableCache->nextXid to the newest * value scanned. */ static char * @@ -2068,8 +2068,8 @@ ProcessTwoPhaseBuffer(TransactionId xid, bool fromdisk, bool setParent, bool setNextXid) { - FullTransactionId nextFullXid = ShmemVariableCache->nextFullXid; - TransactionId origNextXid = XidFromFullTransactionId(nextFullXid); + FullTransactionId nextXid = ShmemVariableCache->nextXid; + TransactionId origNextXid = XidFromFullTransactionId(nextXid); TransactionId *subxids; char *buf; TwoPhaseFileHeader *hdr; @@ -2149,7 +2149,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, /* * Examine subtransaction XIDs ... they should all follow main XID, and - * they may force us to advance nextFullXid. + * they may force us to advance nextXid. 
*/ subxids = (TransactionId *) (buf + MAXALIGN(sizeof(TwoPhaseFileHeader)) + @@ -2160,7 +2160,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, Assert(TransactionIdFollows(subxid, xid)); - /* update nextFullXid if needed */ + /* update nextXid if needed */ if (setNextXid) AdvanceNextFullTransactionIdPastXid(subxid); diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 0142bc70f6a6..3ebd75118f06 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -75,7 +75,7 @@ GetNewTransactionId(bool isSubXact) LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - full_xid = ShmemVariableCache->nextFullXid; + full_xid = ShmemVariableCache->nextXid; xid = XidFromFullTransactionId(full_xid); /*---------- @@ -159,7 +159,7 @@ GetNewTransactionId(bool isSubXact) /* Re-acquire lock and start over */ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - full_xid = ShmemVariableCache->nextFullXid; + full_xid = ShmemVariableCache->nextXid; xid = XidFromFullTransactionId(full_xid); } @@ -177,12 +177,12 @@ GetNewTransactionId(bool isSubXact) ExtendSUBTRANS(xid); /* - * Now advance the nextFullXid counter. This must not happen until after + * Now advance the nextXid counter. This must not happen until after * we have successfully completed ExtendCLOG() --- if that routine fails, * we want the next incoming transaction to try it again. We cannot * assign more XIDs until there is CLOG space for them. */ - FullTransactionIdAdvance(&ShmemVariableCache->nextFullXid); + FullTransactionIdAdvance(&ShmemVariableCache->nextXid); /* * We must store the new XID into the shared ProcArray before releasing @@ -240,7 +240,7 @@ GetNewTransactionId(bool isSubXact) } /* - * Read nextFullXid but don't allocate it. + * Read nextXid but don't allocate it. 
*/ FullTransactionId ReadNextFullTransactionId(void) @@ -248,14 +248,14 @@ ReadNextFullTransactionId(void) FullTransactionId fullXid; LWLockAcquire(XidGenLock, LW_SHARED); - fullXid = ShmemVariableCache->nextFullXid; + fullXid = ShmemVariableCache->nextXid; LWLockRelease(XidGenLock); return fullXid; } /* - * Advance nextFullXid to the value after a given xid. The epoch is inferred. + * Advance nextXid to the value after a given xid. The epoch is inferred. * This must only be called during recovery or from two-phase start-up code. */ void @@ -266,14 +266,14 @@ AdvanceNextFullTransactionIdPastXid(TransactionId xid) uint32 epoch; /* - * It is safe to read nextFullXid without a lock, because this is only + * It is safe to read nextXid without a lock, because this is only * called from the startup process or single-process mode, meaning that no * other process can modify it. */ Assert(AmStartupProcess() || !IsUnderPostmaster); /* Fast return if this isn't an xid high enough to move the needle. */ - next_xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); if (!TransactionIdFollowsOrEquals(xid, next_xid)) return; @@ -286,7 +286,7 @@ AdvanceNextFullTransactionIdPastXid(TransactionId xid) * point in the WAL stream. */ TransactionIdAdvance(xid); - epoch = EpochFromFullTransactionId(ShmemVariableCache->nextFullXid); + epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid); if (unlikely(xid < next_xid)) ++epoch; newNextFullXid = FullTransactionIdFromEpochAndXid(epoch, xid); @@ -296,7 +296,7 @@ AdvanceNextFullTransactionIdPastXid(TransactionId xid) * concurrent readers. 
*/ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - ShmemVariableCache->nextFullXid = newNextFullXid; + ShmemVariableCache->nextXid = newNextFullXid; LWLockRelease(XidGenLock); } @@ -404,7 +404,7 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) ShmemVariableCache->xidStopLimit = xidStopLimit; ShmemVariableCache->xidWrapLimit = xidWrapLimit; ShmemVariableCache->oldestXidDB = oldest_datoid; - curXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + curXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); LWLockRelease(XidGenLock); /* Log the info */ @@ -480,7 +480,7 @@ ForceTransactionIdLimitUpdate(void) /* Locking is probably not really necessary, but let's be careful */ LWLockAcquire(XidGenLock, LW_SHARED); - nextXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); xidVacLimit = ShmemVariableCache->xidVacLimit; oldestXid = ShmemVariableCache->oldestXid; oldestXidDB = ShmemVariableCache->oldestXidDB; diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 727d61603593..7ccb7d68ed9a 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -5791,7 +5791,7 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); - /* Make sure nextFullXid is beyond any XID mentioned in the record. */ + /* Make sure nextXid is beyond any XID mentioned in the record. */ AdvanceNextFullTransactionIdPastXid(max_xid); Assert(((parsed->xinfo & XACT_XINFO_HAS_ORIGIN) == 0) == @@ -5931,7 +5931,7 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid) Assert(TransactionIdIsValid(xid)); - /* Make sure nextFullXid is beyond any XID mentioned in the record. */ + /* Make sure nextXid is beyond any XID mentioned in the record. 
*/ max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 756b838e6a54..53945c0e305d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -601,7 +601,7 @@ typedef struct XLogCtlData /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */ - FullTransactionId ckptFullXid; /* nextFullXid of latest checkpoint */ + FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */ XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */ @@ -5239,7 +5239,7 @@ BootStrapXLOG(void) checkPoint.ThisTimeLineID = ThisTimeLineID; checkPoint.PrevTimeLineID = ThisTimeLineID; checkPoint.fullPageWrites = fullPageWrites; - checkPoint.nextFullXid = + checkPoint.nextXid = FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstBootstrapObjectId; checkPoint.nextMulti = FirstMultiXactId; @@ -5253,7 +5253,7 @@ BootStrapXLOG(void) checkPoint.time = (pg_time_t) time(NULL); checkPoint.oldestActiveXid = InvalidTransactionId; - ShmemVariableCache->nextFullXid = checkPoint.nextFullXid; + ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); @@ -6741,7 +6741,7 @@ StartupXLOG(void) wasShutdown ? 
"true" : "false"))); ereport(DEBUG1, (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", - U64FromFullTransactionId(checkPoint.nextFullXid), + U64FromFullTransactionId(checkPoint.nextXid), checkPoint.nextOid))); ereport(DEBUG1, (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", @@ -6756,12 +6756,12 @@ StartupXLOG(void) (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u", checkPoint.oldestCommitTsXid, checkPoint.newestCommitTsXid))); - if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextFullXid))) + if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid))) ereport(PANIC, (errmsg("invalid next transaction ID"))); /* initialize shared memory variables from the checkpoint record */ - ShmemVariableCache->nextFullXid = checkPoint.nextFullXid; + ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); @@ -6770,7 +6770,7 @@ StartupXLOG(void) SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); SetCommitTsLimit(checkPoint.oldestCommitTsXid, checkPoint.newestCommitTsXid); - XLogCtl->ckptFullXid = checkPoint.nextFullXid; + XLogCtl->ckptFullXid = checkPoint.nextXid; /* * Initialize replication slots, before there's a chance to remove @@ -7051,7 +7051,7 @@ StartupXLOG(void) Assert(TransactionIdIsValid(oldestActiveXID)); /* Tell procarray about the range of xids it has to deal with */ - ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextFullXid)); + ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid)); /* * Startup commit log and subtrans only. 
MultiXact and commit @@ -7081,9 +7081,9 @@ StartupXLOG(void) running.xcnt = nxids; running.subxcnt = 0; running.subxid_overflow = false; - running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid); + running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); running.oldestRunningXid = oldestActiveXID; - latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid); + latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); TransactionIdRetreat(latestCompletedXid); Assert(TransactionIdIsNormal(latestCompletedXid)); running.latestCompletedXid = latestCompletedXid; @@ -7254,7 +7254,7 @@ StartupXLOG(void) error_context_stack = &errcallback; /* - * ShmemVariableCache->nextFullXid must be beyond record's + * ShmemVariableCache->nextXid must be beyond record's * xid. */ AdvanceNextFullTransactionIdPastXid(record->xl_xid); @@ -7865,7 +7865,7 @@ StartupXLOG(void) /* also initialize latestCompletedXid, to nextXid - 1 */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); TransactionIdRetreat(ShmemVariableCache->latestCompletedXid); LWLockRelease(ProcArrayLock); @@ -8897,7 +8897,7 @@ CreateCheckPoint(int flags) * there. 
*/ LWLockAcquire(XidGenLock, LW_SHARED); - checkPoint.nextFullXid = ShmemVariableCache->nextFullXid; + checkPoint.nextXid = ShmemVariableCache->nextXid; checkPoint.oldestXid = ShmemVariableCache->oldestXid; checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB; LWLockRelease(XidGenLock); @@ -9050,7 +9050,7 @@ CreateCheckPoint(int flags) /* Update shared-memory copy of checkpoint XID/epoch */ SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextFullXid; + XLogCtl->ckptFullXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); /* @@ -9926,7 +9926,7 @@ xlog_redo(XLogReaderState *record) memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In a SHUTDOWN checkpoint, believe the counters exactly */ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - ShmemVariableCache->nextFullXid = checkPoint.nextFullXid; + ShmemVariableCache->nextXid = checkPoint.nextXid; LWLockRelease(XidGenLock); LWLockAcquire(OidGenLock, LW_EXCLUSIVE); ShmemVariableCache->nextOid = checkPoint.nextOid; @@ -9980,9 +9980,9 @@ xlog_redo(XLogReaderState *record) running.xcnt = nxids; running.subxcnt = 0; running.subxid_overflow = false; - running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid); + running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); running.oldestRunningXid = oldestActiveXID; - latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid); + latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); TransactionIdRetreat(latestCompletedXid); Assert(TransactionIdIsNormal(latestCompletedXid)); running.latestCompletedXid = latestCompletedXid; @@ -9995,12 +9995,12 @@ xlog_redo(XLogReaderState *record) /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid; + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); /* Update shared-memory copy of checkpoint 
XID/epoch */ SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextFullXid; + XLogCtl->ckptFullXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); /* @@ -10021,9 +10021,9 @@ xlog_redo(XLogReaderState *record) memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In an ONLINE checkpoint, treat the XID counter as a minimum */ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - if (FullTransactionIdPrecedes(ShmemVariableCache->nextFullXid, - checkPoint.nextFullXid)) - ShmemVariableCache->nextFullXid = checkPoint.nextFullXid; + if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid, + checkPoint.nextXid)) + ShmemVariableCache->nextXid = checkPoint.nextXid; LWLockRelease(XidGenLock); /* @@ -10054,12 +10054,12 @@ xlog_redo(XLogReaderState *record) checkPoint.oldestXidDB); /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid; + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); /* Update shared-memory copy of checkpoint XID/epoch */ SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextFullXid; + XLogCtl->ckptFullXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); /* TLI should not change in an on-line checkpoint */ diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index a757baccfc55..67996018da27 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1613,8 +1613,8 @@ XLogRecGetFullXid(XLogReaderState *record) Assert(AmStartupProcess() || !IsUnderPostmaster); xid = XLogRecGetXid(record); - next_xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); - epoch = EpochFromFullTransactionId(ShmemVariableCache->nextFullXid); + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + epoch = 
EpochFromFullTransactionId(ShmemVariableCache->nextXid); /* * If xid is numerically greater than next_xid, it has to be from the last diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index b44853356446..be0240e0ddcd 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -878,10 +878,10 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) LWLockRelease(ProcArrayLock); - /* ShmemVariableCache->nextFullXid must be beyond any observed xid. */ + /* ShmemVariableCache->nextXid must be beyond any observed xid. */ AdvanceNextFullTransactionIdPastXid(latestObservedXid); - Assert(FullTransactionIdIsValid(ShmemVariableCache->nextFullXid)); + Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); if (standbyState == STANDBY_SNAPSHOT_READY) @@ -1986,7 +1986,7 @@ GetRunningTransactionData(void) latestCompletedXid = ShmemVariableCache->latestCompletedXid; - oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); /* * Spin over procArray collecting all xids @@ -2078,7 +2078,7 @@ GetRunningTransactionData(void) CurrentRunningXacts->xcnt = count - subcount; CurrentRunningXacts->subxcnt = subcount; CurrentRunningXacts->subxid_overflow = suboverflowed; - CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); CurrentRunningXacts->oldestRunningXid = oldestRunningXid; CurrentRunningXacts->latestCompletedXid = latestCompletedXid; @@ -2123,7 +2123,7 @@ GetOldestActiveTransactionId(void) * have already completed), when we spin over it. 
*/ LWLockAcquire(XidGenLock, LW_SHARED); - oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); LWLockRelease(XidGenLock); /* @@ -2191,7 +2191,7 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * a safe, albeit pessimal, value. */ LWLockAcquire(XidGenLock, LW_SHARED); - oldestSafeXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + oldestSafeXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); /* * If there's already a slot pegging the xmin horizon, we can start with @@ -3361,7 +3361,7 @@ RecordKnownAssignedTransactionIds(TransactionId xid) */ latestObservedXid = xid; - /* ShmemVariableCache->nextFullXid must be beyond any observed xid */ + /* ShmemVariableCache->nextXid must be beyond any observed xid */ AdvanceNextFullTransactionIdPastXid(latestObservedXid); next_expected_xid = latestObservedXid; TransactionIdAdvance(next_expected_xid); diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index f5229839cfc3..52b2809dac03 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -889,7 +889,7 @@ standby_redo(XLogReaderState *record) * up from a checkpoint and are immediately at our starting point, we * unconditionally move to STANDBY_INITIALIZED. 
After this point we * must do 4 things: - * * move shared nextFullXid forwards as we see new xids + * * move shared nextXid forwards as we see new xids * * extend the clog and subtrans with each new xid * * keep track of uncommitted known assigned xids * * keep track of uncommitted AccessExclusiveLocks diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index d24919f76b67..a2f8e7524b49 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -3390,7 +3390,7 @@ ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe) * transaction to complete before freeing some RAM; correctness of visible * behavior is not affected. */ - MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextXid); /* * If it's not a commit it's either a rollback or a read-only transaction diff --git a/src/backend/utils/misc/pg_controldata.c b/src/backend/utils/misc/pg_controldata.c index 419b58330fea..609231275893 100644 --- a/src/backend/utils/misc/pg_controldata.c +++ b/src/backend/utils/misc/pg_controldata.c @@ -165,8 +165,8 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) nulls[5] = false; values[6] = CStringGetTextDatum(psprintf("%u:%u", - EpochFromFullTransactionId(ControlFile->checkPointCopy.nextFullXid), - XidFromFullTransactionId(ControlFile->checkPointCopy.nextFullXid))); + EpochFromFullTransactionId(ControlFile->checkPointCopy.nextXid), + XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid))); nulls[6] = false; values[7] = ObjectIdGetDatum(ControlFile->checkPointCopy.nextOid); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index e73639df744b..3e00ac0f701a 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -250,8 +250,8 @@ main(int argc, char *argv[]) printf(_("Latest checkpoint's 
full_page_writes: %s\n"), ControlFile->checkPointCopy.fullPageWrites ? _("on") : _("off")); printf(_("Latest checkpoint's NextXID: %u:%u\n"), - EpochFromFullTransactionId(ControlFile->checkPointCopy.nextFullXid), - XidFromFullTransactionId(ControlFile->checkPointCopy.nextFullXid)); + EpochFromFullTransactionId(ControlFile->checkPointCopy.nextXid), + XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid)); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 233441837f8a..cb6ef1918206 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -424,14 +424,14 @@ main(int argc, char *argv[]) * if any, includes these values.) */ if (set_xid_epoch != -1) - ControlFile.checkPointCopy.nextFullXid = + ControlFile.checkPointCopy.nextXid = FullTransactionIdFromEpochAndXid(set_xid_epoch, - XidFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid)); + XidFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); if (set_xid != 0) { - ControlFile.checkPointCopy.nextFullXid = - FullTransactionIdFromEpochAndXid(EpochFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid), + ControlFile.checkPointCopy.nextXid = + FullTransactionIdFromEpochAndXid(EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid), set_xid); /* @@ -684,7 +684,7 @@ GuessControlValues(void) ControlFile.checkPointCopy.ThisTimeLineID = 1; ControlFile.checkPointCopy.PrevTimeLineID = 1; ControlFile.checkPointCopy.fullPageWrites = false; - ControlFile.checkPointCopy.nextFullXid = + ControlFile.checkPointCopy.nextXid = FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); ControlFile.checkPointCopy.nextOid = FirstBootstrapObjectId; ControlFile.checkPointCopy.nextMulti = FirstMultiXactId; @@ -756,8 +756,8 @@ PrintControlValues(bool guessed) printf(_("Latest checkpoint's 
full_page_writes: %s\n"), ControlFile.checkPointCopy.fullPageWrites ? _("on") : _("off")); printf(_("Latest checkpoint's NextXID: %u:%u\n"), - EpochFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid), - XidFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid)); + EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid), + XidFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile.checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), @@ -847,7 +847,7 @@ PrintNewControlValues(void) if (set_xid != 0) { printf(_("NextXID: %u\n"), - XidFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid)); + XidFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); printf(_("OldestXID: %u\n"), ControlFile.checkPointCopy.oldestXid); printf(_("OldestXID's DB: %u\n"), @@ -857,7 +857,7 @@ PrintNewControlValues(void) if (set_xid_epoch != -1) { printf(_("NextXID epoch: %u\n"), - EpochFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid)); + EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); } if (set_oldest_commit_ts_xid != 0) diff --git a/src/include/access/transam.h b/src/include/access/transam.h index a91a0c7487d8..85508300e9a2 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -175,12 +175,12 @@ typedef struct VariableCacheData /* * These fields are protected by XidGenLock. 
*/ - FullTransactionId nextFullXid; /* next full XID to assign */ + FullTransactionId nextXid; /* next XID to assign */ TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ TransactionId xidVacLimit; /* start forcing autovacuums here */ TransactionId xidWarnLimit; /* start complaining here */ - TransactionId xidStopLimit; /* refuse to advance nextFullXid beyond here */ + TransactionId xidStopLimit; /* refuse to advance nextXid beyond here */ TransactionId xidWrapLimit; /* where the world ends */ Oid oldestXidDB; /* database with minimum datfrozenxid */ diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index de5670e53826..06bed90c5e9e 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -40,7 +40,7 @@ typedef struct CheckPoint TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new * timeline (equals ThisTimeLineID otherwise) */ bool fullPageWrites; /* current full_page_writes */ - FullTransactionId nextFullXid; /* next free full transaction ID */ + FullTransactionId nextXid; /* next free transaction ID */ Oid nextOid; /* next free OID */ MultiXactId nextMulti; /* next free MultiXactId */ MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index cfbe426e5ae3..faaf1d3817b6 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -72,7 +72,7 @@ typedef struct RunningTransactionsData int xcnt; /* # of xact ids in xids[] */ int subxcnt; /* # of subxact ids in xids[] */ bool subxid_overflow; /* snapshot overflowed, subxids missing */ - TransactionId nextXid; /* xid from ShmemVariableCache->nextFullXid */ + TransactionId nextXid; /* xid from ShmemVariableCache->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h 
index 4876d2eeea13..4dda1c403a4b 100644 --- a/src/include/storage/standbydefs.h +++ b/src/include/storage/standbydefs.h @@ -49,7 +49,7 @@ typedef struct xl_running_xacts int xcnt; /* # of xact ids in xids[] */ int subxcnt; /* # of subxact ids in xids[] */ bool subxid_overflow; /* snapshot overflowed, subxids missing */ - TransactionId nextXid; /* xid from ShmemVariableCache->nextFullXid */ + TransactionId nextXid; /* xid from ShmemVariableCache->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ From 3bd7f9969a240827bc2effa399170b7565238fd2 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 11 Aug 2020 17:41:18 -0700 Subject: [PATCH 284/334] Track latest completed xid as a FullTransactionId. The reason for doing so is that a subsequent commit will need that to avoid wraparound issues. As the subsequent change is large this was split out for easier review. The reason this is not a perfect straight-forward change is that we do not want track 64bit xids in the procarray or the WAL. Therefore we need to advance lastestCompletedXid in relation to 32 bit xids. The code for that is now centralized in MaintainLatestCompletedXid*. 
Author: Andres Freund Reviewed-By: Thomas Munro, Robert Haas, David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/access/transam/varsup.c | 50 +++++++++++ src/backend/access/transam/xlog.c | 4 +- src/backend/storage/ipc/procarray.c | 129 ++++++++++++++++++++++------ src/include/access/transam.h | 37 +++++++- 4 files changed, 191 insertions(+), 29 deletions(-) diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 3ebd75118f06..2ef0f4991caf 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -569,3 +569,53 @@ GetNewObjectId(void) return result; } + + +#ifdef USE_ASSERT_CHECKING + +/* + * Assert that xid is between [oldestXid, nextXid], which is the range we + * expect XIDs coming from tables etc to be in. + * + * As ShmemVariableCache->oldestXid could change just after this call without + * further precautions, and as a wrapped-around xid could again fall within + * the valid range, this assertion can only detect if something is definitely + * wrong, but not establish correctness. + * + * This intentionally does not expose a return value, to avoid code being + * introduced that depends on the return value. + */ +void +AssertTransactionIdInAllowableRange(TransactionId xid) +{ + TransactionId oldest_xid; + TransactionId next_xid; + + Assert(TransactionIdIsValid(xid)); + + /* we may see bootstrap / frozen */ + if (!TransactionIdIsNormal(xid)) + return; + + /* + * We can't acquire XidGenLock, as this may be called with XidGenLock + * already held (or with other locks that don't allow XidGenLock to be + * nested). That's ok for our purposes though, since we already rely on + * 32bit reads to be atomic. While nextXid is 64 bit, we only look at + * the lower 32bit, so a skewed read doesn't hurt. + * + * There's no increased danger of falling outside [oldest, next] by + * accessing them without a lock. 
xid needs to have been created with + * GetNewTransactionId() in the originating session, and the locks there + * pair with the memory barrier below. We do however accept xid to be <= + * to next_xid, instead of just <, as xid could be from the procarray, + * before we see the updated nextXid value. + */ + pg_memory_barrier(); + oldest_xid = ShmemVariableCache->oldestXid; + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + + Assert(TransactionIdFollowsOrEquals(xid, oldest_xid) || + TransactionIdPrecedesOrEquals(xid, next_xid)); +} +#endif diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 53945c0e305d..8f72faee82cc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7865,8 +7865,8 @@ StartupXLOG(void) /* also initialize latestCompletedXid, to nextXid - 1 */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); - TransactionIdRetreat(ShmemVariableCache->latestCompletedXid); + ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid; + FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid); LWLockRelease(ProcArrayLock); /* diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index be0240e0ddcd..522518695eec 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -175,6 +175,11 @@ static void KnownAssignedXidsReset(void); static inline void ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, TransactionId latestXid); static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); +static void MaintainLatestCompletedXid(TransactionId latestXid); +static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); + +static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel, + TransactionId xid); /* * Report shared-memory space needed by 
CreateSharedProcArray. @@ -349,9 +354,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); /* Advance global latestCompletedXid while holding the lock */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - latestXid)) - ShmemVariableCache->latestCompletedXid = latestXid; + MaintainLatestCompletedXid(latestXid); } else { @@ -464,9 +467,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, pgxact->overflowed = false; /* Also advance global latestCompletedXid while holding the lock */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - latestXid)) - ShmemVariableCache->latestCompletedXid = latestXid; + MaintainLatestCompletedXid(latestXid); } /* @@ -621,6 +622,59 @@ ProcArrayClearTransaction(PGPROC *proc) pgxact->overflowed = false; } +/* + * Update ShmemVariableCache->latestCompletedXid to point to latestXid if + * currently older. + */ +static void +MaintainLatestCompletedXid(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; + + Assert(FullTransactionIdIsValid(cur_latest)); + Assert(!RecoveryInProgress()); + Assert(LWLockHeldByMe(ProcArrayLock)); + + if (TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + ShmemVariableCache->latestCompletedXid = + FullXidRelativeTo(cur_latest, latestXid); + } + + Assert(IsBootstrapProcessingMode() || + FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); +} + +/* + * Same as MaintainLatestCompletedXid, except for use during WAL replay. + */ +static void +MaintainLatestCompletedXidRecovery(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; + FullTransactionId rel; + + Assert(AmStartupProcess() || !IsUnderPostmaster); + Assert(LWLockHeldByMe(ProcArrayLock)); + + /* + * Need a FullTransactionId to compare latestXid with. 
Can't rely on + * latestCompletedXid to be initialized in recovery. But in recovery it's + * safe to access nextXid without a lock for the startup process. + */ + rel = ShmemVariableCache->nextXid; + Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); + + if (!FullTransactionIdIsValid(cur_latest) || + TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + ShmemVariableCache->latestCompletedXid = + FullXidRelativeTo(rel, latestXid); + } + + Assert(FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); +} + /* * ProcArrayInitRecovery -- initialize recovery xid mgmt environment * @@ -869,12 +923,9 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) * If a transaction wrote a commit record in the gap between taking and * logging the snapshot then latestCompletedXid may already be higher than * the value from the snapshot, so check before we use the incoming value. + * It also might not yet be set at all. */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - running->latestCompletedXid)) - ShmemVariableCache->latestCompletedXid = running->latestCompletedXid; - - Assert(TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); + MaintainLatestCompletedXidRecovery(running->latestCompletedXid); LWLockRelease(ProcArrayLock); @@ -989,6 +1040,7 @@ TransactionIdIsInProgress(TransactionId xid) int nxids = 0; ProcArrayStruct *arrayP = procArray; TransactionId topxid; + TransactionId latestCompletedXid; int i, j; @@ -1051,7 +1103,9 @@ TransactionIdIsInProgress(TransactionId xid) * Now that we have the lock, we can check latestCompletedXid; if the * target Xid is after that, it's surely still running. 
*/ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xid)) + latestCompletedXid = + XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); + if (TransactionIdPrecedes(latestCompletedXid, xid)) { LWLockRelease(ProcArrayLock); xc_by_latest_xid_inc(); @@ -1330,9 +1384,9 @@ GetOldestXmin(Relation rel, int flags) * and so protects us against overestimating the result due to future * additions. */ - result = ShmemVariableCache->latestCompletedXid; - Assert(TransactionIdIsNormal(result)); + result = XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); TransactionIdAdvance(result); + Assert(TransactionIdIsNormal(result)); for (index = 0; index < arrayP->numProcs; index++) { @@ -1511,6 +1565,7 @@ GetSnapshotData(Snapshot snapshot) int count = 0; int subcount = 0; bool suboverflowed = false; + FullTransactionId latest_completed; TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; @@ -1554,10 +1609,11 @@ GetSnapshotData(Snapshot snapshot) */ LWLockAcquire(ProcArrayLock, LW_SHARED); + latest_completed = ShmemVariableCache->latestCompletedXid; /* xmax is always latestCompletedXid + 1 */ - xmax = ShmemVariableCache->latestCompletedXid; - Assert(TransactionIdIsNormal(xmax)); + xmax = XidFromFullTransactionId(latest_completed); TransactionIdAdvance(xmax); + Assert(TransactionIdIsNormal(xmax)); /* initialize xmin calculation with xmax */ globalxmin = xmin = xmax; @@ -1984,9 +2040,10 @@ GetRunningTransactionData(void) LWLockAcquire(ProcArrayLock, LW_SHARED); LWLockAcquire(XidGenLock, LW_SHARED); - latestCompletedXid = ShmemVariableCache->latestCompletedXid; - - oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + latestCompletedXid = + XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); + oldestRunningXid = + XidFromFullTransactionId(ShmemVariableCache->nextXid); /* * Spin over procArray collecting all xids @@ -3207,9 +3264,7 @@ 
XidCacheRemoveRunningXids(TransactionId xid, elog(WARNING, "did not find subXID %u in MyProc", xid); /* Also advance global latestCompletedXid while holding the lock */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - latestXid)) - ShmemVariableCache->latestCompletedXid = latestXid; + MaintainLatestCompletedXid(latestXid); LWLockRelease(ProcArrayLock); } @@ -3236,6 +3291,32 @@ DisplayXidCache(void) } #endif /* XIDCACHE_DEBUG */ +/* + * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it + * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel). + * + * Be very careful about when to use this function. It can only safely be used + * when there is a guarantee that xid is within MaxTransactionId / 2 xids of + * rel. That e.g. can be guaranteed if the the caller assures a snapshot is + * held by the backend and xid is from a table (where vacuum/freezing ensures + * the xid has to be within that range), or if xid is from the procarray and + * prevents xid wraparound that way. 
+ */ +static inline FullTransactionId +FullXidRelativeTo(FullTransactionId rel, TransactionId xid) +{ + TransactionId rel_xid = XidFromFullTransactionId(rel); + + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdIsValid(rel_xid)); + + /* not guaranteed to find issues, but likely to catch mistakes */ + AssertTransactionIdInAllowableRange(xid); + + return FullTransactionIdFromU64(U64FromFullTransactionId(rel) + + (int32) (xid - rel_xid)); +} + /* ---------------------------------------------- * KnownAssignedTransactionIds sub-module @@ -3388,9 +3469,7 @@ ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids, KnownAssignedXidsRemoveTree(xid, nsubxids, subxids); /* As in ProcArrayEndTransaction, advance latestCompletedXid */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - max_xid)) - ShmemVariableCache->latestCompletedXid = max_xid; + MaintainLatestCompletedXidRecovery(max_xid); LWLockRelease(ProcArrayLock); } diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 85508300e9a2..8db326ad1b50 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -54,6 +54,8 @@ #define FullTransactionIdFollowsOrEquals(a, b) ((a).value >= (b).value) #define FullTransactionIdIsValid(x) TransactionIdIsValid(XidFromFullTransactionId(x)) #define InvalidFullTransactionId FullTransactionIdFromEpochAndXid(0, InvalidTransactionId) +#define FirstNormalFullTransactionId FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId) +#define FullTransactionIdIsNormal(x) FullTransactionIdFollowsOrEquals(x, FirstNormalFullTransactionId) /* * A 64 bit value that contains an epoch and a TransactionId. This is @@ -102,6 +104,31 @@ FullTransactionIdAdvance(FullTransactionId *dest) dest->value++; } +/* + * Retreat a FullTransactionId variable, stepping over xids that would appear + * to be special only when viewed as 32bit XIDs. 
+ */ +static inline void +FullTransactionIdRetreat(FullTransactionId *dest) +{ + dest->value--; + + /* + * In contrast to 32bit XIDs don't step over the "actual" special xids. + * For 64bit xids these can't be reached as part of a wraparound as they + * can in the 32bit case. + */ + if (FullTransactionIdPrecedes(*dest, FirstNormalFullTransactionId)) + return; + + /* + * But we do need to step over XIDs that'd appear special only for 32bit + * XIDs. + */ + while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId) + dest->value--; +} + /* back up a transaction ID variable, handling wraparound correctly */ #define TransactionIdRetreat(dest) \ do { \ @@ -193,8 +220,8 @@ typedef struct VariableCacheData /* * These fields are protected by ProcArrayLock. */ - TransactionId latestCompletedXid; /* newest XID that has committed or - * aborted */ + FullTransactionId latestCompletedXid; /* newest full XID that has + * committed or aborted */ /* * These fields are protected by XactTruncationLock @@ -244,6 +271,12 @@ extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); +#ifdef USE_ASSERT_CHECKING +extern void AssertTransactionIdInAllowableRange(TransactionId xid); +#else +#define AssertTransactionIdInAllowableRange(xid) ((void)true) +#endif + /* * Some frontend programs include this header. For compilers that emit static * inline functions even when they're unused, that leads to unsatisfied From 3546cf8a7a9dc57e6aa98f5fc1ac5476ad6b99ff Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 12 Aug 2020 11:54:16 -0400 Subject: [PATCH 285/334] Improve comments for postmaster.c's BackendList. This had gotten a little disjointed over time, and some of the grammar was sloppy. Rewrite for more clarity. In passing, re-pgindent some recently added comments. No code changes. 
--- src/backend/postmaster/postmaster.c | 37 ++++++++++++++++------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 5b5fc97c72da..38e2c16ac206 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -156,28 +156,32 @@ * authorization phase). This is used mainly to keep track of how many * children we have and send them appropriate signals when necessary. * - * "Special" children such as the startup, bgwriter and autovacuum launcher - * tasks are not in this list. Autovacuum worker and walsender are in it. + * As shown in the above set of backend types, this list includes not only + * "normal" client sessions, but also autovacuum workers, walsenders, and + * background workers. (Note that at the time of launch, walsenders are + * labeled BACKEND_TYPE_NORMAL; we relabel them to BACKEND_TYPE_WALSND + * upon noticing they've changed their PMChildFlags entry. Hence that check + * must be done before any operation that needs to distinguish walsenders + * from normal backends.) + * * Also, "dead_end" children are in it: these are children launched just for * the purpose of sending a friendly rejection message to a would-be client. * We must track them because they are attached to shared memory, but we know * they will never become live backends. dead_end children are not assigned a - * PMChildSlot. + * PMChildSlot. dead_end children have bkend_type NORMAL. * - * Background workers are in this list, too. + * "Special" children such as the startup, bgwriter and autovacuum launcher + * tasks are not in this list. They are tracked via StartupPID and other + * pid_t variables below. (Thus, there can't be more than one of any given + * "special" child process type. We use BackendList entries for any child + * process there can be more than one of.) 
*/ typedef struct bkend { pid_t pid; /* process id of backend */ int32 cancel_key; /* cancel key for cancels for this backend */ int child_slot; /* PMChildSlot for this backend, if any */ - - /* - * Flavor of backend or auxiliary process. Note that BACKEND_TYPE_WALSND - * backends initially announce themselves as BACKEND_TYPE_NORMAL, so if - * bkend_type is normal, you should check for a recent transition. - */ - int bkend_type; + int bkend_type; /* child process flavor, see above */ bool dead_end; /* is it going to send an error and quit? */ bool bgworker_notify; /* gets bgworker start/stop notifications */ dlist_node elem; /* list link in BackendList */ @@ -1059,10 +1063,9 @@ PostmasterMain(int argc, char *argv[]) * only during a few moments during a standby promotion. However there is * a race condition: if pg_ctl promote is executed and creates the files * during a promotion, the files can stay around even after the server is - * brought up to be the primary. Then, if a new standby starts by using the - * backup taken from the new primary, the files can exist at the server - * startup and should be removed in order to avoid an unexpected - * promotion. + * brought up to be the primary. Then, if a new standby starts by using + * the backup taken from the new primary, the files can exist at server + * startup and must be removed in order to avoid an unexpected promotion. * * Note that promotion signal files need to be removed before the startup * process is invoked. Because, after that, they can be used by @@ -5336,8 +5339,8 @@ sigusr1_handler(SIGNAL_ARGS) /* * Tell startup process to finish recovery. * - * Leave the promote signal file in place and let the Startup - * process do the unlink. + * Leave the promote signal file in place and let the Startup process + * do the unlink. 
*/ signal_child(StartupPID, SIGUSR2); } From 1f42d35a1d6144a23602b2c0bc7f97f3046cf890 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Wed, 12 Aug 2020 15:33:36 -0400 Subject: [PATCH 286/334] BRIN: Handle concurrent desummarization properly If a page range is desummarized at just the right time concurrently with an index walk, BRIN would raise an error indicating index corruption. This is scary and unhelpful; silently returning that the page range is not summarized is sufficient reaction. This bug was introduced by commit 975ad4e602ff as additional protection against a bug whose actual fix was elsewhere. Backpatch equally. Reported-By: Anastasia Lubennikova Diagnosed-By: Alexander Lakhin Discussion: https://postgr.es/m/2588667e-d07d-7e10-74e2-7e1e46194491@postgrespro.ru Backpatch: 9.5 - master --- src/backend/access/brin/brin_revmap.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c index e8b8308f82ec..35746714a7c4 100644 --- a/src/backend/access/brin/brin_revmap.c +++ b/src/backend/access/brin/brin_revmap.c @@ -282,10 +282,17 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, /* If we land on a revmap page, start over */ if (BRIN_IS_REGULAR_PAGE(page)) { + /* + * If the offset number is greater than what's in the page, it's + * possible that the range was desummarized concurrently. Just + * return NULL to handle that case. + */ if (*off > PageGetMaxOffsetNumber(page)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("corrupted BRIN index: inconsistent range map"))); + { + LockBuffer(*buf, BUFFER_LOCK_UNLOCK); + return NULL; + } + lp = PageGetItemId(page, *off); if (ItemIdIsUsed(lp)) { From dc7420c2c9274a283779ec19718d2d16323640c0 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 12 Aug 2020 16:03:49 -0700 Subject: [PATCH 287/334] snapshot scalability: Don't compute global horizons while building snapshots. 
To make GetSnapshotData() more scalable, it cannot look at each proc's
xmin: While snapshot contents do not need to change whenever a read-only
transaction commits or a snapshot is released, a proc's xmin is modified
in those cases. The frequency of xmin modifications leads to,
particularly on higher core count systems, many cache misses inside
GetSnapshotData(), despite the data underlying a snapshot not
changing. That is the most significant source of GetSnapshotData()
scaling poorly on larger systems.

Without accessing xmins, GetSnapshotData() cannot calculate accurate
horizons / thresholds as it has so far. But we don't really have to: The
horizons don't actually change that much between GetSnapshotData()
calls. Nor are the horizons actually used every time a snapshot is
built.

The trick this commit introduces is to delay computation of accurate
horizons until their use, and to use horizon boundaries to determine
whether accurate horizons need to be computed.

The use of RecentGlobal[Data]Xmin to decide whether a row version could
be removed has been replaced with new GlobalVisTest* functions. These
use two thresholds to determine whether a row can be pruned:
1) definitely_needed, indicating that rows deleted by XIDs >=
definitely_needed are definitely still visible.
2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can
definitely be removed

GetSnapshotData() updates definitely_needed to be the xmin of the
computed snapshot. When testing whether a row can be removed (with
GlobalVisTestIsRemovableXid()) and the tested XID falls in between the
two (i.e. XID >= maybe_needed && XID < definitely_needed) the
boundaries can be recomputed to be more accurate. As it is not cheap to
compute accurate boundaries, we limit the number of times that happens
in short succession. 
As the boundaries used by GlobalVisTestIsRemovableXid() are never
reset (with maybe_needed updated by GetSnapshotData()), it is likely
that further tests can benefit from an earlier computation of accurate
horizons.

To avoid regressing performance when old_snapshot_threshold is set (as
that requires an accurate horizon to be computed), heap_page_prune_opt()
doesn't unconditionally call TransactionIdLimitedForOldSnapshots()
anymore. Both the computation of the limited horizon, and the triggering
of errors (with SetOldSnapshotThresholdTimestamp()) are now only done
when necessary to remove tuples.

This commit just removes the accesses to PGXACT->xmin from
GetSnapshotData(), but other members of PGXACT residing in the same
cache line are accessed. Therefore this in itself does not result in a
significant improvement. Subsequent commits will take advantage of the
fact that GetSnapshotData() now does not need to access xmins anymore.

Note: This contains a workaround in heap_page_prune_opt() to keep the
snapshot_too_old tests working. While that workaround is ugly, the tests
currently are not meaningful, and it seems best to address them
separately. 
Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- contrib/amcheck/verify_nbtree.c | 8 +- contrib/pg_visibility/pg_visibility.c | 18 +- contrib/pgstattuple/pgstatapprox.c | 2 +- src/backend/access/gin/ginvacuum.c | 26 + src/backend/access/gist/gistutil.c | 8 +- src/backend/access/gist/gistxlog.c | 10 +- src/backend/access/heap/heapam.c | 15 +- src/backend/access/heap/heapam_handler.c | 24 +- src/backend/access/heap/heapam_visibility.c | 99 ++- src/backend/access/heap/pruneheap.c | 205 ++++- src/backend/access/heap/vacuumlazy.c | 24 +- src/backend/access/index/indexam.c | 3 +- src/backend/access/nbtree/README | 10 +- src/backend/access/nbtree/nbtpage.c | 4 +- src/backend/access/nbtree/nbtree.c | 28 +- src/backend/access/nbtree/nbtxlog.c | 10 +- src/backend/access/spgist/spgvacuum.c | 6 +- src/backend/access/transam/README | 82 +- src/backend/access/transam/xlog.c | 4 +- src/backend/commands/analyze.c | 2 +- src/backend/commands/vacuum.c | 41 +- src/backend/postmaster/autovacuum.c | 4 + src/backend/replication/logical/launcher.c | 4 + src/backend/replication/walreceiver.c | 17 +- src/backend/replication/walsender.c | 15 +- src/backend/storage/ipc/procarray.c | 901 ++++++++++++++++---- src/backend/utils/adt/selfuncs.c | 20 +- src/backend/utils/init/postinit.c | 4 + src/backend/utils/time/snapmgr.c | 250 +++--- src/include/access/ginblock.h | 4 +- src/include/access/heapam.h | 10 +- src/include/access/transam.h | 79 +- src/include/storage/bufpage.h | 6 - src/include/storage/proc.h | 8 - src/include/storage/procarray.h | 32 +- src/include/utils/snapmgr.h | 37 +- src/include/utils/snapshot.h | 6 + src/tools/pgindent/typedefs.list | 2 + 38 files changed, 1462 insertions(+), 566 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 635ece73b354..5f3de3c0b7f6 100644 --- 
a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -434,10 +434,10 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, RelationGetRelationName(rel)); /* - * RecentGlobalXmin assertion matches index_getnext_tid(). See note on - * RecentGlobalXmin/B-Tree page deletion. + * This assertion matches the one in index_getnext_tid(). See page + * recycling/"visible to everyone" notes in nbtree README. */ - Assert(TransactionIdIsValid(RecentGlobalXmin)); + Assert(TransactionIdIsValid(RecentXmin)); /* * Initialize state for entire verification operation @@ -1581,7 +1581,7 @@ bt_right_page_check_scankey(BtreeCheckState *state) * does not occur until no possible index scan could land on the page. * Index scans can follow links with nothing more than their snapshot as * an interlock and be sure of at least that much. (See page - * recycling/RecentGlobalXmin notes in nbtree README.) + * recycling/"visible to everyone" notes in nbtree README.) * * Furthermore, it's okay if we follow a rightlink and find a half-dead or * dead (ignorable) page one or more times. There will either be a diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index e731161734ae..54e47b810fd2 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -563,17 +563,14 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); TransactionId OldestXmin = InvalidTransactionId; - if (all_visible) - { - /* Don't pass rel; that will fail in recovery. 
*/ - OldestXmin = GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM); - } - rel = relation_open(relid, AccessShareLock); /* Only some relkinds have a visibility map */ check_relation_relkind(rel); + if (all_visible) + OldestXmin = GetOldestNonRemovableTransactionId(rel); + nblocks = RelationGetNumberOfBlocks(rel); /* @@ -679,11 +676,12 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) * From a concurrency point of view, it sort of sucks to * retake ProcArrayLock here while we're holding the buffer * exclusively locked, but it should be safe against - * deadlocks, because surely GetOldestXmin() should never take - * a buffer lock. And this shouldn't happen often, so it's - * worth being careful so as to avoid false positives. + * deadlocks, because surely + * GetOldestNonRemovableTransactionId() should never take a + * buffer lock. And this shouldn't happen often, so it's worth + * being careful so as to avoid false positives. */ - RecomputedOldestXmin = GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM); + RecomputedOldestXmin = GetOldestNonRemovableTransactionId(rel); if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin)) record_corrupt_item(items, &tuple.t_self); diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c index dbc0fa11f615..3a99333d4435 100644 --- a/contrib/pgstattuple/pgstatapprox.c +++ b/contrib/pgstattuple/pgstatapprox.c @@ -71,7 +71,7 @@ statapprox_heap(Relation rel, output_type *stat) BufferAccessStrategy bstrategy; TransactionId OldestXmin; - OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); + OldestXmin = GetOldestNonRemovableTransactionId(rel); bstrategy = GetAccessStrategy(BAS_BULKREAD); nblocks = RelationGetNumberOfBlocks(rel); diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 8ae4fd95a7bf..9cd6638df621 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -793,3 +793,29 @@ ginvacuumcleanup(IndexVacuumInfo *info, 
IndexBulkDeleteResult *stats) return stats; } + +/* + * Return whether Page can safely be recycled. + */ +bool +GinPageIsRecyclable(Page page) +{ + TransactionId delete_xid; + + if (PageIsNew(page)) + return true; + + if (!GinPageIsDeleted(page)) + return false; + + delete_xid = GinPageGetDeleteXid(page); + + if (!TransactionIdIsValid(delete_xid)) + return true; + + /* + * If no backend still could view delete_xid as in running, all scans + * concurrent with ginDeletePage() must have finished. + */ + return GlobalVisCheckRemovableXid(NULL, delete_xid); +} diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 765329bbcd43..bfda7fbe3d58 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -891,15 +891,13 @@ gistPageRecyclable(Page page) * As long as that can happen, we must keep the deleted page around as * a tombstone. * - * Compare the deletion XID with RecentGlobalXmin. If deleteXid < - * RecentGlobalXmin, then no scan that's still in progress could have + * For that check if the deletion XID could still be visible to + * anyone. If not, then no scan that's still in progress could have * seen its downlink, and we can recycle it. */ FullTransactionId deletexid_full = GistPageGetDeleteXid(page); - FullTransactionId recentxmin_full = GetFullRecentGlobalXmin(); - if (FullTransactionIdPrecedes(deletexid_full, recentxmin_full)) - return true; + return GlobalVisIsRemovableFullXid(NULL, deletexid_full); } return false; } diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 7b5d1e98b70b..a63b05388c5d 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -387,11 +387,11 @@ gistRedoPageReuse(XLogReaderState *record) * PAGE_REUSE records exist to provide a conflict point when we reuse * pages in the index via the FSM. That's all they do though. * - * latestRemovedXid was the page's deleteXid. 
The deleteXid < - * RecentGlobalXmin test in gistPageRecyclable() conceptually mirrors the - * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs(). - * Consequently, one XID value achieves the same exclusion effect on - * primary and standby. + * latestRemovedXid was the page's deleteXid. The + * GlobalVisIsRemovableFullXid(deleteXid) test in gistPageRecyclable() + * conceptually mirrors the pgxact->xmin > limitXmin test in + * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the + * same exclusion effect on primary and standby. */ if (InHotStandby) { diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 00169006fb1f..f75e1cf0e7b0 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1517,6 +1517,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, bool at_chain_start; bool valid; bool skip; + GlobalVisState *vistest = NULL; /* If this is not the first call, previous call returned a (live!) tuple */ if (all_dead) @@ -1527,7 +1528,8 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, at_chain_start = first_call; skip = !first_call; - Assert(TransactionIdIsValid(RecentGlobalXmin)); + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); Assert(BufferGetBlockNumber(buffer) == blkno); /* Scan through possible multiple members of HOT-chain */ @@ -1616,9 +1618,14 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, * Note: if you change the criterion here for what is "dead", fix the * planner's get_actual_variable_range() function to match. 
*/ - if (all_dead && *all_dead && - !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin)) - *all_dead = false; + if (all_dead && *all_dead) + { + if (!vistest) + vistest = GlobalVisTestFor(relation); + + if (!HeapTupleIsSurelyDead(heapTuple, vistest)) + *all_dead = false; + } /* * Check to see if HOT chain continues past this tuple; if so fetch diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 267a6ee25a75..e3e41fb75163 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1203,7 +1203,7 @@ heapam_index_build_range_scan(Relation heapRelation, /* okay to ignore lazy VACUUMs here */ if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) - OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM); + OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); if (!scan) { @@ -1244,6 +1244,17 @@ heapam_index_build_range_scan(Relation heapRelation, hscan = (HeapScanDesc) scan; + /* + * Must have called GetOldestNonRemovableTransactionId() if using + * SnapshotAny. Shouldn't have for an MVCC snapshot. (It's especially + * worth checking this for parallel builds, since ambuild routines that + * support parallel builds must work these details out for themselves.) + */ + Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); + Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : + !TransactionIdIsValid(OldestXmin)); + Assert(snapshot == SnapshotAny || !anyvisible); + /* Publish number of blocks to scan */ if (progress) { @@ -1263,17 +1274,6 @@ heapam_index_build_range_scan(Relation heapRelation, nblocks); } - /* - * Must call GetOldestXmin() with SnapshotAny. Should never call - * GetOldestXmin() with MVCC snapshot. (It's especially worth checking - * this for parallel builds, since ambuild routines that support parallel - * builds must work these details out for themselves.) 
- */ - Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); - Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : - !TransactionIdIsValid(OldestXmin)); - Assert(snapshot == SnapshotAny || !anyvisible); - /* set our scan endpoints */ if (!allow_sync) heap_setscanlimits(scan, start_blockno, numblocks); diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index c77128087cf7..528e75bafd45 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -1154,19 +1154,56 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, * we mainly want to know is if a tuple is potentially visible to *any* * running transaction. If so, it can't be removed yet by VACUUM. * - * OldestXmin is a cutoff XID (obtained from GetOldestXmin()). Tuples - * deleted by XIDs >= OldestXmin are deemed "recently dead"; they might - * still be visible to some open transaction, so we can't remove them, - * even if we see that the deleting transaction has committed. + * OldestXmin is a cutoff XID (obtained from + * GetOldestNonRemovableTransactionId()). Tuples deleted by XIDs >= + * OldestXmin are deemed "recently dead"; they might still be visible to some + * open transaction, so we can't remove them, even if we see that the deleting + * transaction has committed. */ HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer) +{ + TransactionId dead_after = InvalidTransactionId; + HTSV_Result res; + + res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); + + if (res == HEAPTUPLE_RECENTLY_DEAD) + { + Assert(TransactionIdIsValid(dead_after)); + + if (TransactionIdPrecedes(dead_after, OldestXmin)) + res = HEAPTUPLE_DEAD; + } + else + Assert(!TransactionIdIsValid(dead_after)); + + return res; +} + +/* + * Work horse for HeapTupleSatisfiesVacuum and similar routines. 
+ * + * In contrast to HeapTupleSatisfiesVacuum this routine, when encountering a + * tuple that could still be visible to some backend, stores the xid that + * needs to be compared with the horizon in *dead_after, and returns + * HEAPTUPLE_RECENTLY_DEAD. The caller then can perform the comparison with + * the horizon. This is e.g. useful when comparing with different horizons. + * + * Note: HEAPTUPLE_DEAD can still be returned here, e.g. if the inserting + * transaction aborted. + */ +HTSV_Result +HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *dead_after) { HeapTupleHeader tuple = htup->t_data; Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); + Assert(dead_after != NULL); + + *dead_after = InvalidTransactionId; /* * Has inserting transaction committed? @@ -1323,17 +1360,15 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, else if (TransactionIdDidCommit(xmax)) { /* - * The multixact might still be running due to lockers. If the - * updater is below the xid horizon, we have to return DEAD - * regardless -- otherwise we could end up with a tuple where the - * updater has to be removed due to the horizon, but is not pruned - * away. It's not a problem to prune that tuple, because any - * remaining lockers will also be present in newer tuple versions. + * The multixact might still be running due to lockers. Need to + * allow for pruning if below the xid horizon regardless -- + * otherwise we could end up with a tuple where the updater has to + * be removed due to the horizon, but is not pruned away. It's + * not a problem to prune that tuple, because any remaining + * lockers will also be present in newer tuple versions. 
*/ - if (!TransactionIdPrecedes(xmax, OldestXmin)) - return HEAPTUPLE_RECENTLY_DEAD; - - return HEAPTUPLE_DEAD; + *dead_after = xmax; + return HEAPTUPLE_RECENTLY_DEAD; } else if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) { @@ -1372,14 +1407,11 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, } /* - * Deleter committed, but perhaps it was recent enough that some open - * transactions could still see the tuple. + * Deleter committed, allow caller to check if it was recent enough that + * some open transactions could still see the tuple. */ - if (!TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin)) - return HEAPTUPLE_RECENTLY_DEAD; - - /* Otherwise, it's dead and removable */ - return HEAPTUPLE_DEAD; + *dead_after = HeapTupleHeaderGetRawXmax(tuple); + return HEAPTUPLE_RECENTLY_DEAD; } @@ -1393,14 +1425,28 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, * * This is an interface to HeapTupleSatisfiesVacuum that's callable via * HeapTupleSatisfiesSnapshot, so it can be used through a Snapshot. - * snapshot->xmin must have been set up with the xmin horizon to use. + * snapshot->vistest must have been set up with the horizon to use. */ static bool HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, Buffer buffer) { - return HeapTupleSatisfiesVacuum(htup, snapshot->xmin, buffer) - != HEAPTUPLE_DEAD; + TransactionId dead_after = InvalidTransactionId; + HTSV_Result res; + + res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); + + if (res == HEAPTUPLE_RECENTLY_DEAD) + { + Assert(TransactionIdIsValid(dead_after)); + + if (GlobalVisTestIsRemovableXid(snapshot->vistest, dead_after)) + res = HEAPTUPLE_DEAD; + } + else + Assert(!TransactionIdIsValid(dead_after)); + + return res != HEAPTUPLE_DEAD; } @@ -1418,7 +1464,7 @@ HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, * if the tuple is removable. 
*/ bool -HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin) +HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest) { HeapTupleHeader tuple = htup->t_data; @@ -1459,7 +1505,8 @@ HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin) return false; /* Deleter committed, so tuple is dead if the XID is old enough. */ - return TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin); + return GlobalVisTestIsRemovableXid(vistest, + HeapTupleHeaderGetRawXmax(tuple)); } /* diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 256df4de1050..00a3cb106aac 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -23,12 +23,30 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "utils/snapmgr.h" #include "utils/rel.h" #include "utils/snapmgr.h" /* Working data for heap_page_prune and subroutines */ typedef struct { + Relation rel; + + /* tuple visibility test, initialized for the relation */ + GlobalVisState *vistest; + + /* + * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have + * been computed (done on demand, and only if + * OldSnapshotThresholdActive()). The first time a tuple is about to be + * removed based on the limited horizon, old_snap_used is set to true, and + * SetOldSnapshotThresholdTimestamp() is called. See + * heap_prune_satisfies_vacuum(). 
+ */ + TimestampTz old_snap_ts; + TransactionId old_snap_xmin; + bool old_snap_used; + TransactionId new_prune_xid; /* new prune hint value for page */ TransactionId latestRemovedXid; /* latest xid to be removed by this prune */ int nredirected; /* numbers of entries in arrays below */ @@ -43,9 +61,8 @@ typedef struct } PruneState; /* Local functions */ -static int heap_prune_chain(Relation relation, Buffer buffer, +static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, - TransactionId OldestXmin, PruneState *prstate); static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); static void heap_prune_record_redirect(PruneState *prstate, @@ -65,16 +82,16 @@ static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); * if there's not any use in pruning. * * Caller must have pin on the buffer, and must *not* have a lock on it. - * - * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD - * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). */ void heap_page_prune_opt(Relation relation, Buffer buffer) { Page page = BufferGetPage(buffer); + TransactionId prune_xid; + GlobalVisState *vistest; + TransactionId limited_xmin = InvalidTransactionId; + TimestampTz limited_ts = 0; Size minfree; - TransactionId OldestXmin; /* * We can't write WAL in recovery mode, so there's no point trying to @@ -85,37 +102,55 @@ heap_page_prune_opt(Relation relation, Buffer buffer) return; /* - * Use the appropriate xmin horizon for this relation. If it's a proper - * catalog relation or a user defined, additional, catalog relation, we - * need to use the horizon that includes slots, otherwise the data-only - * horizon can be used. Note that the toast relation of user defined - * relations are *not* considered catalog relations. + * XXX: Magic to keep old_snapshot_threshold tests appear "working". They + * currently are broken, and discussion of what to do about them is + * ongoing. 
See + * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de + */ + if (old_snapshot_threshold == 0) + SnapshotTooOldMagicForTest(); + + /* + * First check whether there's any chance there's something to prune, + * determining the appropriate horizon is a waste if there's no prune_xid + * (i.e. no updates/deletes left potentially dead tuples around). + */ + prune_xid = ((PageHeader) page)->pd_prune_xid; + if (!TransactionIdIsValid(prune_xid)) + return; + + /* + * Check whether prune_xid indicates that there may be dead rows that can + * be cleaned up. * - * It is OK to apply the old snapshot limit before acquiring the cleanup + * It is OK to check the old snapshot limit before acquiring the cleanup * lock because the worst that can happen is that we are not quite as * aggressive about the cleanup (by however many transaction IDs are * consumed between this point and acquiring the lock). This allows us to * save significant overhead in the case where the page is found not to be * prunable. + * + * Even if old_snapshot_threshold is set, we first check whether the page + * can be pruned without. Both because + * TransactionIdLimitedForOldSnapshots() is not cheap, and because not + * unnecessarily relying on old_snapshot_threshold avoids causing + * conflicts. */ - if (IsCatalogRelation(relation) || - RelationIsAccessibleInLogicalDecoding(relation)) - OldestXmin = RecentGlobalXmin; - else - OldestXmin = - TransactionIdLimitedForOldSnapshots(RecentGlobalDataXmin, - relation); + vistest = GlobalVisTestFor(relation); - Assert(TransactionIdIsValid(OldestXmin)); + if (!GlobalVisTestIsRemovableXid(vistest, prune_xid)) + { + if (!OldSnapshotThresholdActive()) + return; - /* - * Let's see if we really need pruning. - * - * Forget it if page is not hinted to contain something prunable that's - * older than OldestXmin. 
- */ - if (!PageIsPrunable(page, OldestXmin)) - return; + if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest), + relation, + &limited_xmin, &limited_ts)) + return; + + if (!TransactionIdPrecedes(prune_xid, limited_xmin)) + return; + } /* * We prune when a previous UPDATE failed to find enough space on the page @@ -151,7 +186,9 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * needed */ /* OK to prune */ - (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore); + (void) heap_page_prune(relation, buffer, vistest, + limited_xmin, limited_ts, + true, &ignore); } /* And release buffer lock */ @@ -165,8 +202,11 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * * Caller must have pin and buffer cleanup lock on the page. * - * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD - * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). + * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD + * (see heap_prune_satisfies_vacuum and + * HeapTupleSatisfiesVacuum). old_snap_xmin / old_snap_ts need to + * either have been set by TransactionIdLimitedForOldSnapshots, or + * InvalidTransactionId/0 respectively. * * If report_stats is true then we send the number of reclaimed heap-only * tuples to pgstats. (This must be false during vacuum, since vacuum will @@ -177,7 +217,10 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * latestRemovedXid. */ int -heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, +heap_page_prune(Relation relation, Buffer buffer, + GlobalVisState *vistest, + TransactionId old_snap_xmin, + TimestampTz old_snap_ts, bool report_stats, TransactionId *latestRemovedXid) { int ndeleted = 0; @@ -198,6 +241,11 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, * initialize the rest of our working state. 
 	 */
 	prstate.new_prune_xid = InvalidTransactionId;
+	prstate.rel = relation;
+	prstate.vistest = vistest;
+	prstate.old_snap_xmin = old_snap_xmin;
+	prstate.old_snap_ts = old_snap_ts;
+	prstate.old_snap_used = false;
 	prstate.latestRemovedXid = *latestRemovedXid;
 	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
 	memset(prstate.marked, 0, sizeof(prstate.marked));
@@ -220,9 +268,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
 			continue;
 
 		/* Process this item or chain of items */
-		ndeleted += heap_prune_chain(relation, buffer, offnum,
-									 OldestXmin,
-									 &prstate);
+		ndeleted += heap_prune_chain(buffer, offnum, &prstate);
 	}
 
 	/* Any error while applying the changes is critical */
@@ -323,6 +369,85 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
 }
 
 
+/*
+ * Perform visibility checks for heap pruning.
+ *
+ * This is more complicated than just using GlobalVisTestIsRemovableXid()
+ * because of old_snapshot_threshold. We only want to increase the threshold
+ * that triggers errors for old snapshots when we actually decide to remove a
+ * row based on the limited horizon.
+ *
+ * Due to its cost we also only want to call
+ * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have
+ * done so in heap_page_prune_opt() if pd_prune_xid was old enough. But we
+ * still want to be able to remove rows that are too new to be removed
+ * according to prstate->vistest, but that can be removed based on
+ * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on
+ * demand in here, if appropriate.
+ */
+static HTSV_Result
+heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
+{
+	HTSV_Result res;
+	TransactionId dead_after;
+
+	res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);
+
+	if (res != HEAPTUPLE_RECENTLY_DEAD)
+		return res;
+
+	/*
+	 * If we are already relying on the limited xmin, there is no need to
+	 * delay doing so anymore.
+	 */
+	if (prstate->old_snap_used)
+	{
+		Assert(TransactionIdIsValid(prstate->old_snap_xmin));
+
+		if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
+			res = HEAPTUPLE_DEAD;
+		return res;
+	}
+
+	/*
+	 * First check if GlobalVisTestIsRemovableXid() is sufficient to find the
+	 * row dead. If not, and old_snapshot_threshold is enabled, try to use the
+	 * lowered horizon.
+	 */
+	if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
+		res = HEAPTUPLE_DEAD;
+	else if (OldSnapshotThresholdActive())
+	{
+		/* haven't determined the limited horizon yet, request it */
+		if (!TransactionIdIsValid(prstate->old_snap_xmin))
+		{
+			TransactionId horizon =
+			GlobalVisTestNonRemovableHorizon(prstate->vistest);
+
+			TransactionIdLimitedForOldSnapshots(horizon, prstate->rel,
+												&prstate->old_snap_xmin,
+												&prstate->old_snap_ts);
+		}
+
+		if (TransactionIdIsValid(prstate->old_snap_xmin) &&
+			TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
+		{
+			/*
+			 * About to remove row based on snapshot_too_old. Need to raise
+			 * the threshold so problematic accesses would error.
+			 */
+			Assert(!prstate->old_snap_used);
+			SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts,
+											 prstate->old_snap_xmin);
+			prstate->old_snap_used = true;
+			res = HEAPTUPLE_DEAD;
+		}
+	}
+
+	return res;
+}
+
+
 /*
  * Prune specified line pointer or a HOT chain originating at line pointer.
  *
@@ -349,9 +474,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
  * Returns the number of tuples (to be) deleted from the page.
*/ static int -heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, - TransactionId OldestXmin, - PruneState *prstate) +heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) { int ndeleted = 0; Page dp = (Page) BufferGetPage(buffer); @@ -366,7 +489,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, i; HeapTupleData tup; - tup.t_tableOid = RelationGetRelid(relation); + tup.t_tableOid = RelationGetRelid(prstate->rel); rootlp = PageGetItemId(dp, rootoffnum); @@ -401,7 +524,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, * either here or while following a chain below. Whichever path * gets there first will mark the tuple unused. */ - if (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer) + if (heap_prune_satisfies_vacuum(prstate, &tup, buffer) == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); @@ -485,7 +608,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, */ tupdead = recent_dead = false; - switch (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer)) + switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer)) { case HEAPTUPLE_DEAD: tupdead = true; diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 1bbc4598f75e..44e2224dd557 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -788,6 +788,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, PROGRESS_VACUUM_MAX_DEAD_TUPLES }; int64 initprog_val[3]; + GlobalVisState *vistest; pg_rusage_init(&ru0); @@ -816,6 +817,8 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; + vistest = GlobalVisTestFor(onerel); + /* * Initialize state for a parallel vacuum. 
As of now, only one worker can * be used for an index, so we invoke parallelism only if there are at @@ -1239,7 +1242,8 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, * * We count tuples removed by the pruning step as removed by VACUUM. */ - tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, + tups_vacuumed += heap_page_prune(onerel, buf, vistest, false, + InvalidTransactionId, 0, &vacrelstats->latestRemovedXid); /* @@ -1596,14 +1600,16 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, } /* - * It's possible for the value returned by GetOldestXmin() to move - * backwards, so it's not wrong for us to see tuples that appear to - * not be visible to everyone yet, while PD_ALL_VISIBLE is already - * set. The real safe xmin value never moves backwards, but - * GetOldestXmin() is conservative and sometimes returns a value - * that's unnecessarily small, so if we see that contradiction it just - * means that the tuples that we think are not visible to everyone yet - * actually are, and the PD_ALL_VISIBLE flag is correct. + * It's possible for the value returned by + * GetOldestNonRemovableTransactionId() to move backwards, so it's not + * wrong for us to see tuples that appear to not be visible to + * everyone yet, while PD_ALL_VISIBLE is already set. The real safe + * xmin value never moves backwards, but + * GetOldestNonRemovableTransactionId() is conservative and sometimes + * returns a value that's unnecessarily small, so if we see that + * contradiction it just means that the tuples that we think are not + * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag + * is correct. * * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. 
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 6b9750c244a7..3fb8688f8f4c 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -519,7 +519,8 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) SCAN_CHECKS; CHECK_SCAN_PROCEDURE(amgettuple); - Assert(TransactionIdIsValid(RecentGlobalXmin)); + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); /* * The AM's amgettuple proc finds the next index entry matching the scan diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index abce31a5a96b..781a8f1932d3 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -342,9 +342,9 @@ snapshots and registered snapshots as of the deletion are gone; which is overly strong, but is simple to implement within Postgres. When marked dead, a deleted page is labeled with the next-transaction counter value. VACUUM can reclaim the page for re-use when this transaction number is -older than RecentGlobalXmin. As collateral damage, this implementation -also waits for running XIDs with no snapshots and for snapshots taken -until the next transaction to allocate an XID commits. +guaranteed to be "visible to everyone". As collateral damage, this +implementation also waits for running XIDs with no snapshots and for +snapshots taken until the next transaction to allocate an XID commits. Reclaiming a page doesn't actually change its state on disk --- we simply record it in the shared-memory free space map, from which it will be @@ -411,8 +411,8 @@ page and also the correct place to hold the current value. We can avoid the cost of walking down the tree in such common cases. The optimization works on the assumption that there can only be one -non-ignorable leaf rightmost page, and so even a RecentGlobalXmin style -interlock isn't required. 
We cannot fail to detect that our hint was +non-ignorable leaf rightmost page, and so not even a visible-to-everyone +style interlock required. We cannot fail to detect that our hint was invalidated, because there can only be one such page in the B-Tree at any time. It's possible that the page will be deleted and recycled without a backend's cached page also being detected as invalidated, but diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index d5db9aaa3a13..74be3807bb7d 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1097,7 +1097,7 @@ _bt_page_recyclable(Page page) */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISDELETED(opaque) && - TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin)) + GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact)) return true; return false; } @@ -2318,7 +2318,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * updated links to the target, ReadNewTransactionId() suffices as an * upper bound. Any scan having retained a now-stale link is advertising * in its PGXACT an xmin less than or equal to the value we read here. It - * will continue to do so, holding back RecentGlobalXmin, for the duration + * will continue to do so, holding back the xmin horizon, for the duration * of that scan. */ page = BufferGetPage(buf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 49a8a9708e38..8fa6ac7296b9 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -808,6 +808,12 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); + /* + * XXX: If IndexVacuumInfo contained the heap relation, we could be more + * aggressive about vacuuming non catalog relations by passing the table + * to GlobalVisCheckRemovableXid(). 
+ */ + if (metad->btm_version < BTREE_NOVAC_VERSION) { /* @@ -817,13 +823,12 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) result = true; } else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && - TransactionIdPrecedes(metad->btm_oldest_btpo_xact, - RecentGlobalXmin)) + GlobalVisCheckRemovableXid(NULL, metad->btm_oldest_btpo_xact)) { /* * If any oldest btpo.xact from a previously deleted page in the index - * is older than RecentGlobalXmin, then at least one deleted page can - * be recycled -- don't skip cleanup. + * is visible to everyone, then at least one deleted page can be + * recycled -- don't skip cleanup. */ result = true; } @@ -1276,14 +1281,13 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) * own conflict now.) * * Backends with snapshots acquired after a VACUUM starts but - * before it finishes could have a RecentGlobalXmin with a - * later xid than the VACUUM's OldestXmin cutoff. These - * backends might happen to opportunistically mark some index - * tuples LP_DEAD before we reach them, even though they may - * be after our cutoff. We don't try to kill these "extra" - * index tuples in _bt_delitems_vacuum(). This keep things - * simple, and allows us to always avoid generating our own - * conflicts. + * before it finishes could have visibility cutoff with a + * later xid than VACUUM's OldestXmin cutoff. These backends + * might happen to opportunistically mark some index tuples + * LP_DEAD before we reach them, even though they may be after + * our cutoff. We don't try to kill these "extra" index + * tuples in _bt_delitems_vacuum(). This keep things simple, + * and allows us to always avoid generating our own conflicts. 
*/ Assert(!BTreeTupleIsPivot(itup)); if (!BTreeTupleIsPosting(itup)) diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index dbec58d5249c..bda9be234896 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -948,11 +948,11 @@ btree_xlog_reuse_page(XLogReaderState *record) * Btree reuse_page records exist to provide a conflict point when we * reuse pages in the index via the FSM. That's all they do though. * - * latestRemovedXid was the page's btpo.xact. The btpo.xact < - * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors the - * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs(). - * Consequently, one XID value achieves the same exclusion effect on - * primary and standby. + * latestRemovedXid was the page's btpo.xact. The + * GlobalVisCheckRemovableXid test in _bt_page_recyclable() conceptually + * mirrors the pgxact->xmin > limitXmin test in + * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the + * same exclusion effect on primary and standby. 
*/ if (InHotStandby) { diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index bd98707f3c05..e1c58933f979 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -501,10 +501,14 @@ vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber itemnos[MaxIndexTuplesPerPage]; spgxlogVacuumRedirect xlrec; + GlobalVisState *vistest; xlrec.nToPlaceholder = 0; xlrec.newestRedirectXid = InvalidTransactionId; + /* XXX: providing heap relation would allow more pruning */ + vistest = GlobalVisTestFor(NULL); + START_CRIT_SECTION(); /* @@ -521,7 +525,7 @@ vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i)); if (dt->tupstate == SPGIST_REDIRECT && - TransactionIdPrecedes(dt->xid, RecentGlobalXmin)) + GlobalVisTestIsRemovableXid(vistest, dt->xid)) { dt->tupstate = SPGIST_PLACEHOLDER; Assert(opaque->nRedirection > 0); diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index eb9aac5fd396..6f44ae9ce6a5 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -281,7 +281,7 @@ present or the overflow flag is set.) If a backend released XidGenLock before storing its XID into MyPgXact, then it would be possible for another backend to allocate and commit a later XID, causing latestCompletedXid to pass the first backend's XID, before that value became visible in the -ProcArray. That would break GetOldestXmin, as discussed below. +ProcArray. That would break ComputeXidHorizons, as discussed below. We allow GetNewTransactionId to store the XID into MyPgXact->xid (or the subxid array) without taking ProcArrayLock. This was once necessary to @@ -293,42 +293,50 @@ once, rather than assume they can read it multiple times and get the same answer each time. 
(Use volatile-qualified pointers when doing this, to ensure that the C compiler does exactly what you tell it to.) -Another important activity that uses the shared ProcArray is GetOldestXmin, -which must determine a lower bound for the oldest xmin of any active MVCC -snapshot, system-wide. Each individual backend advertises the smallest -xmin of its own snapshots in MyPgXact->xmin, or zero if it currently has no -live snapshots (eg, if it's between transactions or hasn't yet set a -snapshot for a new transaction). GetOldestXmin takes the MIN() of the -valid xmin fields. It does this with only shared lock on ProcArrayLock, -which means there is a potential race condition against other backends -doing GetSnapshotData concurrently: we must be certain that a concurrent -backend that is about to set its xmin does not compute an xmin less than -what GetOldestXmin returns. We ensure that by including all the active -XIDs into the MIN() calculation, along with the valid xmins. The rule that -transactions can't exit without taking exclusive ProcArrayLock ensures that -concurrent holders of shared ProcArrayLock will compute the same minimum of -currently-active XIDs: no xact, in particular not the oldest, can exit -while we hold shared ProcArrayLock. So GetOldestXmin's view of the minimum -active XID will be the same as that of any concurrent GetSnapshotData, and -so it can't produce an overestimate. If there is no active transaction at -all, GetOldestXmin returns latestCompletedXid + 1, which is a lower bound -for the xmin that might be computed by concurrent or later GetSnapshotData -calls. (We know that no XID less than this could be about to appear in -the ProcArray, because of the XidGenLock interlock discussed above.) - -GetSnapshotData also performs an oldest-xmin calculation (which had better -match GetOldestXmin's) and stores that into RecentGlobalXmin, which is used -for some tuple age cutoff checks where a fresh call of GetOldestXmin seems -too expensive. 
Note that while it is certain that two concurrent -executions of GetSnapshotData will compute the same xmin for their own -snapshots, as argued above, it is not certain that they will arrive at the -same estimate of RecentGlobalXmin. This is because we allow XID-less -transactions to clear their MyPgXact->xmin asynchronously (without taking -ProcArrayLock), so one execution might see what had been the oldest xmin, -and another not. This is OK since RecentGlobalXmin need only be a valid -lower bound. As noted above, we are already assuming that fetch/store -of the xid fields is atomic, so assuming it for xmin as well is no extra -risk. +Another important activity that uses the shared ProcArray is +ComputeXidHorizons, which must determine a lower bound for the oldest xmin +of any active MVCC snapshot, system-wide. Each individual backend +advertises the smallest xmin of its own snapshots in MyPgXact->xmin, or zero +if it currently has no live snapshots (eg, if it's between transactions or +hasn't yet set a snapshot for a new transaction). ComputeXidHorizons takes +the MIN() of the valid xmin fields. It does this with only shared lock on +ProcArrayLock, which means there is a potential race condition against other +backends doing GetSnapshotData concurrently: we must be certain that a +concurrent backend that is about to set its xmin does not compute an xmin +less than what ComputeXidHorizons determines. We ensure that by including +all the active XIDs into the MIN() calculation, along with the valid xmins. +The rule that transactions can't exit without taking exclusive ProcArrayLock +ensures that concurrent holders of shared ProcArrayLock will compute the +same minimum of currently-active XIDs: no xact, in particular not the +oldest, can exit while we hold shared ProcArrayLock. So +ComputeXidHorizons's view of the minimum active XID will be the same as that +of any concurrent GetSnapshotData, and so it can't produce an overestimate. 
+If there is no active transaction at all, ComputeXidHorizons uses +latestCompletedXid + 1, which is a lower bound for the xmin that might +be computed by concurrent or later GetSnapshotData calls. (We know that no +XID less than this could be about to appear in the ProcArray, because of the +XidGenLock interlock discussed above.) + +As GetSnapshotData is performance critical, it does not perform an accurate +oldest-xmin calculation (it used to, until v13). The contents of a snapshot +only depend on the xids of other backends, not their xmin. As backend's xmin +changes much more often than its xid, having GetSnapshotData look at xmins +can lead to a lot of unnecessary cacheline ping-pong. Instead +GetSnapshotData updates approximate thresholds (one that guarantees that all +deleted rows older than it can be removed, another determining that deleted +rows newer than it can not be removed). GlobalVisTest* uses those threshold +to make invisibility decision, falling back to ComputeXidHorizons if +necessary. + +Note that while it is certain that two concurrent executions of +GetSnapshotData will compute the same xmin for their own snapshots, there is +no such guarantee for the horizons computed by ComputeXidHorizons. This is +because we allow XID-less transactions to clear their MyPgXact->xmin +asynchronously (without taking ProcArrayLock), so one execution might see +what had been the oldest xmin, and another not. This is OK since the +thresholds need only be a valid lower bound. As noted above, we are already +assuming that fetch/store of the xid fields is atomic, so assuming it for +xmin as well is no extra risk. pg_xact and pg_subtrans diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8f72faee82cc..09c01ed4ae48 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9096,7 +9096,7 @@ CreateCheckPoint(int flags) * StartupSUBTRANS hasn't been called yet. 
*/ if (!RecoveryInProgress()) - TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT)); + TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); /* Real work is done, but log and update stats before releasing lock. */ LogCheckpointEnd(false); @@ -9456,7 +9456,7 @@ CreateRestartPoint(int flags) * this because StartupSUBTRANS hasn't been called yet. */ if (EnableHotStandby) - TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT)); + TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); /* Real work is done, but log and update before releasing lock. */ LogCheckpointEnd(true); diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index e0fa73ba7909..8af12b5c6b2b 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1045,7 +1045,7 @@ acquire_sample_rows(Relation onerel, int elevel, totalblocks = RelationGetNumberOfBlocks(onerel); /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */ - OldestXmin = GetOldestXmin(onerel, PROCARRAY_FLAGS_VACUUM); + OldestXmin = GetOldestNonRemovableTransactionId(onerel); /* Prepare for sampling block numbers */ nblocks = BlockSampler_Init(&bs, totalblocks, targrows, random()); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 576c7e63e99a..22228f5684f0 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -955,8 +955,25 @@ vacuum_set_xid_limits(Relation rel, * working on a particular table at any time, and that each vacuum is * always an independent transaction. 
*/ - *oldestXmin = - TransactionIdLimitedForOldSnapshots(GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM), rel); + *oldestXmin = GetOldestNonRemovableTransactionId(rel); + + if (OldSnapshotThresholdActive()) + { + TransactionId limit_xmin; + TimestampTz limit_ts; + + if (TransactionIdLimitedForOldSnapshots(*oldestXmin, rel, &limit_xmin, &limit_ts)) + { + /* + * TODO: We should only set the threshold if we are pruning on the + * basis of the increased limits. Not as crucial here as it is for + * opportunistic pruning (which often happens at a much higher + * frequency), but would still be a significant improvement. + */ + SetOldSnapshotThresholdTimestamp(limit_ts, limit_xmin); + *oldestXmin = limit_xmin; + } + } Assert(TransactionIdIsNormal(*oldestXmin)); @@ -1345,12 +1362,13 @@ vac_update_datfrozenxid(void) bool dirty = false; /* - * Initialize the "min" calculation with GetOldestXmin, which is a - * reasonable approximation to the minimum relfrozenxid for not-yet- - * committed pg_class entries for new tables; see AddNewRelationTuple(). - * So we cannot produce a wrong minimum by starting with this. + * Initialize the "min" calculation with + * GetOldestNonRemovableTransactionId(), which is a reasonable + * approximation to the minimum relfrozenxid for not-yet-committed + * pg_class entries for new tables; see AddNewRelationTuple(). So we + * cannot produce a wrong minimum by starting with this. */ - newFrozenXid = GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM); + newFrozenXid = GetOldestNonRemovableTransactionId(NULL); /* * Similarly, initialize the MultiXact "min" with the value that would be @@ -1681,8 +1699,9 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params) StartTransactionCommand(); /* - * Functions in indexes may want a snapshot set. Also, setting a snapshot - * ensures that RecentGlobalXmin is kept truly recent. 
+ * Need to acquire a snapshot to prevent pg_subtrans from being truncated, + * cutoff xids in local memory wrapping around, and to have updated xmin + * horizons. */ PushActiveSnapshot(GetTransactionSnapshot()); @@ -1705,8 +1724,8 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params) * * Note: these flags remain set until CommitTransaction or * AbortTransaction. We don't want to clear them until we reset - * MyPgXact->xid/xmin, else OldestXmin might appear to go backwards, - * which is probably Not Good. + * MyPgXact->xid/xmin, otherwise GetOldestNonRemovableTransactionId() + * might appear to go backwards, which is probably Not Good. */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); MyPgXact->vacuumFlags |= PROC_IN_VACUUM; diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 9c7d4b0c60e4..ac97e28be19c 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -1877,6 +1877,10 @@ get_database_list(void) * the secondary effect that it sets RecentGlobalXmin. (This is critical * for anything that reads heap pages, because HOT may decide to prune * them even if the process doesn't attempt to modify any tuples.) + * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). */ StartTransactionCommand(); (void) GetTransactionSnapshot(); diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index ff985b9b24ca..bdaf0312d63d 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -122,6 +122,10 @@ get_subscription_list(void) * the secondary effect that it sets RecentGlobalXmin. (This is critical * for anything that reads heap pages, because HOT may decide to prune * them even if the process doesn't attempt to modify any tuples.) 
+ * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). */ StartTransactionCommand(); (void) GetTransactionSnapshot(); diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index d5a9b568a682..7c11e1ab44cb 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -1181,22 +1181,7 @@ XLogWalRcvSendHSFeedback(bool immed) */ if (hot_standby_feedback) { - TransactionId slot_xmin; - - /* - * Usually GetOldestXmin() would include both global replication slot - * xmin and catalog_xmin in its calculations, but we want to derive - * separate values for each of those. So we ask for an xmin that - * excludes the catalog_xmin. - */ - xmin = GetOldestXmin(NULL, - PROCARRAY_FLAGS_DEFAULT | PROCARRAY_SLOTS_XMIN); - - ProcArrayGetReplicationSlotXmin(&slot_xmin, &catalog_xmin); - - if (TransactionIdIsValid(slot_xmin) && - TransactionIdPrecedes(slot_xmin, xmin)) - xmin = slot_xmin; + GetReplicationHorizons(&xmin, &catalog_xmin); } else { diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index d13220c14008..460ca3f947f4 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2113,9 +2113,10 @@ ProcessStandbyHSFeedbackMessage(void) /* * Set the WalSender's xmin equal to the standby's requested xmin, so that - * the xmin will be taken into account by GetOldestXmin. This will hold - * back the removal of dead rows and thereby prevent the generation of - * cleanup conflicts on the standby server. + * the xmin will be taken into account by GetSnapshotData() / + * ComputeXidHorizons(). This will hold back the removal of dead rows and + * thereby prevent the generation of cleanup conflicts on the standby + * server. 
* * There is a small window for a race condition here: although we just * checked that feedbackXmin precedes nextXid, the nextXid could have @@ -2128,10 +2129,10 @@ ProcessStandbyHSFeedbackMessage(void) * own xmin would prevent nextXid from advancing so far. * * We don't bother taking the ProcArrayLock here. Setting the xmin field - * is assumed atomic, and there's no real need to prevent a concurrent - * GetOldestXmin. (If we're moving our xmin forward, this is obviously - * safe, and if we're moving it backwards, well, the data is at risk - * already since a VACUUM could have just finished calling GetOldestXmin.) + * is assumed atomic, and there's no real need to prevent concurrent + * horizon determinations. (If we're moving our xmin forward, this is + * obviously safe, and if we're moving it backwards, well, the data is at + * risk already since a VACUUM could already have determined the horizon.) * * If we're using a replication slot we reserve the xmin via that, * otherwise via the walsender's PGXACT entry. We can only track the diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 522518695eec..e582d5af4291 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -99,6 +99,142 @@ typedef struct ProcArrayStruct int pgprocnos[FLEXIBLE_ARRAY_MEMBER]; } ProcArrayStruct; +/* + * State for the GlobalVisTest* family of functions. Those functions can + * e.g. be used to decide if a deleted row can be removed without violating + * MVCC semantics: If the deleted row's xmax is not considered to be running + * by anyone, the row can be removed. + * + * To avoid slowing down GetSnapshotData(), we don't calculate a precise + * cutoff XID while building a snapshot (looking at the frequently changing + * xmins scales badly). 
Instead we compute two boundaries while building the
+ * snapshot:
+ *
+ * 1) definitely_needed, indicating that rows deleted by XIDs >=
+ *    definitely_needed are definitely still visible.
+ *
+ * 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can
+ *    definitely be removed
+ *
+ * When testing an XID that falls in between the two (i.e. XID >= maybe_needed
+ * && XID < definitely_needed), the boundaries can be recomputed (using
+ * ComputeXidHorizons()) to get a more accurate answer. This is cheaper than
+ * maintaining an accurate value all the time.
+ *
+ * As it is not cheap to compute accurate boundaries, we limit the number of
+ * times that happens in short succession. See GlobalVisTestShouldUpdate().
+ *
+ *
+ * There are three backend lifetime instances of this struct, optimized for
+ * different types of relations. As e.g. a normal user defined table in one
+ * database is inaccessible to backends connected to another database, a test
+ * specific to a relation can be more aggressive than a test for a shared
+ * relation. Currently we track three different states:
+ *
+ * 1) GlobalVisSharedRels, which only considers an XID's
+ *    effects visible-to-everyone if neither snapshots in any database, nor a
+ *    replication slot's xmin, nor a replication slot's catalog_xmin might
+ *    still consider XID as running.
+ *
+ * 2) GlobalVisCatalogRels, which only considers an XID's
+ *    effects visible-to-everyone if neither snapshots in the current
+ *    database, nor a replication slot's xmin, nor a replication slot's
+ *    catalog_xmin might still consider XID as running.
+ *
+ *    I.e. the difference to GlobalVisSharedRels is that
+ *    snapshots in other databases are ignored.
+ *
+ * 3) GlobalVisDataRels, which only considers an XID's
+ *    effects visible-to-everyone if neither snapshots in the current
+ *    database, nor a replication slot's xmin consider XID as running.
+ *
+ *    I.e. 
the difference to GlobalVisCatalogRels is that + * replication slot's catalog_xmin is not taken into account. + * + * GlobalVisTestFor(relation) returns the appropriate state + * for the relation. + * + * The boundaries are FullTransactionIds instead of TransactionIds to avoid + * wraparound dangers. There e.g. would otherwise exist no procarray state to + * prevent maybe_needed from becoming old enough after the GetSnapshotData() + * call. + * + * The typedef is in the header. + */ +struct GlobalVisState +{ + /* XIDs >= are considered running by some backend */ + FullTransactionId definitely_needed; + + /* XIDs < are not considered to be running by any backend */ + FullTransactionId maybe_needed; +}; + +/* + * Result of ComputeXidHorizons(). + */ +typedef struct ComputeXidHorizonsResult +{ + /* + * The value of ShmemVariableCache->latestCompletedXid when + * ComputeXidHorizons() held ProcArrayLock. + */ + FullTransactionId latest_completed; + + /* + * The same for procArray->replication_slot_xmin and + * procArray->replication_slot_catalog_xmin. + */ + TransactionId slot_xmin; + TransactionId slot_catalog_xmin; + + /* + * Oldest xid that any backend might still consider running. This needs to + * include processes running VACUUM, in contrast to the normal visibility + * cutoffs, as vacuum needs to be able to perform pg_subtrans lookups when + * determining visibility, but doesn't care about rows above its xmin to + * be removed. + * + * This likely should only be needed to determine whether pg_subtrans can + * be truncated. It currently includes the effects of replication slots, + * for historical reasons. But that could likely be changed. + */ + TransactionId oldest_considered_running; + + /* + * Oldest xid for which deleted tuples need to be retained in shared + * tables. + * + * This includes the effects of replication slots. 
If that's not desired, + * look at shared_oldest_nonremovable_raw. + */ + TransactionId shared_oldest_nonremovable; + + /* + * Oldest xid that may be necessary to retain in shared tables. This is + * the same as shared_oldest_nonremovable, except that it is not affected by + * replication slot's catalog_xmin. + * + * This is mainly useful to be able to send the catalog_xmin to upstream + * streaming replication servers via hot_standby_feedback, so they can + * apply the limit only when accessing catalog tables. + */ + TransactionId shared_oldest_nonremovable_raw; + + /* + * Oldest xid for which deleted tuples need to be retained in non-shared + * catalog tables. + */ + TransactionId catalog_oldest_nonremovable; + + /* + * Oldest xid for which deleted tuples need to be retained in normal user + * defined tables. + */ + TransactionId data_oldest_nonremovable; +} ComputeXidHorizonsResult; + + static ProcArrayStruct *procArray; static PGPROC *allProcs; @@ -118,6 +254,22 @@ static TransactionId latestObservedXid = InvalidTransactionId; */ static TransactionId standbySnapshotPendingXmin; +/* + * State for visibility checks on different types of relations. See struct + * GlobalVisState for details. As shared, catalog, and user defined + * relations can have different horizons, one such state exists for each. + */ +static GlobalVisState GlobalVisSharedRels; +static GlobalVisState GlobalVisCatalogRels; +static GlobalVisState GlobalVisDataRels; + +/* + * This backend's RecentXmin at the last time the accurate xmin horizon was + * recomputed, or InvalidTransactionId if it has not. Used to limit how many + * times accurate horizons are recomputed. See GlobalVisTestShouldUpdate(). 
+ */ +static TransactionId ComputeXidHorizonsResultLastXmin; + #ifdef XIDCACHE_DEBUG /* counters for XidCache measurement */ @@ -180,6 +332,7 @@ static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel, TransactionId xid); +static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons); /* * Report shared-memory space needed by CreateSharedProcArray. @@ -1302,159 +1455,191 @@ TransactionIdIsActive(TransactionId xid) /* - * GetOldestXmin -- returns oldest transaction that was running - * when any current transaction was started. + * Determine XID horizons. * - * If rel is NULL or a shared relation, all backends are considered, otherwise - * only backends running in this database are considered. + * This is used by wrapper functions like GetOldestNonRemovableTransactionId() + * (for VACUUM), GetReplicationHorizons() (for hot_standby_feedback), etc. as + * well as "internally" by GlobalVisUpdate() (see comment above struct + * GlobalVisState). * - * The flags are used to ignore the backends in calculation when any of the - * corresponding flags is set. Typically, if you want to ignore ones with - * PROC_IN_VACUUM flag, you can use PROCARRAY_FLAGS_VACUUM. + * See the definition of ComputeXidHorizonsResult for the various computed + * horizons. * - * PROCARRAY_SLOTS_XMIN causes GetOldestXmin to ignore the xmin and - * catalog_xmin of any replication slots that exist in the system when - * calculating the oldest xmin. + * For VACUUM separate horizons (used to decide which deleted tuples must + * be preserved), for shared and non-shared tables are computed. For shared + * relations backends in all databases must be considered, but for non-shared + * relations that's not required, since only backends in my own database could + * ever see the tuples in them. 
Also, we can ignore concurrently running lazy + * VACUUMs because (a) they must be working on other tables, and (b) they + * don't need to do snapshot-based lookups. * - * This is used by VACUUM to decide which deleted tuples must be preserved in - * the passed in table. For shared relations backends in all databases must be - * considered, but for non-shared relations that's not required, since only - * backends in my own database could ever see the tuples in them. Also, we can - * ignore concurrently running lazy VACUUMs because (a) they must be working - * on other tables, and (b) they don't need to do snapshot-based lookups. - * - * This is also used to determine where to truncate pg_subtrans. For that - * backends in all databases have to be considered, so rel = NULL has to be - * passed in. + * This also computes a horizon used to truncate pg_subtrans. For that + * backends in all databases have to be considered, and concurrently running + * lazy VACUUMs cannot be ignored, as they still may perform pg_subtrans + * accesses. * * Note: we include all currently running xids in the set of considered xids. * This ensures that if a just-started xact has not yet set its snapshot, * when it does set the snapshot it cannot set xmin less than what we compute. * See notes in src/backend/access/transam/README. * - * Note: despite the above, it's possible for the calculated value to move - * backwards on repeated calls. The calculated value is conservative, so that - * anything older is definitely not considered as running by anyone anymore, - * but the exact value calculated depends on a number of things. For example, - * if rel = NULL and there are no transactions running in the current - * database, GetOldestXmin() returns latestCompletedXid. If a transaction + * Note: despite the above, it's possible for the calculated values to move + * backwards on repeated calls. 
The calculated values are conservative, so + * that anything older is definitely not considered as running by anyone + * anymore, but the exact values calculated depend on a number of things. For + * example, if there are no transactions running in the current database, the + * horizon for normal tables will be latestCompletedXid. If a transaction * begins after that, its xmin will include in-progress transactions in other * databases that started earlier, so another call will return a lower value. * Nonetheless it is safe to vacuum a table in the current database with the * first result. There are also replication-related effects: a walsender * process can set its xmin based on transactions that are no longer running * on the primary but are still being replayed on the standby, thus possibly - * making the GetOldestXmin reading go backwards. In this case there is a - * possibility that we lose data that the standby would like to have, but - * unless the standby uses a replication slot to make its xmin persistent - * there is little we can do about that --- data is only protected if the - * walsender runs continuously while queries are executed on the standby. - * (The Hot Standby code deals with such cases by failing standby queries - * that needed to access already-removed data, so there's no integrity bug.) - * The return value is also adjusted with vacuum_defer_cleanup_age, so - * increasing that setting on the fly is another easy way to make - * GetOldestXmin() move backwards, with no consequences for data integrity. + * making the values go backwards. In this case there is a possibility that + * we lose data that the standby would like to have, but unless the standby + * uses a replication slot to make its xmin persistent there is little we can + * do about that --- data is only protected if the walsender runs continuously + * while queries are executed on the standby. 
(The Hot Standby code deals + * with such cases by failing standby queries that needed to access + * already-removed data, so there's no integrity bug.) The computed values + * are also adjusted with vacuum_defer_cleanup_age, so increasing that setting + * on the fly is another easy way to make horizons move backwards, with no + * consequences for data integrity. + * + * Note: the approximate horizons (see definition of GlobalVisState) are + * updated by the computations done here. That's currently required for + * correctness and a small optimization. Without doing so it's possible that + * heap vacuum's call to heap_page_prune() uses a more conservative horizon + * than later when deciding which tuples can be removed - which the code + * doesn't expect (breaking HOT). */ -TransactionId -GetOldestXmin(Relation rel, int flags) +static void +ComputeXidHorizons(ComputeXidHorizonsResult *h) { ProcArrayStruct *arrayP = procArray; - TransactionId result; - int index; - bool allDbs; - - TransactionId replication_slot_xmin = InvalidTransactionId; - TransactionId replication_slot_catalog_xmin = InvalidTransactionId; - - /* - * If we're not computing a relation specific limit, or if a shared - * relation has been passed in, backends in all databases have to be - * considered. - */ - allDbs = rel == NULL || rel->rd_rel->relisshared; + TransactionId kaxmin; + bool in_recovery = RecoveryInProgress(); - /* Cannot look for individual databases during recovery */ - Assert(allDbs || !RecoveryInProgress()); + /* inferred after ProcArrayLock is released */ + h->catalog_oldest_nonremovable = InvalidTransactionId; LWLockAcquire(ProcArrayLock, LW_SHARED); + h->latest_completed = ShmemVariableCache->latestCompletedXid; + /* * We initialize the MIN() calculation with latestCompletedXid + 1. This * is a lower bound for the XIDs that might appear in the ProcArray later, * and so protects us against overestimating the result due to future * additions. 
*/ - result = XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); - TransactionIdAdvance(result); - Assert(TransactionIdIsNormal(result)); + { + TransactionId initial; - for (index = 0; index < arrayP->numProcs; index++) + initial = XidFromFullTransactionId(h->latest_completed); + Assert(TransactionIdIsValid(initial)); + TransactionIdAdvance(initial); + + h->oldest_considered_running = initial; + h->shared_oldest_nonremovable = initial; + h->data_oldest_nonremovable = initial; + } + + /* + * Fetch slot horizons while ProcArrayLock is held - the + * LWLockAcquire/LWLockRelease are a barrier, ensuring this happens inside + * the lock. + */ + h->slot_xmin = procArray->replication_slot_xmin; + h->slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + + for (int index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; PGXACT *pgxact = &allPgXact[pgprocno]; + TransactionId xid; + TransactionId xmin; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(pgxact->xid); + xmin = UINT32_ACCESS_ONCE(pgxact->xmin); + + /* + * Consider both the transaction's Xmin, and its Xid. + * + * We must check both because a transaction might have an Xmin but not + * (yet) an Xid; conversely, if it has an Xid, that could determine + * some not-yet-set Xmin. + */ + xmin = TransactionIdOlder(xmin, xid); - if (pgxact->vacuumFlags & (flags & PROCARRAY_PROC_FLAGS_MASK)) + /* if neither is set, this proc doesn't influence the horizon */ + if (!TransactionIdIsValid(xmin)) continue; - if (allDbs || + /* + * Don't ignore any procs when determining which transactions might be + * considered running. While slots should ensure logical decoding + * backends are protected even without this check, it can't hurt to + * include them here as well.. 
+ */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, xmin); + + /* + * Skip over backends either vacuuming (which is ok with rows being + * removed, as long as pg_subtrans is not truncated) or doing logical + * decoding (which manages xmin separately, check below). + */ + if (pgxact->vacuumFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING)) + continue; + + /* shared tables need to take backends in all database into account */ + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, xmin); + + /* + * Normally queries in other databases are ignored for anything but + * the shared horizon. But in recovery we cannot compute an accurate + * per-database horizon as all xids are managed via the + * KnownAssignedXids machinery. + */ + if (in_recovery || proc->databaseId == MyDatabaseId || proc->databaseId == 0) /* always include WalSender */ { - /* Fetch xid just once - see GetNewTransactionId */ - TransactionId xid = UINT32_ACCESS_ONCE(pgxact->xid); - - /* First consider the transaction's own Xid, if any */ - if (TransactionIdIsNormal(xid) && - TransactionIdPrecedes(xid, result)) - result = xid; - - /* - * Also consider the transaction's Xmin, if set. - * - * We must check both Xid and Xmin because a transaction might - * have an Xmin but not (yet) an Xid; conversely, if it has an - * Xid, that could determine some not-yet-set Xmin. - */ - xid = UINT32_ACCESS_ONCE(pgxact->xmin); - if (TransactionIdIsNormal(xid) && - TransactionIdPrecedes(xid, result)) - result = xid; + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, xmin); } } /* - * Fetch into local variable while ProcArrayLock is held - the - * LWLockRelease below is a barrier, ensuring this happens inside the - * lock. + * If in recovery fetch oldest xid in KnownAssignedXids, will be applied + * after lock is released. 
*/ - replication_slot_xmin = procArray->replication_slot_xmin; - replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + if (in_recovery) + kaxmin = KnownAssignedXidsGetOldestXmin(); - if (RecoveryInProgress()) - { - /* - * Check to see whether KnownAssignedXids contains an xid value older - * than the main procarray. - */ - TransactionId kaxmin = KnownAssignedXidsGetOldestXmin(); - - LWLockRelease(ProcArrayLock); + /* + * No other information from shared state is needed, release the lock + * immediately. The rest of the computations can be done without a lock. + */ + LWLockRelease(ProcArrayLock); - if (TransactionIdIsNormal(kaxmin) && - TransactionIdPrecedes(kaxmin, result)) - result = kaxmin; + if (in_recovery) + { + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, kaxmin); + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, kaxmin); + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, kaxmin); } else { /* - * No other information needed, so release the lock immediately. - */ - LWLockRelease(ProcArrayLock); - - /* - * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age, - * being careful not to generate a "permanent" XID. + * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age. * * vacuum_defer_cleanup_age provides some additional "slop" for the * benefit of hot standby queries on standby servers. This is quick @@ -1466,34 +1651,146 @@ GetOldestXmin(Relation rel, int flags) * in varsup.c. Also note that we intentionally don't apply * vacuum_defer_cleanup_age on standby servers. 
*/ - result -= vacuum_defer_cleanup_age; - if (!TransactionIdIsNormal(result)) - result = FirstNormalTransactionId; + h->oldest_considered_running = + TransactionIdRetreatedBy(h->oldest_considered_running, + vacuum_defer_cleanup_age); + h->shared_oldest_nonremovable = + TransactionIdRetreatedBy(h->shared_oldest_nonremovable, + vacuum_defer_cleanup_age); + h->data_oldest_nonremovable = + TransactionIdRetreatedBy(h->data_oldest_nonremovable, + vacuum_defer_cleanup_age); } /* * Check whether there are replication slots requiring an older xmin. */ - if (!(flags & PROCARRAY_SLOTS_XMIN) && - TransactionIdIsValid(replication_slot_xmin) && - NormalTransactionIdPrecedes(replication_slot_xmin, result)) - result = replication_slot_xmin; + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, h->slot_xmin); + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, h->slot_xmin); /* - * After locks have been released and vacuum_defer_cleanup_age has been - * applied, check whether we need to back up further to make logical - * decoding possible. We need to do so if we're computing the global limit - * (rel = NULL) or if the passed relation is a catalog relation of some - * kind. + * The only difference between catalog / data horizons is that the slot's + * catalog xmin is applied to the catalog one (so catalogs can be accessed + * for logical decoding). Initialize with data horizon, and then back up + * further if necessary. Have to back up the shared horizon as well, since + * that also can contain catalogs. 
*/ - if (!(flags & PROCARRAY_SLOTS_XMIN) && - (rel == NULL || - RelationIsAccessibleInLogicalDecoding(rel)) && - TransactionIdIsValid(replication_slot_catalog_xmin) && - NormalTransactionIdPrecedes(replication_slot_catalog_xmin, result)) - result = replication_slot_catalog_xmin; + h->shared_oldest_nonremovable_raw = h->shared_oldest_nonremovable; + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, + h->slot_catalog_xmin); + h->catalog_oldest_nonremovable = h->data_oldest_nonremovable; + h->catalog_oldest_nonremovable = + TransactionIdOlder(h->catalog_oldest_nonremovable, + h->slot_catalog_xmin); - return result; + /* + * It's possible that slots / vacuum_defer_cleanup_age backed up the + * horizons further than oldest_considered_running. Fix. + */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->shared_oldest_nonremovable); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->catalog_oldest_nonremovable); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->data_oldest_nonremovable); + + /* + * shared horizons have to be at least as old as the oldest visible in + * current db + */ + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->data_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->catalog_oldest_nonremovable)); + + /* + * Horizons need to ensure that pg_subtrans access is still possible for + * the relevant backends. 
+ */ + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->shared_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->catalog_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->data_oldest_nonremovable)); + Assert(!TransactionIdIsValid(h->slot_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_xmin)); + Assert(!TransactionIdIsValid(h->slot_catalog_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_catalog_xmin)); + + /* update approximate horizons with the computed horizons */ + GlobalVisUpdateApply(h); +} + +/* + * Return the oldest XID for which deleted tuples must be preserved in the + * passed table. + * + * If rel is not NULL the horizon may be considerably more recent than + * otherwise (i.e. fewer tuples will be removable). In the NULL case a horizon + * that is correct (but not optimal) for all relations will be returned. + * + * This is used by VACUUM to decide which deleted tuples must be preserved in + * the passed in table. + */ +TransactionId +GetOldestNonRemovableTransactionId(Relation rel) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + /* select horizon appropriate for relation */ + if (rel == NULL || rel->rd_rel->relisshared) + return horizons.shared_oldest_nonremovable; + else if (RelationIsAccessibleInLogicalDecoding(rel)) + return horizons.catalog_oldest_nonremovable; + else + return horizons.data_oldest_nonremovable; +} + +/* + * Return the oldest transaction id any currently running backend might still + * consider running. This should not be used for visibility / pruning + * determinations (see GetOldestNonRemovableTransactionId()), but for + * decisions like up to where pg_subtrans can be truncated. 
+ */ +TransactionId +GetOldestTransactionIdConsideredRunning(void) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + return horizons.oldest_considered_running; +} + +/* + * Return the visibility horizons for a hot standby feedback message. + */ +void +GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + /* + * Don't want to use shared_oldest_nonremovable here, as that contains the + * effect of replication slot's catalog_xmin. We want to send a separate + * feedback for the catalog horizon, so the primary can remove data table + * contents more aggressively. + */ + *xmin = horizons.shared_oldest_nonremovable_raw; + *catalog_xmin = horizons.slot_catalog_xmin; } /* @@ -1544,12 +1841,9 @@ GetMaxSnapshotSubxidCount(void) * current transaction (this is the same as MyPgXact->xmin). * RecentXmin: the xmin computed for the most recent snapshot. XIDs * older than this are known not running any more. - * RecentGlobalXmin: the global xmin (oldest TransactionXmin across all - * running transactions, except those running LAZY VACUUM). This is - * the same computation done by - * GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM). - * RecentGlobalDataXmin: the global xmin for non-catalog tables - * >= RecentGlobalXmin + * + * And try to advance the bounds of GlobalVisSharedRels, GlobalVisCatalogRels, + * GlobalVisDataRels for the benefit of the GlobalVisTest* family of functions. * * Note: this function should probably not be called with an argument that's * not statically allocated (see xip allocation below). 
@@ -1560,12 +1854,12 @@ GetSnapshotData(Snapshot snapshot) ProcArrayStruct *arrayP = procArray; TransactionId xmin; TransactionId xmax; - TransactionId globalxmin; int index; int count = 0; int subcount = 0; bool suboverflowed = false; FullTransactionId latest_completed; + TransactionId oldestxid; TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; @@ -1610,13 +1904,15 @@ GetSnapshotData(Snapshot snapshot) LWLockAcquire(ProcArrayLock, LW_SHARED); latest_completed = ShmemVariableCache->latestCompletedXid; + oldestxid = ShmemVariableCache->oldestXid; + /* xmax is always latestCompletedXid + 1 */ xmax = XidFromFullTransactionId(latest_completed); TransactionIdAdvance(xmax); Assert(TransactionIdIsNormal(xmax)); /* initialize xmin calculation with xmax */ - globalxmin = xmin = xmax; + xmin = xmax; snapshot->takenDuringRecovery = RecoveryInProgress(); @@ -1645,12 +1941,6 @@ GetSnapshotData(Snapshot snapshot) (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) continue; - /* Update globalxmin to be the smallest valid xmin */ - xid = UINT32_ACCESS_ONCE(pgxact->xmin); - if (TransactionIdIsNormal(xid) && - NormalTransactionIdPrecedes(xid, globalxmin)) - globalxmin = xid; - /* Fetch xid just once - see GetNewTransactionId */ xid = UINT32_ACCESS_ONCE(pgxact->xid); @@ -1766,34 +2056,78 @@ GetSnapshotData(Snapshot snapshot) LWLockRelease(ProcArrayLock); - /* - * Update globalxmin to include actual process xids. This is a slightly - * different way of computing it than GetOldestXmin uses, but should give - * the same result. 
- */ - if (TransactionIdPrecedes(xmin, globalxmin)) - globalxmin = xmin; + /* maintain state for GlobalVis* */ + { + TransactionId def_vis_xid; + TransactionId def_vis_xid_data; + FullTransactionId def_vis_fxid; + FullTransactionId def_vis_fxid_data; + FullTransactionId oldestfxid; - /* Update global variables too */ - RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age; - if (!TransactionIdIsNormal(RecentGlobalXmin)) - RecentGlobalXmin = FirstNormalTransactionId; + /* + * Converting oldestXid is only safe when xid horizon cannot advance, + * i.e. holding locks. While we don't hold the lock anymore, all the + * necessary data has been gathered with lock held. + */ + oldestfxid = FullXidRelativeTo(latest_completed, oldestxid); - /* Check whether there's a replication slot requiring an older xmin. */ - if (TransactionIdIsValid(replication_slot_xmin) && - NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin)) - RecentGlobalXmin = replication_slot_xmin; + /* apply vacuum_defer_cleanup_age */ + def_vis_xid_data = + TransactionIdRetreatedBy(xmin, vacuum_defer_cleanup_age); - /* Non-catalog tables can be vacuumed if older than this xid */ - RecentGlobalDataXmin = RecentGlobalXmin; + /* Check whether there's a replication slot requiring an older xmin. */ + def_vis_xid_data = + TransactionIdOlder(def_vis_xid_data, replication_slot_xmin); - /* - * Check whether there's a replication slot requiring an older catalog - * xmin. - */ - if (TransactionIdIsNormal(replication_slot_catalog_xmin) && - NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin)) - RecentGlobalXmin = replication_slot_catalog_xmin; + /* + * Rows in non-shared, non-catalog tables possibly could be vacuumed + * if older than this xid. + */ + def_vis_xid = def_vis_xid_data; + + /* + * Check whether there's a replication slot requiring an older catalog + * xmin. 
+ */ + def_vis_xid = + TransactionIdOlder(replication_slot_catalog_xmin, def_vis_xid); + + def_vis_fxid = FullXidRelativeTo(latest_completed, def_vis_xid); + def_vis_fxid_data = FullXidRelativeTo(latest_completed, def_vis_xid_data); + + /* + * Check if we can increase upper bound. As a previous + * GlobalVisUpdate() might have computed more aggressive values, don't + * overwrite them if so. + */ + GlobalVisSharedRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid, + GlobalVisSharedRels.definitely_needed); + GlobalVisCatalogRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid, + GlobalVisCatalogRels.definitely_needed); + GlobalVisDataRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid_data, + GlobalVisDataRels.definitely_needed); + + /* + * Check if we know that we can initialize or increase the lower + * bound. Currently the only cheap way to do so is to use + * ShmemVariableCache->oldestXid as input. + * + * We should definitely be able to do better. We could e.g. put a + * global lower bound value into ShmemVariableCache. + */ + GlobalVisSharedRels.maybe_needed = + FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed, + oldestfxid); + GlobalVisCatalogRels.maybe_needed = + FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed, + oldestfxid); + GlobalVisDataRels.maybe_needed = + FullTransactionIdNewer(GlobalVisDataRels.maybe_needed, + oldestfxid); + } RecentXmin = xmin; @@ -3291,6 +3625,255 @@ DisplayXidCache(void) } #endif /* XIDCACHE_DEBUG */ +/* + * If rel != NULL, return test state appropriate for relation, otherwise + * return state usable for all relations. The latter may consider XIDs as + * not-yet-visible-to-everyone that a state for a specific relation would + * already consider visible-to-everyone. + * + * This needs to be called while a snapshot is active or registered, otherwise + * there are wraparound and other dangers. + * + * See comment for GlobalVisState for details. 
+ */ +GlobalVisState * +GlobalVisTestFor(Relation rel) +{ + bool need_shared; + bool need_catalog; + GlobalVisState *state; + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(RecentXmin); + + if (!rel) + need_shared = need_catalog = true; + else + { + /* + * Other kinds currently don't contain xids, nor always the necessary + * logical decoding markers. + */ + Assert(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW || + rel->rd_rel->relkind == RELKIND_TOASTVALUE); + + need_shared = rel->rd_rel->relisshared || RecoveryInProgress(); + need_catalog = IsCatalogRelation(rel) || RelationIsAccessibleInLogicalDecoding(rel); + } + + if (need_shared) + state = &GlobalVisSharedRels; + else if (need_catalog) + state = &GlobalVisCatalogRels; + else + state = &GlobalVisDataRels; + + Assert(FullTransactionIdIsValid(state->definitely_needed) && + FullTransactionIdIsValid(state->maybe_needed)); + + return state; +} + +/* + * Return true if it's worth updating the accurate maybe_needed boundary. + * + * As it is somewhat expensive to determine xmin horizons, we don't want to + * repeatedly do so when there is a low likelihood of it being beneficial. + * + * The current heuristic is that we update only if RecentXmin has changed + * since the last update. If the oldest currently running transaction has not + * finished, it is unlikely that recomputing the horizon would be useful. + */ +static bool +GlobalVisTestShouldUpdate(GlobalVisState *state) +{ + /* hasn't been updated yet */ + if (!TransactionIdIsValid(ComputeXidHorizonsResultLastXmin)) + return true; + + /* + * If the maybe_needed/definitely_needed boundaries are the same, it's + * unlikely to be beneficial to refresh boundaries. + */ + if (FullTransactionIdFollowsOrEquals(state->maybe_needed, + state->definitely_needed)) + return false; + + /* does the last snapshot built have a different xmin? 
*/ + return RecentXmin != ComputeXidHorizonsResultLastXmin; +} + +static void +GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons) +{ + GlobalVisSharedRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->shared_oldest_nonremovable); + GlobalVisCatalogRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->catalog_oldest_nonremovable); + GlobalVisDataRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->data_oldest_nonremovable); + + /* + * In longer running transactions it's possible that transactions we + * previously needed to treat as running aren't around anymore. So update + * definitely_needed to not be earlier than maybe_needed. + */ + GlobalVisSharedRels.definitely_needed = + FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed, + GlobalVisSharedRels.definitely_needed); + GlobalVisCatalogRels.definitely_needed = + FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed, + GlobalVisCatalogRels.definitely_needed); + GlobalVisDataRels.definitely_needed = + FullTransactionIdNewer(GlobalVisDataRels.maybe_needed, + GlobalVisDataRels.definitely_needed); + + ComputeXidHorizonsResultLastXmin = RecentXmin; +} + +/* + * Update boundaries in GlobalVis{Shared,Catalog, Data}Rels + * using ComputeXidHorizons(). + */ +static void +GlobalVisUpdate(void) +{ + ComputeXidHorizonsResult horizons; + + /* updates the horizons as a side-effect */ + ComputeXidHorizons(&horizons); +} + +/* + * Return true if no snapshot still considers fxid to be running. + * + * The state passed needs to have been initialized for the relation fxid is + * from (NULL is also OK), otherwise the result may not be correct. + * + * See comment for GlobalVisState for details. + */ +bool +GlobalVisTestIsRemovableFullXid(GlobalVisState *state, + FullTransactionId fxid) +{ + /* + * If fxid is older than maybe_needed bound, it definitely is visible to + * everyone. 
+ */ + if (FullTransactionIdPrecedes(fxid, state->maybe_needed)) + return true; + + /* + * If fxid is >= definitely_needed bound, it is very likely to still be + * considered running. + */ + if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed)) + return false; + + /* + * fxid is between maybe_needed and definitely_needed, i.e. there might or + * might not exist a snapshot considering fxid running. If it makes sense, + * update boundaries and recheck. + */ + if (GlobalVisTestShouldUpdate(state)) + { + GlobalVisUpdate(); + + Assert(FullTransactionIdPrecedes(fxid, state->definitely_needed)); + + return FullTransactionIdPrecedes(fxid, state->maybe_needed); + } + else + return false; +} + +/* + * Wrapper around GlobalVisTestIsRemovableFullXid() for 32bit xids. + * + * It is crucial that this only gets called for xids from a source that + * protects against xid wraparounds (e.g. from a table and thus protected by + * relfrozenxid). + */ +bool +GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid) +{ + FullTransactionId fxid; + + /* + * Convert 32 bit argument to FullTransactionId. We can do so safely + * because we know the xid has to, at the very least, be between + * [oldestXid, nextFullXid), i.e. within 2 billion of xid. To avoid taking + * a lock to determine either, we can just compare with + * state->definitely_needed, which was based on those value at the time + * the current snapshot was built. + */ + fxid = FullXidRelativeTo(state->definitely_needed, xid); + + return GlobalVisTestIsRemovableFullXid(state, fxid); +} + +/* + * Return FullTransactionId below which all transactions are not considered + * running anymore. + * + * Note: This is less efficient than testing with + * GlobalVisTestIsRemovableFullXid as it likely requires building an accurate + * cutoff, even in the case all the XIDs compared with the cutoff are outside + * [maybe_needed, definitely_needed). 
+ */ +FullTransactionId +GlobalVisTestNonRemovableFullHorizon(GlobalVisState *state) +{ + /* acquire accurate horizon if not already done */ + if (GlobalVisTestShouldUpdate(state)) + GlobalVisUpdate(); + + return state->maybe_needed; +} + +/* Convenience wrapper around GlobalVisTestNonRemovableFullHorizon */ +TransactionId +GlobalVisTestNonRemovableHorizon(GlobalVisState *state) +{ + FullTransactionId cutoff; + + cutoff = GlobalVisTestNonRemovableFullHorizon(state); + + return XidFromFullTransactionId(cutoff); +} + +/* + * Convenience wrapper around GlobalVisTestFor() and + * GlobalVisTestIsRemovableFullXid(), see their comments. + */ +bool +GlobalVisIsRemovableFullXid(Relation rel, FullTransactionId fxid) +{ + GlobalVisState *state; + + state = GlobalVisTestFor(rel); + + return GlobalVisTestIsRemovableFullXid(state, fxid); +} + +/* + * Convenience wrapper around GlobalVisTestFor() and + * GlobalVisTestIsRemovableXid(), see their comments. + */ +bool +GlobalVisCheckRemovableXid(Relation rel, TransactionId xid) +{ + GlobalVisState *state; + + state = GlobalVisTestFor(rel); + + return GlobalVisTestIsRemovableXid(state, xid); +} + /* * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel). diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 53d974125fd5..00c7afc66fc2 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5786,14 +5786,15 @@ get_actual_variable_endpoint(Relation heapRel, * recent); that case motivates not using SnapshotAny here. * * A crucial point here is that SnapshotNonVacuumable, with - * RecentGlobalXmin as horizon, yields the inverse of the condition that - * the indexscan will use to decide that index entries are killable (see - * heap_hot_search_buffer()). 
Therefore, if the snapshot rejects a tuple - * (or more precisely, all tuples of a HOT chain) and we have to continue - * scanning past it, we know that the indexscan will mark that index entry - * killed. That means that the next get_actual_variable_endpoint() call - * will not have to re-consider that index entry. In this way we avoid - * repetitive work when this function is used a lot during planning. + * GlobalVisTestFor(heapRel) as horizon, yields the inverse of the + * condition that the indexscan will use to decide that index entries are + * killable (see heap_hot_search_buffer()). Therefore, if the snapshot + * rejects a tuple (or more precisely, all tuples of a HOT chain) and we + * have to continue scanning past it, we know that the indexscan will mark + * that index entry killed. That means that the next + * get_actual_variable_endpoint() call will not have to re-consider that + * index entry. In this way we avoid repetitive work when this function + * is used a lot during planning. * * But using SnapshotNonVacuumable creates a hazard of its own. In a * recently-created index, some index entries may point at "broken" HOT @@ -5805,7 +5806,8 @@ get_actual_variable_endpoint(Relation heapRel, * or could even be NULL. We avoid this hazard because we take the data * from the index entry not the heap. */ - InitNonVacuumableSnapshot(SnapshotNonVacuumable, RecentGlobalXmin); + InitNonVacuumableSnapshot(SnapshotNonVacuumable, + GlobalVisTestFor(heapRel)); index_scan = index_beginscan(heapRel, indexRel, &SnapshotNonVacuumable, diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index f4247ea70d55..893be2f3ddbf 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -722,6 +722,10 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, * is critical for anything that reads heap pages, because HOT may decide * to prune them even if the process doesn't attempt to modify any * tuples.) 
+ * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). */ if (!bootstrap) { diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 6b6c8571e237..604d823f6861 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -157,16 +157,9 @@ static Snapshot HistoricSnapshot = NULL; * These are updated by GetSnapshotData. We initialize them this way * for the convenience of TransactionIdIsInProgress: even in bootstrap * mode, we don't want it to say that BootstrapTransactionId is in progress. - * - * RecentGlobalXmin and RecentGlobalDataXmin are initialized to - * InvalidTransactionId, to ensure that no one tries to use a stale - * value. Readers should ensure that it has been set to something else - * before using it. */ TransactionId TransactionXmin = FirstNormalTransactionId; TransactionId RecentXmin = FirstNormalTransactionId; -TransactionId RecentGlobalXmin = InvalidTransactionId; -TransactionId RecentGlobalDataXmin = InvalidTransactionId; /* (table, ctid) => (cmin, cmax) mapping during timetravel */ static HTAB *tuplecid_data = NULL; @@ -581,9 +574,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, * Even though we are not going to use the snapshot it computes, we must * call GetSnapshotData, for two reasons: (1) to be sure that * CurrentSnapshotData's XID arrays have been allocated, and (2) to update - * RecentXmin and RecentGlobalXmin. (We could alternatively include those - * two variables in exported snapshot files, but it seems better to have - * snapshot importers compute reasonably up-to-date values for them.) + * the state for GlobalVis*. 
*/ CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); @@ -956,36 +947,6 @@ xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg) return 0; } -/* - * Get current RecentGlobalXmin value, as a FullTransactionId. - */ -FullTransactionId -GetFullRecentGlobalXmin(void) -{ - FullTransactionId nextxid_full; - uint32 nextxid_epoch; - TransactionId nextxid_xid; - uint32 epoch; - - Assert(TransactionIdIsNormal(RecentGlobalXmin)); - - /* - * Compute the epoch from the next XID's epoch. This relies on the fact - * that RecentGlobalXmin must be within the 2 billion XID horizon from the - * next XID. - */ - nextxid_full = ReadNextFullTransactionId(); - nextxid_epoch = EpochFromFullTransactionId(nextxid_full); - nextxid_xid = XidFromFullTransactionId(nextxid_full); - - if (RecentGlobalXmin > nextxid_xid) - epoch = nextxid_epoch - 1; - else - epoch = nextxid_epoch; - - return FullTransactionIdFromEpochAndXid(epoch, RecentGlobalXmin); -} - /* * SnapshotResetXmin * @@ -1753,106 +1714,157 @@ GetOldSnapshotThresholdTimestamp(void) return threshold_timestamp; } -static void +void SetOldSnapshotThresholdTimestamp(TimestampTz ts, TransactionId xlimit) { SpinLockAcquire(&oldSnapshotControl->mutex_threshold); + Assert(oldSnapshotControl->threshold_timestamp <= ts); + Assert(TransactionIdPrecedesOrEquals(oldSnapshotControl->threshold_xid, xlimit)); oldSnapshotControl->threshold_timestamp = ts; oldSnapshotControl->threshold_xid = xlimit; SpinLockRelease(&oldSnapshotControl->mutex_threshold); } +/* + * XXX: Magic to keep old_snapshot_threshold tests appear "working". They + * currently are broken, and discussion of what to do about them is + * ongoing. 
See + * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de + */ +void +SnapshotTooOldMagicForTest(void) +{ + TimestampTz ts = GetSnapshotCurrentTimestamp(); + + Assert(old_snapshot_threshold == 0); + + ts -= 5 * USECS_PER_SEC; + + SpinLockAcquire(&oldSnapshotControl->mutex_threshold); + oldSnapshotControl->threshold_timestamp = ts; + SpinLockRelease(&oldSnapshotControl->mutex_threshold); +} + +/* + * If there is a valid mapping for the timestamp, set *xlimitp to + * that. Returns whether there is such a mapping. + */ +static bool +GetOldSnapshotFromTimeMapping(TimestampTz ts, TransactionId *xlimitp) +{ + bool in_mapping = false; + + Assert(ts == AlignTimestampToMinuteBoundary(ts)); + + LWLockAcquire(OldSnapshotTimeMapLock, LW_SHARED); + + if (oldSnapshotControl->count_used > 0 + && ts >= oldSnapshotControl->head_timestamp) + { + int offset; + + offset = ((ts - oldSnapshotControl->head_timestamp) + / USECS_PER_MINUTE); + if (offset > oldSnapshotControl->count_used - 1) + offset = oldSnapshotControl->count_used - 1; + offset = (oldSnapshotControl->head_offset + offset) + % OLD_SNAPSHOT_TIME_MAP_ENTRIES; + + *xlimitp = oldSnapshotControl->xid_by_minute[offset]; + + in_mapping = true; + } + + LWLockRelease(OldSnapshotTimeMapLock); + + return in_mapping; +} + /* * TransactionIdLimitedForOldSnapshots * - * Apply old snapshot limit, if any. This is intended to be called for page - * pruning and table vacuuming, to allow old_snapshot_threshold to override - * the normal global xmin value. Actual testing for snapshot too old will be - * based on whether a snapshot timestamp is prior to the threshold timestamp - * set in this function. + * Apply old snapshot limit. This is intended to be called for page pruning + * and table vacuuming, to allow old_snapshot_threshold to override the normal + * global xmin value. 
Actual testing for snapshot too old will be based on + * whether a snapshot timestamp is prior to the threshold timestamp set in + * this function. + * + * If the limited horizon allows a cleanup action that otherwise would not be + * possible, SetOldSnapshotThresholdTimestamp(*limit_ts, *limit_xid) needs to + * be called before that cleanup action. */ -TransactionId +bool TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, - Relation relation) + Relation relation, + TransactionId *limit_xid, + TimestampTz *limit_ts) { - if (TransactionIdIsNormal(recentXmin) - && old_snapshot_threshold >= 0 - && RelationAllowsEarlyPruning(relation)) - { - TimestampTz ts = GetSnapshotCurrentTimestamp(); - TransactionId xlimit = recentXmin; - TransactionId latest_xmin; - TimestampTz update_ts; - bool same_ts_as_threshold = false; + TimestampTz ts; + TransactionId xlimit = recentXmin; + TransactionId latest_xmin; + TimestampTz next_map_update_ts; + TransactionId threshold_timestamp; + TransactionId threshold_xid; - SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin); - latest_xmin = oldSnapshotControl->latest_xmin; - update_ts = oldSnapshotControl->next_map_update; - SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin); + Assert(TransactionIdIsNormal(recentXmin)); + Assert(OldSnapshotThresholdActive()); + Assert(limit_ts != NULL && limit_xid != NULL); - /* - * Zero threshold always overrides to latest xmin, if valid. Without - * some heuristic it will find its own snapshot too old on, for - * example, a simple UPDATE -- which would make it useless for most - * testing, but there is no principled way to ensure that it doesn't - * fail in this way. Use a five-second delay to try to get useful - * testing behavior, but this may need adjustment. 
- */ - if (old_snapshot_threshold == 0) - { - if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin) - && TransactionIdFollows(latest_xmin, xlimit)) - xlimit = latest_xmin; + if (!RelationAllowsEarlyPruning(relation)) + return false; - ts -= 5 * USECS_PER_SEC; - SetOldSnapshotThresholdTimestamp(ts, xlimit); + ts = GetSnapshotCurrentTimestamp(); - return xlimit; - } + SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin); + latest_xmin = oldSnapshotControl->latest_xmin; + next_map_update_ts = oldSnapshotControl->next_map_update; + SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin); + /* + * Zero threshold always overrides to latest xmin, if valid. Without some + * heuristic it will find its own snapshot too old on, for example, a + * simple UPDATE -- which would make it useless for most testing, but + * there is no principled way to ensure that it doesn't fail in this way. + * Use a five-second delay to try to get useful testing behavior, but this + * may need adjustment. + */ + if (old_snapshot_threshold == 0) + { + if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin) + && TransactionIdFollows(latest_xmin, xlimit)) + xlimit = latest_xmin; + + ts -= 5 * USECS_PER_SEC; + } + else + { ts = AlignTimestampToMinuteBoundary(ts) - (old_snapshot_threshold * USECS_PER_MINUTE); /* Check for fast exit without LW locking. */ SpinLockAcquire(&oldSnapshotControl->mutex_threshold); - if (ts == oldSnapshotControl->threshold_timestamp) - { - xlimit = oldSnapshotControl->threshold_xid; - same_ts_as_threshold = true; - } + threshold_timestamp = oldSnapshotControl->threshold_timestamp; + threshold_xid = oldSnapshotControl->threshold_xid; SpinLockRelease(&oldSnapshotControl->mutex_threshold); - if (!same_ts_as_threshold) + if (ts == threshold_timestamp) + { + /* + * Current timestamp is in same bucket as the the last limit that + * was applied. Reuse. 
+ */ + xlimit = threshold_xid; + } + else if (ts == next_map_update_ts) + { + /* + * FIXME: This branch is super iffy - but that should probably + * fixed separately. + */ + xlimit = latest_xmin; + } + else if (GetOldSnapshotFromTimeMapping(ts, &xlimit)) { - if (ts == update_ts) - { - xlimit = latest_xmin; - if (NormalTransactionIdFollows(xlimit, recentXmin)) - SetOldSnapshotThresholdTimestamp(ts, xlimit); - } - else - { - LWLockAcquire(OldSnapshotTimeMapLock, LW_SHARED); - - if (oldSnapshotControl->count_used > 0 - && ts >= oldSnapshotControl->head_timestamp) - { - int offset; - - offset = ((ts - oldSnapshotControl->head_timestamp) - / USECS_PER_MINUTE); - if (offset > oldSnapshotControl->count_used - 1) - offset = oldSnapshotControl->count_used - 1; - offset = (oldSnapshotControl->head_offset + offset) - % OLD_SNAPSHOT_TIME_MAP_ENTRIES; - xlimit = oldSnapshotControl->xid_by_minute[offset]; - - if (NormalTransactionIdFollows(xlimit, recentXmin)) - SetOldSnapshotThresholdTimestamp(ts, xlimit); - } - - LWLockRelease(OldSnapshotTimeMapLock); - } } /* @@ -1867,12 +1879,18 @@ TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, if (TransactionIdIsNormal(latest_xmin) && TransactionIdPrecedes(latest_xmin, xlimit)) xlimit = latest_xmin; + } + + if (TransactionIdIsValid(xlimit) && + TransactionIdFollowsOrEquals(xlimit, recentXmin)) + { + *limit_ts = ts; + *limit_xid = xlimit; - if (NormalTransactionIdFollows(xlimit, recentXmin)) - return xlimit; + return true; } - return recentXmin; + return false; } /* diff --git a/src/include/access/ginblock.h b/src/include/access/ginblock.h index 3f64fd572e32..fe66a95226b9 100644 --- a/src/include/access/ginblock.h +++ b/src/include/access/ginblock.h @@ -12,6 +12,7 @@ #include "access/transam.h" #include "storage/block.h" +#include "storage/bufpage.h" #include "storage/itemptr.h" #include "storage/off.h" @@ -134,8 +135,7 @@ typedef struct GinMetaPageData */ #define GinPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid 
) #define GinPageSetDeleteXid(page, xid) ( ((PageHeader) (page))->pd_prune_xid = xid) -#define GinPageIsRecyclable(page) ( PageIsNew(page) || (GinPageIsDeleted(page) \ - && TransactionIdPrecedes(GinPageGetDeleteXid(page), RecentGlobalXmin))) +extern bool GinPageIsRecyclable(Page page); /* * We use our own ItemPointerGet(BlockNumber|OffsetNumber) diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index b31de389106d..ba77013f64f2 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -172,9 +172,12 @@ extern TransactionId heap_compute_xid_horizon_for_tuples(Relation rel, int nitems); /* in heap/pruneheap.c */ +struct GlobalVisState; extern void heap_page_prune_opt(Relation relation, Buffer buffer); extern int heap_page_prune(Relation relation, Buffer buffer, - TransactionId OldestXmin, + struct GlobalVisState *vistest, + TransactionId limited_oldest_xmin, + TimestampTz limited_oldest_ts, bool report_stats, TransactionId *latestRemovedXid); extern void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, @@ -195,11 +198,14 @@ extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple stup, CommandId curcid, Buffer buffer); extern HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple stup, TransactionId OldestXmin, Buffer buffer); +extern HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple stup, Buffer buffer, + TransactionId *dead_after); extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid); extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); extern bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); -extern bool HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin); +extern bool HeapTupleIsSurelyDead(HeapTuple htup, + struct GlobalVisState *vistest); /* * To avoid leaking too much knowledge about reorderbuffer implementation diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 
8db326ad1b50..b32044153b09 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -95,15 +95,6 @@ FullTransactionIdFromU64(uint64 value) (dest) = FirstNormalTransactionId; \ } while(0) -/* advance a FullTransactionId variable, stepping over special XIDs */ -static inline void -FullTransactionIdAdvance(FullTransactionId *dest) -{ - dest->value++; - while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId) - dest->value++; -} - /* * Retreat a FullTransactionId variable, stepping over xids that would appear * to be special only when viewed as 32bit XIDs. @@ -129,6 +120,23 @@ FullTransactionIdRetreat(FullTransactionId *dest) dest->value--; } +/* + * Advance a FullTransactionId variable, stepping over xids that would appear + * to be special only when viewed as 32bit XIDs. + */ +static inline void +FullTransactionIdAdvance(FullTransactionId *dest) +{ + dest->value++; + + /* see FullTransactionIdAdvance() */ + if (FullTransactionIdPrecedes(*dest, FirstNormalFullTransactionId)) + return; + + while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId) + dest->value++; +} + /* back up a transaction ID variable, handling wraparound correctly */ #define TransactionIdRetreat(dest) \ do { \ @@ -293,6 +301,59 @@ ReadNewTransactionId(void) return XidFromFullTransactionId(ReadNextFullTransactionId()); } +/* return transaction ID backed up by amount, handling wraparound correctly */ +static inline TransactionId +TransactionIdRetreatedBy(TransactionId xid, uint32 amount) +{ + xid -= amount; + + while (xid < FirstNormalTransactionId) + xid--; + + return xid; +} + +/* return the older of the two IDs */ +static inline TransactionId +TransactionIdOlder(TransactionId a, TransactionId b) +{ + if (!TransactionIdIsValid(a)) + return b; + + if (!TransactionIdIsValid(b)) + return a; + + if (TransactionIdPrecedes(a, b)) + return a; + return b; +} + +/* return the older of the two IDs, assuming they're both normal */ +static inline TransactionId 
+NormalTransactionIdOlder(TransactionId a, TransactionId b) +{ + Assert(TransactionIdIsNormal(a)); + Assert(TransactionIdIsNormal(b)); + if (NormalTransactionIdPrecedes(a, b)) + return a; + return b; +} + +/* return the newer of the two IDs */ +static inline FullTransactionId +FullTransactionIdNewer(FullTransactionId a, FullTransactionId b) +{ + if (!FullTransactionIdIsValid(a)) + return b; + + if (!FullTransactionIdIsValid(b)) + return a; + + if (FullTransactionIdFollows(a, b)) + return a; + return b; +} + #endif /* FRONTEND */ #endif /* TRANSAM_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 3f88683a059d..51b8f994ac0a 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -389,12 +389,6 @@ PageValidateSpecialPointer(Page page) #define PageClearAllVisible(page) \ (((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE) -#define PageIsPrunable(page, oldestxmin) \ -( \ - AssertMacro(TransactionIdIsNormal(oldestxmin)), \ - TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) && \ - TransactionIdPrecedes(((PageHeader) (page))->pd_prune_xid, oldestxmin) \ -) #define PageSetPrunable(page, xid) \ do { \ Assert(TransactionIdIsNormal(xid)); \ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 5ceb2494bae7..52ff43cabaaf 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -42,20 +42,12 @@ struct XidCache /* * Flags for PGXACT->vacuumFlags - * - * Note: If you modify these flags, you need to modify PROCARRAY_XXX flags - * in src/include/storage/procarray.h. - * - * PROC_RESERVED may later be assigned for use in vacuumFlags, but its value is - * used for PROCARRAY_SLOTS_XMIN in procarray.h, so GetOldestXmin won't be able - * to match and ignore processes with this flag set. */ #define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? 
*/ #define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */ #define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */ #define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical * decoding outside xact */ -#define PROC_RESERVED 0x20 /* reserved for procarray */ /* flags reset at EOXact */ #define PROC_VACUUM_STATE_MASK \ diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 01040d76e122..ea8a876ca45c 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -20,34 +20,6 @@ #include "utils/snapshot.h" -/* - * These are to implement PROCARRAY_FLAGS_XXX - * - * Note: These flags are cloned from PROC_XXX flags in src/include/storage/proc.h - * to avoid forcing to include proc.h when including procarray.h. So if you modify - * PROC_XXX flags, you need to modify these flags. - */ -#define PROCARRAY_VACUUM_FLAG 0x02 /* currently running lazy - * vacuum */ -#define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing logical - * decoding outside xact */ - -#define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot xmin, - * catalog_xmin */ -/* - * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching - * PGXACT->vacuumFlags. Other flags are used for different purposes and - * have no corresponding PROC flag equivalent. 
- */ -#define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \ - PROCARRAY_LOGICAL_DECODING_FLAG) - -/* Use the following flags as an input "flags" to GetOldestXmin function */ -/* Consider all backends except for logical decoding ones which manage xmin separately */ -#define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG -/* Ignore vacuum backends */ -#define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG - extern Size ProcArrayShmemSize(void); extern void CreateSharedProcArray(void); extern void ProcArrayAdd(PGPROC *proc); @@ -81,9 +53,11 @@ extern RunningTransactions GetRunningTransactionData(void); extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsActive(TransactionId xid); -extern TransactionId GetOldestXmin(Relation rel, int flags); +extern TransactionId GetOldestNonRemovableTransactionId(Relation rel); +extern TransactionId GetOldestTransactionIdConsideredRunning(void); extern TransactionId GetOldestActiveTransactionId(void); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); +extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin); extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids); extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids); diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index ffb4ba3adfb0..b6b403e29313 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -52,13 +52,12 @@ extern Size SnapMgrShmemSize(void); extern void SnapMgrInit(void); extern TimestampTz GetSnapshotCurrentTimestamp(void); extern TimestampTz GetOldSnapshotThresholdTimestamp(void); +extern void SnapshotTooOldMagicForTest(void); extern bool FirstSnapshotSet; extern PGDLLIMPORT TransactionId TransactionXmin; extern PGDLLIMPORT TransactionId RecentXmin; -extern PGDLLIMPORT TransactionId RecentGlobalXmin; -extern PGDLLIMPORT TransactionId RecentGlobalDataXmin; 
/* Variables representing various special snapshot semantics */ extern PGDLLIMPORT SnapshotData SnapshotSelfData; @@ -78,11 +77,12 @@ extern PGDLLIMPORT SnapshotData CatalogSnapshotData; /* * Similarly, some initialization is required for a NonVacuumable snapshot. - * The caller must supply the xmin horizon to use (e.g., RecentGlobalXmin). + * The caller must supply the visibility cutoff state to use (c.f. + * GlobalVisTestFor()). */ -#define InitNonVacuumableSnapshot(snapshotdata, xmin_horizon) \ +#define InitNonVacuumableSnapshot(snapshotdata, vistestp) \ ((snapshotdata).snapshot_type = SNAPSHOT_NON_VACUUMABLE, \ - (snapshotdata).xmin = (xmin_horizon)) + (snapshotdata).vistest = (vistestp)) /* * Similarly, some initialization is required for SnapshotToast. We need @@ -98,6 +98,11 @@ extern PGDLLIMPORT SnapshotData CatalogSnapshotData; ((snapshot)->snapshot_type == SNAPSHOT_MVCC || \ (snapshot)->snapshot_type == SNAPSHOT_HISTORIC_MVCC) +static inline bool +OldSnapshotThresholdActive(void) +{ + return old_snapshot_threshold >= 0; +} extern Snapshot GetTransactionSnapshot(void); extern Snapshot GetLatestSnapshot(void); @@ -121,8 +126,6 @@ extern void UnregisterSnapshot(Snapshot snapshot); extern Snapshot RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner); extern void UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner); -extern FullTransactionId GetFullRecentGlobalXmin(void); - extern void AtSubCommit_Snapshot(int level); extern void AtSubAbort_Snapshot(int level); extern void AtEOXact_Snapshot(bool isCommit, bool resetXmin); @@ -131,13 +134,29 @@ extern void ImportSnapshot(const char *idstr); extern bool XactHasExportedSnapshots(void); extern void DeleteAllExportedSnapshotFiles(void); extern bool ThereAreNoPriorRegisteredSnapshots(void); -extern TransactionId TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, - Relation relation); +extern bool TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, + Relation relation, + 
TransactionId *limit_xid, + TimestampTz *limit_ts); +extern void SetOldSnapshotThresholdTimestamp(TimestampTz ts, TransactionId xlimit); extern void MaintainOldSnapshotTimeMapping(TimestampTz whenTaken, TransactionId xmin); extern char *ExportSnapshot(Snapshot snapshot); +/* + * These live in procarray.c because they're intimately linked to the + * procarray contents, but thematically they better fit into snapmgr.h. + */ +typedef struct GlobalVisState GlobalVisState; +extern GlobalVisState *GlobalVisTestFor(Relation rel); +extern bool GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid); +extern bool GlobalVisTestIsRemovableFullXid(GlobalVisState *state, FullTransactionId fxid); +extern FullTransactionId GlobalVisTestNonRemovableFullHorizon(GlobalVisState *state); +extern TransactionId GlobalVisTestNonRemovableHorizon(GlobalVisState *state); +extern bool GlobalVisCheckRemovableXid(Relation rel, TransactionId xid); +extern bool GlobalVisIsRemovableFullXid(Relation rel, FullTransactionId fxid); + /* * Utility functions for implementing visibility routines in table AMs. */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 4796edb63aa2..35b1f05bea65 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -192,6 +192,12 @@ typedef struct SnapshotData */ uint32 speculativeToken; + /* + * For SNAPSHOT_NON_VACUUMABLE (and hopefully more in the future) this is + * used to determine whether row could be vacuumed. 
+ */ + struct GlobalVisState *vistest; + /* * Book-keeping information, used by the snapshot manager */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 7eaaad1e140a..b4948ac675f7 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -395,6 +395,7 @@ CompositeTypeStmt CompoundAffixFlag CompressionAlgorithm CompressorState +ComputeXidHorizonsResult ConditionVariable ConditionalStack ConfigData @@ -930,6 +931,7 @@ GistSplitVector GistTsVectorOptions GistVacState GlobalTransaction +GlobalVisState GrantRoleStmt GrantStmt GrantTargetType From b8443eae72b5c36e6b443a2f09b9c605c61a589d Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 12 Aug 2020 17:04:51 -0700 Subject: [PATCH 288/334] Fix out-of-date version reference, grammar. Time appears to be passing fast. Reported-By: Peter Geoghegan --- src/backend/access/nbtree/README | 2 +- src/backend/access/transam/README | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 781a8f1932d3..9692e4cdf644 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -412,7 +412,7 @@ the cost of walking down the tree in such common cases. The optimization works on the assumption that there can only be one non-ignorable leaf rightmost page, and so not even a visible-to-everyone -style interlock required. We cannot fail to detect that our hint was +style interlock is required. We cannot fail to detect that our hint was invalidated, because there can only be one such page in the B-Tree at any time. 
It's possible that the page will be deleted and recycled without a backend's cached page also being detected as invalidated, but diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 6f44ae9ce6a5..98acb429b67e 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -318,7 +318,7 @@ XID less than this could be about to appear in the ProcArray, because of the XidGenLock interlock discussed above.) As GetSnapshotData is performance critical, it does not perform an accurate -oldest-xmin calculation (it used to, until v13). The contents of a snapshot +oldest-xmin calculation (it used to, until v14). The contents of a snapshot only depend on the xids of other backends, not their xmin. As backend's xmin changes much more often than its xid, having GetSnapshotData look at xmins can lead to a lot of unnecessary cacheline ping-pong. Instead From a811ea5bde2fbf450095994b5726dcbf64d68668 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Thu, 13 Aug 2020 17:33:49 -0400 Subject: [PATCH 289/334] Handle new HOT chains in index-build table scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a table is scanned by heapam_index_build_range_scan (née IndexBuildHeapScan) and the table lock being held allows concurrent data changes, it is possible for new HOT chains to sprout in a page that were unknown when the scan of a page happened. This leads to an error such as ERROR: failed to find parent tuple for heap-only tuple at (X,Y) in table "tbl" because the root tuple was not present when we first obtained the list of the page's root tuples. This can be fixed by re-obtaining the list of root tuples, if we see that a heap-only tuple appears to point to a non-existing root. 
This was reported by Anastasia as occurring for BRIN summarization (which exists since 9.5), but I think it could theoretically also happen with CREATE INDEX CONCURRENTLY (much older) or REINDEX CONCURRENTLY (very recent). It seems a happy coincidence that BRIN forces us to backpatch this all the way to 9.5. Reported-by: Anastasia Lubennikova Diagnosed-by: Anastasia Lubennikova Co-authored-by: Anastasia Lubennikova Co-authored-by: Álvaro Herrera Discussion: https://postgr.es/m/602d8487-f0b2-5486-0088-0f372b2549fa@postgrespro.ru Backpatch: 9.5 - master --- src/backend/access/heap/heapam_handler.c | 20 ++++++++++++++++++++ src/backend/access/heap/pruneheap.c | 5 +++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index e3e41fb75163..dcaea7135fb2 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1324,6 +1324,12 @@ heapam_index_build_range_scan(Relation heapRelation, * buffer continuously while visiting the page, so no pruning * operation can occur either. * + * In cases with only ShareUpdateExclusiveLock on the table, it's + * possible for some HOT tuples to appear that we didn't know about + * when we first read the page. To handle that case, we re-obtain the + * list of root offsets when a HOT tuple points to a root item that we + * don't know about. + * * Also, although our opinions about tuple liveness could change while * we scan the page (due to concurrent transaction commits/aborts), * the chain root locations won't, so this info doesn't need to be @@ -1625,6 +1631,20 @@ heapam_index_build_range_scan(Relation heapRelation, offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); + /* + * If a HOT tuple points to a root that we don't know + * about, obtain root items afresh. If that still fails, + * report it as corruption. 
+ */ + if (root_offsets[offnum - 1] == InvalidOffsetNumber) + { + Page page = BufferGetPage(hscan->rs_cbuf); + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + } + if (!OffsetNumberIsValid(root_offsets[offnum - 1])) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 00a3cb106aac..3ad4222cb8af 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -855,7 +855,7 @@ heap_page_prune_execute(Buffer buffer, * root_offsets[k - 1] = j. * * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. - * We zero out all unused entries. + * Unused entries are filled with InvalidOffsetNumber (zero). * * The function must be called with at least share lock on the buffer, to * prevent concurrent prune operations. @@ -870,7 +870,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets) OffsetNumber offnum, maxoff; - MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber)); + MemSet(root_offsets, InvalidOffsetNumber, + MaxHeapTuplesPerPage * sizeof(OffsetNumber)); maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) From 1f51c17c68d05c28d5b9294d8013cb9e7e653160 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 13 Aug 2020 16:25:21 -0700 Subject: [PATCH 290/334] snapshot scalability: Move PGXACT->xmin back to PGPROC. Now that xmin isn't needed for GetSnapshotData() anymore, it leads to unnecessary cacheline ping-pong to have it in PGXACT, as it is updated considerably more frequently than the other PGXACT members. After the changes in dc7420c2c92, this is a very straight-forward change. For highly concurrent, snapshot acquisition heavy, workloads this change alone can significantly increase scalability. E.g. 
plain pgbench on a smaller 2 socket machine gains 1.07x for read-only pgbench, 1.22x for read-only pgbench when submitting queries in batches of 100, and 2.85x for batches of 100 'SELECT';. The latter numbers are obviously not to be expected in the real-world, but micro-benchmark the snapshot computation scalability (previously spending ~80% of the time in GetSnapshotData()). Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/access/gist/gistxlog.c | 2 +- src/backend/access/nbtree/nbtpage.c | 2 +- src/backend/access/transam/README | 4 +-- src/backend/access/transam/twophase.c | 2 +- src/backend/commands/indexcmds.c | 2 +- src/backend/replication/logical/snapbuild.c | 6 ++-- src/backend/replication/walsender.c | 10 +++--- src/backend/storage/ipc/procarray.c | 36 +++++++++------------ src/backend/storage/ipc/sinvaladt.c | 2 +- src/backend/storage/lmgr/proc.c | 4 +-- src/backend/utils/time/snapmgr.c | 28 ++++++++-------- src/include/storage/proc.h | 10 +++--- 12 files changed, 52 insertions(+), 56 deletions(-) diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index a63b05388c5d..dcd28f678b3d 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -389,7 +389,7 @@ gistRedoPageReuse(XLogReaderState *record) * * latestRemovedXid was the page's deleteXid. The * GlobalVisIsRemovableFullXid(deleteXid) test in gistPageRecyclable() - * conceptually mirrors the pgxact->xmin > limitXmin test in + * conceptually mirrors the PGPROC->xmin > limitXmin test in * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the * same exclusion effect on primary and standby. 
*/ diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 74be3807bb7d..7f392480ac0f 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -2317,7 +2317,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * we're in VACUUM and would not otherwise have an XID. Having already * updated links to the target, ReadNewTransactionId() suffices as an * upper bound. Any scan having retained a now-stale link is advertising - * in its PGXACT an xmin less than or equal to the value we read here. It + * in its PGPROC an xmin less than or equal to the value we read here. It * will continue to do so, holding back the xmin horizon, for the duration * of that scan. */ diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 98acb429b67e..eab8edd20ec2 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -296,7 +296,7 @@ ensure that the C compiler does exactly what you tell it to.) Another important activity that uses the shared ProcArray is ComputeXidHorizons, which must determine a lower bound for the oldest xmin of any active MVCC snapshot, system-wide. Each individual backend -advertises the smallest xmin of its own snapshots in MyPgXact->xmin, or zero +advertises the smallest xmin of its own snapshots in MyProc->xmin, or zero if it currently has no live snapshots (eg, if it's between transactions or hasn't yet set a snapshot for a new transaction). ComputeXidHorizons takes the MIN() of the valid xmin fields. It does this with only shared lock on @@ -331,7 +331,7 @@ necessary. Note that while it is certain that two concurrent executions of GetSnapshotData will compute the same xmin for their own snapshots, there is no such guarantee for the horizons computed by ComputeXidHorizons. 
This is -because we allow XID-less transactions to clear their MyPgXact->xmin +because we allow XID-less transactions to clear their MyProc->xmin asynchronously (without taking ProcArrayLock), so one execution might see what had been the oldest xmin, and another not. This is OK since the thresholds need only be a valid lower bound. As noted above, we are already diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 31f135f5cedc..eb5f4680a3d9 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -464,7 +464,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, /* We set up the gxact's VXID as InvalidBackendId/XID */ proc->lxid = (LocalTransactionId) xid; pgxact->xid = xid; - pgxact->xmin = InvalidTransactionId; + Assert(proc->xmin == InvalidTransactionId); proc->delayChkpt = false; pgxact->vacuumFlags = 0; proc->pid = 0; diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 7819266a6306..254dbcdce52b 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1535,7 +1535,7 @@ DefineIndex(Oid relationId, StartTransactionCommand(); /* We should now definitely not be advertising any xmin. 
*/ - Assert(MyPgXact->xmin == InvalidTransactionId); + Assert(MyProc->xmin == InvalidTransactionId); /* * The index is now valid in the sense that it contains all currently diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 3089f0d5ddcd..e9701ea72215 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -553,8 +553,8 @@ SnapBuildInitialSnapshot(SnapBuild *builder) elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore"); /* so we don't overwrite the existing value */ - if (TransactionIdIsValid(MyPgXact->xmin)) - elog(ERROR, "cannot build an initial slot snapshot when MyPgXact->xmin already is valid"); + if (TransactionIdIsValid(MyProc->xmin)) + elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid"); snap = SnapBuildBuildSnapshot(builder); @@ -575,7 +575,7 @@ SnapBuildInitialSnapshot(SnapBuild *builder) } #endif - MyPgXact->xmin = snap->xmin; + MyProc->xmin = snap->xmin; /* allocate in transaction context */ newxip = (TransactionId *) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 460ca3f947f4..3f756b470af1 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1964,7 +1964,7 @@ PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin, TransactionId feedbac ReplicationSlot *slot = MyReplicationSlot; SpinLockAcquire(&slot->mutex); - MyPgXact->xmin = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; /* * For physical replication we don't need the interlock provided by xmin @@ -2093,7 +2093,7 @@ ProcessStandbyHSFeedbackMessage(void) if (!TransactionIdIsNormal(feedbackXmin) && !TransactionIdIsNormal(feedbackCatalogXmin)) { - MyPgXact->xmin = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; if (MyReplicationSlot != NULL) PhysicalReplicationSlotNewXmin(feedbackXmin, 
feedbackCatalogXmin); return; @@ -2135,7 +2135,7 @@ ProcessStandbyHSFeedbackMessage(void) * risk already since a VACUUM could already have determined the horizon.) * * If we're using a replication slot we reserve the xmin via that, - * otherwise via the walsender's PGXACT entry. We can only track the + * otherwise via the walsender's PGPROC entry. We can only track the * catalog xmin separately when using a slot, so we store the least of the * two provided when not using a slot. * @@ -2148,9 +2148,9 @@ ProcessStandbyHSFeedbackMessage(void) { if (TransactionIdIsNormal(feedbackCatalogXmin) && TransactionIdPrecedes(feedbackCatalogXmin, feedbackXmin)) - MyPgXact->xmin = feedbackCatalogXmin; + MyProc->xmin = feedbackCatalogXmin; else - MyPgXact->xmin = feedbackXmin; + MyProc->xmin = feedbackXmin; } } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index e582d5af4291..185f581c8b6f 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -587,9 +587,9 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); proc->lxid = InvalidLocalTransactionId; - pgxact->xmin = InvalidTransactionId; /* must be cleared with xid/xmin: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; + proc->xmin = InvalidTransactionId; proc->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; @@ -609,9 +609,9 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, { pgxact->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; - pgxact->xmin = InvalidTransactionId; /* must be cleared with xid/xmin: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; + proc->xmin = InvalidTransactionId; proc->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; @@ -763,7 +763,7 @@ ProcArrayClearTransaction(PGPROC *proc) */ pgxact->xid = InvalidTransactionId; 
proc->lxid = InvalidLocalTransactionId; - pgxact->xmin = InvalidTransactionId; + proc->xmin = InvalidTransactionId; proc->recoveryConflictPending = false; /* redundant, but just in case */ @@ -1563,7 +1563,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) /* Fetch xid just once - see GetNewTransactionId */ xid = UINT32_ACCESS_ONCE(pgxact->xid); - xmin = UINT32_ACCESS_ONCE(pgxact->xmin); + xmin = UINT32_ACCESS_ONCE(proc->xmin); /* * Consider both the transaction's Xmin, and its Xid. @@ -1838,7 +1838,7 @@ GetMaxSnapshotSubxidCount(void) * * We also update the following backend-global variables: * TransactionXmin: the oldest xmin of any snapshot in use in the - * current transaction (this is the same as MyPgXact->xmin). + * current transaction (this is the same as MyProc->xmin). * RecentXmin: the xmin computed for the most recent snapshot. XIDs * older than this are known not running any more. * @@ -1899,7 +1899,7 @@ GetSnapshotData(Snapshot snapshot) /* * It is sufficient to get shared lock on ProcArrayLock, even if we are - * going to set MyPgXact->xmin. + * going to set MyProc->xmin. */ LWLockAcquire(ProcArrayLock, LW_SHARED); @@ -2051,8 +2051,8 @@ GetSnapshotData(Snapshot snapshot) replication_slot_xmin = procArray->replication_slot_xmin; replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; - if (!TransactionIdIsValid(MyPgXact->xmin)) - MyPgXact->xmin = TransactionXmin = xmin; + if (!TransactionIdIsValid(MyProc->xmin)) + MyProc->xmin = TransactionXmin = xmin; LWLockRelease(ProcArrayLock); @@ -2172,7 +2172,7 @@ GetSnapshotData(Snapshot snapshot) } /* - * ProcArrayInstallImportedXmin -- install imported xmin into MyPgXact->xmin + * ProcArrayInstallImportedXmin -- install imported xmin into MyProc->xmin * * This is called when installing a snapshot imported from another * transaction. 
To ensure that OldestXmin doesn't go backwards, we must @@ -2225,7 +2225,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, /* * Likewise, let's just make real sure its xmin does cover us. */ - xid = UINT32_ACCESS_ONCE(pgxact->xmin); + xid = UINT32_ACCESS_ONCE(proc->xmin); if (!TransactionIdIsNormal(xid) || !TransactionIdPrecedesOrEquals(xid, xmin)) continue; @@ -2236,7 +2236,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, * GetSnapshotData first, we'll be overwriting a valid xmin here, so * we don't check that.) */ - MyPgXact->xmin = TransactionXmin = xmin; + MyProc->xmin = TransactionXmin = xmin; result = true; break; @@ -2248,7 +2248,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, } /* - * ProcArrayInstallRestoredXmin -- install restored xmin into MyPgXact->xmin + * ProcArrayInstallRestoredXmin -- install restored xmin into MyProc->xmin * * This is like ProcArrayInstallImportedXmin, but we have a pointer to the * PGPROC of the transaction from which we imported the snapshot, rather than @@ -2261,7 +2261,6 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) { bool result = false; TransactionId xid; - PGXACT *pgxact; Assert(TransactionIdIsNormal(xmin)); Assert(proc != NULL); @@ -2269,20 +2268,18 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) /* Get lock so source xact can't end while we're doing this */ LWLockAcquire(ProcArrayLock, LW_SHARED); - pgxact = &allPgXact[proc->pgprocno]; - /* * Be certain that the referenced PGPROC has an advertised xmin which is * no later than the one we're installing, so that the system-wide xmin * can't go backwards. Also, make sure it's running in the same database, * so that the per-database xmin cannot go backwards. 
*/ - xid = UINT32_ACCESS_ONCE(pgxact->xmin); + xid = UINT32_ACCESS_ONCE(proc->xmin); if (proc->databaseId == MyDatabaseId && TransactionIdIsNormal(xid) && TransactionIdPrecedesOrEquals(xid, xmin)) { - MyPgXact->xmin = TransactionXmin = xmin; + MyProc->xmin = TransactionXmin = xmin; result = true; } @@ -2908,7 +2905,7 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, if (allDbs || proc->databaseId == MyDatabaseId) { /* Fetch xmin just once - might change on us */ - TransactionId pxmin = UINT32_ACCESS_ONCE(pgxact->xmin); + TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); if (excludeXmin0 && !TransactionIdIsValid(pxmin)) continue; @@ -2994,7 +2991,6 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; /* Exclude prepared transactions */ if (proc->pid == 0) @@ -3004,7 +3000,7 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) proc->databaseId == dbOid) { /* Fetch xmin just once - can't change on us, but good coding */ - TransactionId pxmin = UINT32_ACCESS_ONCE(pgxact->xmin); + TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); /* * We ignore an invalid pxmin because this means that backend has diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c index e5c115b92f2b..ad048bc85fab 100644 --- a/src/backend/storage/ipc/sinvaladt.c +++ b/src/backend/storage/ipc/sinvaladt.c @@ -420,7 +420,7 @@ BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmi PGXACT *xact = &ProcGlobal->allPgXact[proc->pgprocno]; *xid = xact->xid; - *xmin = xact->xmin; + *xmin = proc->xmin; } } diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e57fcd253880..de346cd87fcd 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -388,7 +388,7 @@ InitProcess(void) MyProc->fpVXIDLock = false; 
MyProc->fpLocalTransactionId = InvalidLocalTransactionId; MyPgXact->xid = InvalidTransactionId; - MyPgXact->xmin = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; MyProc->pid = MyProcPid; /* backendId, databaseId and roleId will be filled in later */ MyProc->backendId = InvalidBackendId; @@ -572,7 +572,7 @@ InitAuxiliaryProcess(void) MyProc->fpVXIDLock = false; MyProc->fpLocalTransactionId = InvalidLocalTransactionId; MyPgXact->xid = InvalidTransactionId; - MyPgXact->xmin = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; MyProc->backendId = InvalidBackendId; MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 604d823f6861..752af0c10dfc 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -27,11 +27,11 @@ * their lifetime is managed separately (as they live longer than one xact.c * transaction). * - * These arrangements let us reset MyPgXact->xmin when there are no snapshots + * These arrangements let us reset MyProc->xmin when there are no snapshots * referenced by this transaction, and advance it when the one with oldest * Xmin is no longer referenced. For simplicity however, only registered * snapshots not active snapshots participate in tracking which one is oldest; - * we don't try to change MyPgXact->xmin except when the active-snapshot + * we don't try to change MyProc->xmin except when the active-snapshot * stack is empty. * * @@ -187,7 +187,7 @@ static ActiveSnapshotElt *OldestActiveSnapshot = NULL; /* * Currently registered Snapshots. Ordered in a heap by xmin, so that we can - * quickly find the one with lowest xmin, to advance our MyPgXact->xmin. + * quickly find the one with lowest xmin, to advance our MyProc->xmin. 
*/ static int xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg); @@ -475,7 +475,7 @@ GetNonHistoricCatalogSnapshot(Oid relid) /* * Make sure the catalog snapshot will be accounted for in decisions - * about advancing PGXACT->xmin. We could apply RegisterSnapshot, but + * about advancing PGPROC->xmin. We could apply RegisterSnapshot, but * that would result in making a physical copy, which is overkill; and * it would also create a dependency on some resource owner, which we * do not want for reasons explained at the head of this file. Instead @@ -596,7 +596,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, /* NB: curcid should NOT be copied, it's a local matter */ /* - * Now we have to fix what GetSnapshotData did with MyPgXact->xmin and + * Now we have to fix what GetSnapshotData did with MyProc->xmin and * TransactionXmin. There is a race condition: to make sure we are not * causing the global xmin to go backwards, we have to test that the * source transaction is still running, and that has to be done @@ -950,13 +950,13 @@ xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg) /* * SnapshotResetXmin * - * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid. + * If there are no more snapshots, we can reset our PGPROC->xmin to InvalidXid. * Note we can do this without locking because we assume that storing an Xid * is atomic. * * Even if there are some remaining snapshots, we may be able to advance our - * PGXACT->xmin to some degree. This typically happens when a portal is - * dropped. For efficiency, we only consider recomputing PGXACT->xmin when + * PGPROC->xmin to some degree. This typically happens when a portal is + * dropped. For efficiency, we only consider recomputing PGPROC->xmin when * the active snapshot stack is empty; this allows us not to need to track * which active snapshot is oldest. 
* @@ -977,15 +977,15 @@ SnapshotResetXmin(void) if (pairingheap_is_empty(&RegisteredSnapshots)) { - MyPgXact->xmin = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; return; } minSnapshot = pairingheap_container(SnapshotData, ph_node, pairingheap_first(&RegisteredSnapshots)); - if (TransactionIdPrecedes(MyPgXact->xmin, minSnapshot->xmin)) - MyPgXact->xmin = minSnapshot->xmin; + if (TransactionIdPrecedes(MyProc->xmin, minSnapshot->xmin)) + MyProc->xmin = minSnapshot->xmin; } /* @@ -1132,13 +1132,13 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) /* * During normal commit processing, we call ProcArrayEndTransaction() to - * reset the MyPgXact->xmin. That call happens prior to the call to + * reset the MyProc->xmin. That call happens prior to the call to * AtEOXact_Snapshot(), so we need not touch xmin here at all. */ if (resetXmin) SnapshotResetXmin(); - Assert(resetXmin || MyPgXact->xmin == 0); + Assert(resetXmin || MyProc->xmin == 0); } @@ -1830,7 +1830,7 @@ TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, */ if (old_snapshot_threshold == 0) { - if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin) + if (TransactionIdPrecedes(latest_xmin, MyProc->xmin) && TransactionIdFollows(latest_xmin, xlimit)) xlimit = latest_xmin; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 52ff43cabaaf..5e4b028a5f98 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -101,6 +101,11 @@ struct PGPROC Latch procLatch; /* generic latch for process */ + TransactionId xmin; /* minimal running XID as it was when we were + * starting our xact, excluding LAZY VACUUM: + * vacuum must not remove tuples deleted by + * xid >= xmin ! 
*/ + LocalTransactionId lxid; /* local id of top-level transaction currently * being executed by this proc, if running; * else InvalidLocalTransactionId */ @@ -223,11 +228,6 @@ typedef struct PGXACT * executed by this proc, if running and XID * is assigned; else InvalidTransactionId */ - TransactionId xmin; /* minimal running XID as it was when we were - * starting our xact, excluding LAZY VACUUM: - * vacuum must not remove tuples deleted by - * xid >= xmin ! */ - uint8 vacuumFlags; /* vacuum-related flags, see above */ bool overflowed; From a9306f10b95992ec7229cae3de507a9fa2f6aa3c Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 13 Aug 2020 20:00:38 -0400 Subject: [PATCH 291/334] Doc: improve examples for json_populate_record() and related functions. Make these examples self-contained by providing declarations of the user-defined row types they rely on. There wasn't room to do this in the old doc format, but now there is, and I think it makes the examples a good bit less confusing. --- doc/src/sgml/func.sgml | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index f766c1bc67c1..9a4ac5a1ea36 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -15414,7 +15414,12 @@ table2-mapping calls. - select * from json_populate_record(null::myrowtype, '{"a": 1, "b": ["2", "a b"], "c": {"d": 4, "e": "a b c"}}') + create type subrowtype as (d int, e text); + create type myrowtype as (a int, b text[], c subrowtype); + + + select * from json_populate_record(null::myrowtype, + '{"a": 1, "b": ["2", "a b"], "c": {"d": 4, "e": "a b c"}, "x": "foo"}') a | b | c @@ -15446,7 +15451,10 @@ table2-mapping for json[b]_populate_record. 
- select * from json_populate_recordset(null::myrowtype, '[{"a":1,"b":2},{"a":3,"b":4}]') + create type twoints as (a int, b int); + + + select * from json_populate_recordset(null::twoints, '[{"a":1,"b":2},{"a":3,"b":4}]') a | b @@ -15483,7 +15491,10 @@ table2-mapping input record value, unmatched columns are always filled with nulls. - select * from json_to_record('{"a":1,"b":[1,2,3],"c":[1,2,3],"e":"bar","r": {"a": 123, "b": "a b c"}}') as x(a int, b text, c int[], d text, r myrowtype) + create type myrowtype as (a int, b text); + + + select * from json_to_record('{"a":1,"b":[1,2,3],"c":[1,2,3],"e":"bar","r": {"a": 123, "b": "a b c"}}') as x(a int, b text, c int[], d text, r myrowtype) a | b | c | d | r From 1f32136a9960df2e135e7b36363ea1f087b514a0 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 14 Aug 2020 09:30:34 +0900 Subject: [PATCH 292/334] Fix compilation warnings with libselinux 3.1 in contrib/sepgsql/ Upstream SELinux has recently marked security_context_t as officially deprecated, causing warnings with -Wdeprecated-declarations. This is considered as legacy code for some time now by upstream as security_context_t got removed from most of the code tree during the development of 2.3 back in 2014. This removes all the references to security_context_t in sepgsql/ to be consistent with SELinux, fixing the warnings. Note that this does not impact the minimum version of libselinux supported. 
Reviewed-by: Tom Lane Discussion: https://postgr.es/m/20200813012735.GC11663@paquier.xyz --- contrib/sepgsql/label.c | 10 +++++----- contrib/sepgsql/selinux.c | 10 +++++----- contrib/sepgsql/uavc.c | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/contrib/sepgsql/label.c b/contrib/sepgsql/label.c index 32e405530bb6..b00b91df5aa3 100644 --- a/contrib/sepgsql/label.c +++ b/contrib/sepgsql/label.c @@ -120,7 +120,7 @@ sepgsql_set_client_label(const char *new_label) tcontext = client_label_peer; else { - if (security_check_context_raw((security_context_t) new_label) < 0) + if (security_check_context_raw(new_label) < 0) ereport(ERROR, (errcode(ERRCODE_INVALID_NAME), errmsg("SELinux: invalid security label: \"%s\"", @@ -453,9 +453,9 @@ sepgsql_get_label(Oid classId, Oid objectId, int32 subId) object.objectSubId = subId; label = GetSecurityLabel(&object, SEPGSQL_LABEL_TAG); - if (!label || security_check_context_raw((security_context_t) label)) + if (!label || security_check_context_raw(label)) { - security_context_t unlabeled; + char *unlabeled; if (security_get_initial_context_raw("unlabeled", &unlabeled) < 0) ereport(ERROR, @@ -487,7 +487,7 @@ sepgsql_object_relabel(const ObjectAddress *object, const char *seclabel) * context of selinux. */ if (seclabel && - security_check_context_raw((security_context_t) seclabel) < 0) + security_check_context_raw(seclabel) < 0) ereport(ERROR, (errcode(ERRCODE_INVALID_NAME), errmsg("SELinux: invalid security label: \"%s\"", seclabel))); @@ -725,7 +725,7 @@ exec_object_restorecon(struct selabel_handle *sehnd, Oid catalogId) char *objname; int objtype = 1234; ObjectAddress object; - security_context_t context; + char *context; /* * The way to determine object name depends on object classes. 
So, any diff --git a/contrib/sepgsql/selinux.c b/contrib/sepgsql/selinux.c index 9fdc810f2ed4..2695e88f23c9 100644 --- a/contrib/sepgsql/selinux.c +++ b/contrib/sepgsql/selinux.c @@ -768,8 +768,8 @@ sepgsql_compute_avd(const char *scontext, * Ask SELinux what is allowed set of permissions on a pair of the * security contexts and the given object class. */ - if (security_compute_av_flags_raw((security_context_t) scontext, - (security_context_t) tcontext, + if (security_compute_av_flags_raw(scontext, + tcontext, tclass_ex, 0, &avd_ex) < 0) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -838,7 +838,7 @@ sepgsql_compute_create(const char *scontext, uint16 tclass, const char *objname) { - security_context_t ncontext; + char *ncontext; security_class_t tclass_ex; const char *tclass_name; char *result; @@ -853,8 +853,8 @@ sepgsql_compute_create(const char *scontext, * Ask SELinux what is the default context for the given object class on a * pair of security contexts */ - if (security_compute_create_name_raw((security_context_t) scontext, - (security_context_t) tcontext, + if (security_compute_create_name_raw(scontext, + tcontext, tclass_ex, objname, &ncontext) < 0) diff --git a/contrib/sepgsql/uavc.c b/contrib/sepgsql/uavc.c index 639a52c5567b..97189b7c46f0 100644 --- a/contrib/sepgsql/uavc.c +++ b/contrib/sepgsql/uavc.c @@ -171,7 +171,7 @@ sepgsql_avc_unlabeled(void) { if (!avc_unlabeled) { - security_context_t unlabeled; + char *unlabeled; if (security_get_initial_context_raw("unlabeled", &unlabeled) < 0) ereport(ERROR, @@ -216,7 +216,7 @@ sepgsql_avc_compute(const char *scontext, const char *tcontext, uint16 tclass) * policy is reloaded, validation status shall be kept, so we also cache * whether the supplied security context was valid, or not. 
*/ - if (security_check_context_raw((security_context_t) tcontext) != 0) + if (security_check_context_raw(tcontext) != 0) ucontext = sepgsql_avc_unlabeled(); /* From 5bdf694568ef0b9eebef32002a9ebc1918dd0b4b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 14 Aug 2020 10:40:50 +0300 Subject: [PATCH 293/334] Fix typo in test comment. --- src/test/regress/expected/stats_ext.out | 2 +- src/test/regress/sql/stats_ext.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 0ae779a3b974..8c667d786a21 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -292,7 +292,7 @@ SELECT s.stxkind, d.stxdndistinct {d,f,m} | {"3, 4": 2550, "3, 6": 800, "4, 6": 1632, "3, 4, 6": 5000} (1 row) --- correct esimates +-- correct estimates SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); estimated | actual -----------+-------- diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 2834a902a70c..f8d947af9e80 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -201,7 +201,7 @@ SELECT s.stxkind, d.stxdndistinct WHERE s.stxrelid = 'ndistinct'::regclass AND d.stxoid = s.oid; --- correct esimates +-- correct estimates SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c'); From 0038f943878286ce84b2dfac10d64e00eab02edd Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 14 Aug 2020 13:26:57 -0400 Subject: [PATCH 294/334] Fix postmaster's behavior during smart shutdown. Up to now, upon receipt of a SIGTERM ("smart shutdown" command), the postmaster has immediately killed all "optional" background processes, and subsequently refused to launch new ones while it's waiting for foreground client processes to exit. 
No doubt this seemed like an OK policy at some point; but it's a pretty bad one now, because it makes for a seriously degraded environment for the remaining clients: * Parallel queries are killed, and new ones fail to launch. (And our parallel-query infrastructure utterly fails to deal with the case in a reasonable way --- it just hangs waiting for workers that are not going to arrive. There is more work needed in that area IMO.) * Autovacuum ceases to function. We can tolerate that for awhile, but if bulk-update queries continue to run in the surviving client sessions, there's eventually going to be a mess. In the worst case the system could reach a forced shutdown to prevent XID wraparound. * The bgwriter and walwriter are also stopped immediately, likely resulting in performance degradation. Hence, let's rearrange things so that the only immediate change in behavior is refusing to let in new normal connections. Once the last normal connection is gone, shut everything down as though we'd received a "fast" shutdown. To implement this, remove the PM_WAIT_BACKUP and PM_WAIT_READONLY states, instead staying in PM_RUN or PM_HOT_STANDBY while normal connections remain. A subsidiary state variable tracks whether or not we're letting in new connections in those states. This also allows having just one copy of the logic for killing child processes in smart and fast shutdown modes. I moved that logic into PostmasterStateMachine() by inventing a new state PM_STOP_BACKENDS. Back-patch to 9.6 where parallel query was added. In principle this'd be a good idea in 9.5 as well, but the risk/reward ratio is not as good there, since lack of autovacuum is not a problem during typical uses of smart shutdown. Per report from Bharath Rupireddy. 
Patch by me, reviewed by Thomas Munro Discussion: https://postgr.es/m/CALj2ACXAZ5vKxT9P7P89D87i3MDO9bfS+_bjMHgnWJs8uwUOOw@mail.gmail.com --- doc/src/sgml/ref/pg_ctl-ref.sgml | 4 +- src/backend/postmaster/postmaster.c | 239 ++++++++++++++-------------- src/backend/utils/init/postinit.c | 2 +- src/include/libpq/libpq-be.h | 2 +- 4 files changed, 126 insertions(+), 121 deletions(-) diff --git a/doc/src/sgml/ref/pg_ctl-ref.sgml b/doc/src/sgml/ref/pg_ctl-ref.sgml index e31275a04e27..3946fa52eab7 100644 --- a/doc/src/sgml/ref/pg_ctl-ref.sgml +++ b/doc/src/sgml/ref/pg_ctl-ref.sgml @@ -185,8 +185,8 @@ PostgreSQL documentation mode shuts down the server that is running in the specified data directory. Three different shutdown methods can be selected with the - option. Smart mode waits for all active - clients to disconnect and any online backup to finish. + option. Smart mode disallows new connections, then waits + for all existing clients to disconnect and any online backup to finish. If the server is in hot standby, recovery and streaming replication will be terminated once all clients have disconnected. Fast mode (the default) does not wait for clients to disconnect and diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 38e2c16ac206..42223c0f61e2 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -148,8 +148,6 @@ #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */ #define BACKEND_TYPE_ALL 0x000F /* OR of all the above */ -#define BACKEND_TYPE_WORKER (BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER) - /* * List of active backends (or child processes anyway; we don't actually * know whether a given child has become a backend or is still in the @@ -304,8 +302,7 @@ static bool FatalError = false; /* T if recovering from backend crash */ * and we switch to PM_RUN state. * * Normal child backends can only be launched when we are in PM_RUN or - * PM_HOT_STANDBY state. 
(We also allow launch of normal - * child backends in PM_WAIT_BACKUP state, but only for superusers.) + * PM_HOT_STANDBY state. (connsAllowed can also restrict launching.) * In other states we handle connection requests by launching "dead_end" * child processes, which will simply send the client an error message and * quit. (We track these in the BackendList so that we can know when they @@ -319,10 +316,10 @@ static bool FatalError = false; /* T if recovering from backend crash */ * * Notice that this state variable does not distinguish *why* we entered * states later than PM_RUN --- Shutdown and FatalError must be consulted - * to find that out. FatalError is never true in PM_RECOVERY_* or PM_RUN - * states, nor in PM_SHUTDOWN states (because we don't enter those states - * when trying to recover from a crash). It can be true in PM_STARTUP state, - * because we don't clear it until we've successfully started WAL redo. + * to find that out. FatalError is never true in PM_RECOVERY, PM_HOT_STANDBY, + * or PM_RUN states, nor in PM_SHUTDOWN states (because we don't enter those + * states when trying to recover from a crash). It can be true in PM_STARTUP + * state, because we don't clear it until we've successfully started WAL redo. */ typedef enum { @@ -331,8 +328,7 @@ typedef enum PM_RECOVERY, /* in archive recovery mode */ PM_HOT_STANDBY, /* in hot standby mode */ PM_RUN, /* normal "database is alive" state */ - PM_WAIT_BACKUP, /* waiting for online backup mode to end */ - PM_WAIT_READONLY, /* waiting for read only backends to exit */ + PM_STOP_BACKENDS, /* need to stop remaining backends */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ PM_SHUTDOWN, /* waiting for checkpointer to do shutdown * ckpt */ @@ -344,6 +340,21 @@ typedef enum static PMState pmState = PM_INIT; +/* + * While performing a "smart shutdown", we restrict new connections but stay + * in PM_RUN or PM_HOT_STANDBY state until all the client backends are gone. 
+ * connsAllowed is a sub-state indicator showing the active restriction. + * It is of no interest unless pmState is PM_RUN or PM_HOT_STANDBY. + */ +typedef enum +{ + ALLOW_ALL_CONNS, /* normal not-shutting-down state */ + ALLOW_SUPERUSER_CONNS, /* only superusers can connect */ + ALLOW_NO_CONNS /* no new connections allowed, period */ +} ConnsAllowedState; + +static ConnsAllowedState connsAllowed = ALLOW_ALL_CONNS; + /* Start time of SIGKILL timeout during immediate shutdown or child crash */ /* Zero means timeout is not running */ static time_t AbortStartTime = 0; @@ -2323,7 +2334,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("sorry, too many clients already"))); break; - case CAC_WAITBACKUP: + case CAC_SUPERUSER: /* OK for now, will check in InitPostgres */ break; case CAC_OK: @@ -2443,31 +2454,36 @@ canAcceptConnections(int backend_type) * state. We treat autovac workers the same as user backends for this * purpose. However, bgworkers are excluded from this test; we expect * bgworker_should_start_now() decided whether the DB state allows them. - * - * In state PM_WAIT_BACKUP only superusers can connect (this must be - * allowed so that a superuser can end online backup mode); we return - * CAC_WAITBACKUP code to indicate that this must be checked later. Note - * that neither CAC_OK nor CAC_WAITBACKUP can safely be returned until we - * have checked for too many children. 
*/ - if (pmState != PM_RUN && + if (pmState != PM_RUN && pmState != PM_HOT_STANDBY && backend_type != BACKEND_TYPE_BGWORKER) { - if (pmState == PM_WAIT_BACKUP) - result = CAC_WAITBACKUP; /* allow superusers only */ - else if (Shutdown > NoShutdown) + if (Shutdown > NoShutdown) return CAC_SHUTDOWN; /* shutdown is pending */ else if (!FatalError && (pmState == PM_STARTUP || pmState == PM_RECOVERY)) return CAC_STARTUP; /* normal startup */ - else if (!FatalError && - pmState == PM_HOT_STANDBY) - result = CAC_OK; /* connection OK during hot standby */ else return CAC_RECOVERY; /* else must be crash recovery */ } + /* + * "Smart shutdown" restrictions are applied only to normal connections, + * not to autovac workers or bgworkers. When only superusers can connect, + * we return CAC_SUPERUSER to indicate that superuserness must be checked + * later. Note that neither CAC_OK nor CAC_SUPERUSER can safely be + * returned until we have checked for too many children. + */ + if (connsAllowed != ALLOW_ALL_CONNS && + backend_type == BACKEND_TYPE_NORMAL) + { + if (connsAllowed == ALLOW_SUPERUSER_CONNS) + result = CAC_SUPERUSER; /* allow superusers only */ + else + return CAC_SHUTDOWN; /* shutdown is pending */ + } + /* * Don't start too many children. * @@ -2793,34 +2809,22 @@ pmdie(SIGNAL_ARGS) sd_notify(0, "STOPPING=1"); #endif - if (pmState == PM_RUN || pmState == PM_RECOVERY || - pmState == PM_HOT_STANDBY || pmState == PM_STARTUP) + /* + * If we reached normal running, we have to wait for any online + * backup mode to end; otherwise go straight to waiting for client + * backends to exit. (The difference is that in the former state, + * we'll still let in new superuser clients, so that somebody can + * end the online backup mode.) If already in PM_STOP_BACKENDS or + * a later state, do not change it. 
+ */ + if (pmState == PM_RUN) + connsAllowed = ALLOW_SUPERUSER_CONNS; + else if (pmState == PM_HOT_STANDBY) + connsAllowed = ALLOW_NO_CONNS; + else if (pmState == PM_STARTUP || pmState == PM_RECOVERY) { - /* autovac workers are told to shut down immediately */ - /* and bgworkers too; does this need tweaking? */ - SignalSomeChildren(SIGTERM, - BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER); - /* and the autovac launcher too */ - if (AutoVacPID != 0) - signal_child(AutoVacPID, SIGTERM); - /* and the bgwriter too */ - if (BgWriterPID != 0) - signal_child(BgWriterPID, SIGTERM); - /* and the walwriter too */ - if (WalWriterPID != 0) - signal_child(WalWriterPID, SIGTERM); - - /* - * If we're in recovery, we can't kill the startup process - * right away, because at present doing so does not release - * its locks. We might want to change this in a future - * release. For the time being, the PM_WAIT_READONLY state - * indicates that we're waiting for the regular (read only) - * backends to die off; once they do, we'll kill the startup - * and walreceiver processes. - */ - pmState = (pmState == PM_RUN) ? - PM_WAIT_BACKUP : PM_WAIT_READONLY; + /* There should be no clients, so proceed to stop children */ + pmState = PM_STOP_BACKENDS; } /* @@ -2851,48 +2855,23 @@ pmdie(SIGNAL_ARGS) sd_notify(0, "STOPPING=1"); #endif - if (StartupPID != 0) - signal_child(StartupPID, SIGTERM); - if (BgWriterPID != 0) - signal_child(BgWriterPID, SIGTERM); - if (WalReceiverPID != 0) - signal_child(WalReceiverPID, SIGTERM); if (pmState == PM_STARTUP || pmState == PM_RECOVERY) { - SignalSomeChildren(SIGTERM, BACKEND_TYPE_BGWORKER); - - /* - * Only startup, bgwriter, walreceiver, possibly bgworkers, - * and/or checkpointer should be active in this state; we just - * signaled the first four, and we don't want to kill - * checkpointer yet. 
- */ - pmState = PM_WAIT_BACKENDS; + /* Just shut down background processes silently */ + pmState = PM_STOP_BACKENDS; } else if (pmState == PM_RUN || - pmState == PM_WAIT_BACKUP || - pmState == PM_WAIT_READONLY || - pmState == PM_WAIT_BACKENDS || pmState == PM_HOT_STANDBY) { + /* Report that we're about to zap live client sessions */ ereport(LOG, (errmsg("aborting any active transactions"))); - /* shut down all backends and workers */ - SignalSomeChildren(SIGTERM, - BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC | - BACKEND_TYPE_BGWORKER); - /* and the autovac launcher too */ - if (AutoVacPID != 0) - signal_child(AutoVacPID, SIGTERM); - /* and the walwriter too */ - if (WalWriterPID != 0) - signal_child(WalWriterPID, SIGTERM); - pmState = PM_WAIT_BACKENDS; + pmState = PM_STOP_BACKENDS; } /* - * Now wait for backends to exit. If there are none, - * PostmasterStateMachine will take the next step. + * PostmasterStateMachine will issue any necessary signals, or + * take the next step if no child processes need to be killed. 
*/ PostmasterStateMachine(); break; @@ -2987,7 +2966,7 @@ reaper(SIGNAL_ARGS) ereport(LOG, (errmsg("shutdown at recovery target"))); StartupStatus = STARTUP_NOT_RUNNING; - Shutdown = SmartShutdown; + Shutdown = Max(Shutdown, SmartShutdown); TerminateChildren(SIGTERM); pmState = PM_WAIT_BACKENDS; /* PostmasterStateMachine logic does the rest */ @@ -3051,6 +3030,7 @@ reaper(SIGNAL_ARGS) AbortStartTime = 0; ReachedNormalRunning = true; pmState = PM_RUN; + connsAllowed = ALLOW_ALL_CONNS; /* * Crank up the background tasks, if we didn't do that already @@ -3712,8 +3692,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) if (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY || pmState == PM_RUN || - pmState == PM_WAIT_BACKUP || - pmState == PM_WAIT_READONLY || + pmState == PM_STOP_BACKENDS || pmState == PM_SHUTDOWN) pmState = PM_WAIT_BACKENDS; @@ -3796,35 +3775,60 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus) static void PostmasterStateMachine(void) { - if (pmState == PM_WAIT_BACKUP) + /* If we're doing a smart shutdown, try to advance that state. */ + if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) { - /* - * PM_WAIT_BACKUP state ends when online backup mode is not active. - */ - if (!BackupInProgress()) - pmState = PM_WAIT_BACKENDS; - } + if (connsAllowed == ALLOW_SUPERUSER_CONNS) + { + /* + * ALLOW_SUPERUSER_CONNS state ends as soon as online backup mode + * is not active. + */ + if (!BackupInProgress()) + connsAllowed = ALLOW_NO_CONNS; + } - if (pmState == PM_WAIT_READONLY) - { - /* - * PM_WAIT_READONLY state ends when we have no regular backends that - * have been started during recovery. We kill the startup and - * walreceiver processes and transition to PM_WAIT_BACKENDS. Ideally, - * we might like to kill these processes first and then wait for - * backends to die off, but that doesn't work at present because - * killing the startup process doesn't release its locks. 
- */ - if (CountChildren(BACKEND_TYPE_NORMAL) == 0) + if (connsAllowed == ALLOW_NO_CONNS) { - if (StartupPID != 0) - signal_child(StartupPID, SIGTERM); - if (WalReceiverPID != 0) - signal_child(WalReceiverPID, SIGTERM); - pmState = PM_WAIT_BACKENDS; + /* + * ALLOW_NO_CONNS state ends when we have no normal client + * backends running. Then we're ready to stop other children. + */ + if (CountChildren(BACKEND_TYPE_NORMAL) == 0) + pmState = PM_STOP_BACKENDS; } } + /* + * If we're ready to do so, signal child processes to shut down. (This + * isn't a persistent state, but treating it as a distinct pmState allows + * us to share this code across multiple shutdown code paths.) + */ + if (pmState == PM_STOP_BACKENDS) + { + /* Signal all backend children except walsenders */ + SignalSomeChildren(SIGTERM, + BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND); + /* and the autovac launcher too */ + if (AutoVacPID != 0) + signal_child(AutoVacPID, SIGTERM); + /* and the bgwriter too */ + if (BgWriterPID != 0) + signal_child(BgWriterPID, SIGTERM); + /* and the walwriter too */ + if (WalWriterPID != 0) + signal_child(WalWriterPID, SIGTERM); + /* If we're in recovery, also stop startup and walreceiver procs */ + if (StartupPID != 0) + signal_child(StartupPID, SIGTERM); + if (WalReceiverPID != 0) + signal_child(WalReceiverPID, SIGTERM); + /* checkpointer, archiver, stats, and syslogger may continue for now */ + + /* Now transition to PM_WAIT_BACKENDS state to wait for them to die */ + pmState = PM_WAIT_BACKENDS; + } + /* * If we are in a state-machine state that implies waiting for backends to * exit, see if they're all gone, and change state if so. @@ -3843,7 +3847,7 @@ PostmasterStateMachine(void) * later after writing the checkpoint record, like the archiver * process. 
*/ - if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_WORKER) == 0 && + if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 && StartupPID == 0 && WalReceiverPID == 0 && BgWriterPID == 0 && @@ -4184,7 +4188,7 @@ BackendStartup(Port *port) /* Pass down canAcceptConnections state */ port->canAcceptConnections = canAcceptConnections(BACKEND_TYPE_NORMAL); bn->dead_end = (port->canAcceptConnections != CAC_OK && - port->canAcceptConnections != CAC_WAITBACKUP); + port->canAcceptConnections != CAC_SUPERUSER); /* * Unless it's a dead_end child, assign it a child slot number @@ -5255,6 +5259,8 @@ sigusr1_handler(SIGNAL_ARGS) #endif pmState = PM_HOT_STANDBY; + connsAllowed = ALLOW_ALL_CONNS; + /* Some workers may be scheduled to start now */ StartWorkerNeeded = true; } @@ -5287,7 +5293,7 @@ sigusr1_handler(SIGNAL_ARGS) } if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER) && - Shutdown == NoShutdown) + Shutdown <= SmartShutdown && pmState < PM_STOP_BACKENDS) { /* * Start one iteration of the autovacuum daemon, even if autovacuuming @@ -5302,7 +5308,7 @@ sigusr1_handler(SIGNAL_ARGS) } if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER) && - Shutdown == NoShutdown) + Shutdown <= SmartShutdown && pmState < PM_STOP_BACKENDS) { /* The autovacuum launcher wants us to start a worker process. 
*/ StartAutovacuumWorker(); @@ -5333,7 +5339,7 @@ sigusr1_handler(SIGNAL_ARGS) if (StartupPID != 0 && (pmState == PM_STARTUP || pmState == PM_RECOVERY || - pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) && + pmState == PM_HOT_STANDBY) && CheckPromoteSignal()) { /* @@ -5651,8 +5657,8 @@ MaybeStartWalReceiver(void) { if (WalReceiverPID == 0 && (pmState == PM_STARTUP || pmState == PM_RECOVERY || - pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) && - Shutdown == NoShutdown) + pmState == PM_HOT_STANDBY) && + Shutdown <= SmartShutdown) { WalReceiverPID = StartWalReceiver(); if (WalReceiverPID != 0) @@ -5905,8 +5911,7 @@ bgworker_should_start_now(BgWorkerStartTime start_time) case PM_SHUTDOWN_2: case PM_SHUTDOWN: case PM_WAIT_BACKENDS: - case PM_WAIT_READONLY: - case PM_WAIT_BACKUP: + case PM_STOP_BACKENDS: break; case PM_RUN: diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 893be2f3ddbf..d4ab4c7e2333 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -795,7 +795,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, */ if ((!am_superuser || am_walsender) && MyProcPort != NULL && - MyProcPort->canAcceptConnections == CAC_WAITBACKUP) + MyProcPort->canAcceptConnections == CAC_SUPERUSER) { if (am_walsender) ereport(FATAL, diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index 179ebaa104b3..0a23281ad59b 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -71,7 +71,7 @@ typedef struct typedef enum CAC_state { CAC_OK, CAC_STARTUP, CAC_SHUTDOWN, CAC_RECOVERY, CAC_TOOMANY, - CAC_WAITBACKUP + CAC_SUPERUSER } CAC_state; From 914140e85a79c63853c86334afa2d7e6e930c11a Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 14 Aug 2020 11:09:08 -0700 Subject: [PATCH 295/334] Fix obsolete comment in xlogutils.c. Oversight in commit 2c03216d831. 
--- src/backend/access/transam/xlogutils.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index b2ca0cd4cf39..7e915bcadf10 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -260,10 +260,9 @@ XLogCheckInvalidPages(void) * determines what needs to be done to redo the changes to it. If the WAL * record includes a full-page image of the page, it is restored. * - * 'lsn' is the LSN of the record being replayed. It is compared with the - * page's LSN to determine if the record has already been replayed. - * 'block_id' is the ID number the block was registered with, when the WAL - * record was created. + * 'record.EndRecPtr' is compared to the page's LSN to determine if the record + * has already been replayed. 'block_id' is the ID number the block was + * registered with, when the WAL record was created. * * Returns one of the following: * From 2ba5b2db7943742e100834d99548c5d2661a105b Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Fri, 14 Aug 2020 17:33:31 -0400 Subject: [PATCH 296/334] pg_dump: fix dependencies on FKs to partitioned tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parallel-restoring a foreign key that references a partitioned table with several levels of partitions can fail: pg_restore: while PROCESSING TOC: pg_restore: from TOC entry 6684; 2606 29166 FK CONSTRAINT fk fk_a_fkey postgres pg_restore: error: could not execute query: ERROR: there is no unique constraint matching given keys for referenced table "pk" Command was: ALTER TABLE fkpart3.fk ADD CONSTRAINT fk_a_fkey FOREIGN KEY (a) REFERENCES fkpart3.pk(a); This happens in parallel restore mode because some index partitions aren't yet attached to the topmost partitioned index that the FK uses, and so the index is still invalid. 
The current code marks the FK as dependent on the first level of index-attach dump objects; the bug is fixed by recursively marking the FK on their children. Backpatch to 12, where FKs to partitioned tables were introduced. Reported-by: Tom Lane Author: Álvaro Herrera Discussion: https://postgr.es/m/3170626.1594842723@sss.pgh.pa.us Backpatch: 12-master --- src/bin/pg_dump/pg_dump.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 9c8436dde6cc..2cb3f9b083ec 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -235,6 +235,7 @@ static DumpableObject *createBoundaryObjects(void); static void addBoundaryDependencies(DumpableObject **dobjs, int numObjs, DumpableObject *boundaryObjs); +static void addConstrChildIdxDeps(DumpableObject *dobj, IndxInfo *refidx); static void getDomainConstraints(Archive *fout, TypeInfo *tyinfo); static void getTableData(DumpOptions *dopt, TableInfo *tblinfo, int numTables, char relkind); static void makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo); @@ -7517,25 +7518,20 @@ getConstraints(Archive *fout, TableInfo tblinfo[], int numTables) reftable = findTableByOid(constrinfo[j].confrelid); if (reftable && reftable->relkind == RELKIND_PARTITIONED_TABLE) { - IndxInfo *refidx; Oid indexOid = atooid(PQgetvalue(res, j, i_conindid)); if (indexOid != InvalidOid) { for (int k = 0; k < reftable->numIndexes; k++) { - SimplePtrListCell *cell; + IndxInfo *refidx; /* not our index? 
*/ if (reftable->indexes[k].dobj.catId.oid != indexOid) continue; refidx = &reftable->indexes[k]; - for (cell = refidx->partattaches.head; cell; - cell = cell->next) - addObjectDependency(&constrinfo[j].dobj, - ((DumpableObject *) - cell->ptr)->dumpId); + addConstrChildIdxDeps(&constrinfo[j].dobj, refidx); break; } } @@ -7548,6 +7544,35 @@ getConstraints(Archive *fout, TableInfo tblinfo[], int numTables) destroyPQExpBuffer(query); } +/* + * addConstrChildIdxDeps + * + * Recursive subroutine for getConstraints + * + * Given an object representing a foreign key constraint and an index on the + * partitioned table it references, mark the constraint object as dependent + * on the DO_INDEX_ATTACH object of each index partition, recursively + * drilling down to their partitions if any. This ensures that the FK is not + * restored until the index is fully marked valid. + */ +static void +addConstrChildIdxDeps(DumpableObject *dobj, IndxInfo *refidx) +{ + SimplePtrListCell *cell; + + Assert(dobj->objType == DO_FK_CONSTRAINT); + + for (cell = refidx->partattaches.head; cell; cell = cell->next) + { + IndexAttachInfo *attach = (IndexAttachInfo *) cell->ptr; + + addObjectDependency(dobj, attach->dobj.dumpId); + + if (attach->partitionIdx->partattaches.head != NULL) + addConstrChildIdxDeps(dobj, attach->partitionIdx); + } +} + /* * getDomainConstraints * From 941697c3c1ae5d6ee153065adb96e1e63ee11224 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 14 Aug 2020 12:15:38 -0700 Subject: [PATCH 297/334] snapshot scalability: Introduce dense array of in-progress xids. The new array contains the xids for all connected backends / in-use PGPROC entries in a dense manner (in contrast to the PGPROC/PGXACT arrays which can have unused entries interspersed). This improves performance because GetSnapshotData() always needs to scan the xids of all live procarray entries and now there's no need to go through the procArray->pgprocnos indirection anymore. 
As the set of running top-level xids changes rarely, compared to the
number of snapshots taken, this substantially increases the likelihood
of most data required for a snapshot being in l2 cache.  In
read-mostly workloads scanning the xids[] array will be sufficient to
build a snapshot, as most backends will not have an xid assigned.

To keep the xid array dense ProcArrayRemove() needs to move entries
behind the to-be-removed proc's one further up in the array. Obviously
moving array entries cannot happen while a backend sets its
xid. I.e. locking needs to prevent that array entries are moved while
a backend modifies its xid.

To avoid locking ProcArrayLock in GetNewTransactionId() - a fairly hot
spot already - ProcArrayAdd() / ProcArrayRemove() now needs to hold
XidGenLock in addition to ProcArrayLock. Adding / Removing a procarray
entry is not a very frequent operation, even taking 2PC into account.

Due to the above, the dense array entries can only be read or modified
while holding ProcArrayLock and/or XidGenLock. This prevents a
concurrent ProcArrayRemove() from shifting the dense array while it is
accessed concurrently.

While the new dense array is very good when needing to look at all
xids it is less suitable when accessing a single backend's xid. In
particular it would be problematic to have to acquire a lock to access
a backend's own xid. Therefore a backend's xid is not just stored in
the dense array, but also in PGPROC. This also allows a backend to
only access the shared xid value when the backend had acquired an xid.

The infrastructure added in this commit will be used for the remaining
PGXACT fields in subsequent commits. They are kept separate to make
review easier. 
Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/access/heap/heapam_visibility.c | 8 +- src/backend/access/transam/README | 29 ++- src/backend/access/transam/clog.c | 8 +- src/backend/access/transam/twophase.c | 31 +-- src/backend/access/transam/varsup.c | 20 +- src/backend/commands/vacuum.c | 2 +- src/backend/storage/ipc/procarray.c | 271 +++++++++++++------- src/backend/storage/ipc/sinvaladt.c | 4 +- src/backend/storage/lmgr/lock.c | 3 +- src/backend/storage/lmgr/proc.c | 26 +- src/include/storage/proc.h | 79 +++++- 11 files changed, 327 insertions(+), 154 deletions(-) diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 528e75bafd45..80bd4940769c 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -11,12 +11,12 @@ * shared buffer content lock on the buffer containing the tuple. * * NOTE: When using a non-MVCC snapshot, we must check - * TransactionIdIsInProgress (which looks in the PGXACT array) + * TransactionIdIsInProgress (which looks in the PGPROC array) * before TransactionIdDidCommit/TransactionIdDidAbort (which look in * pg_xact). Otherwise we have a race condition: we might decide that a * just-committed transaction crashed, because none of the tests succeed. * xact.c is careful to record commit/abort in pg_xact before it unsets - * MyPgXact->xid in the PGXACT array. That fixes that problem, but it + * MyProc->xid in the PGPROC array. That fixes that problem, but it * also means there is a window where TransactionIdIsInProgress and * TransactionIdDidCommit will both return true. 
If we check only * TransactionIdDidCommit, we could consider a tuple committed when a @@ -956,7 +956,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, * coding where we tried to set the hint bits as soon as possible, we instead * did TransactionIdIsInProgress in each call --- to no avail, as long as the * inserting/deleting transaction was still running --- which was more cycles - * and more contention on the PGXACT array. + * and more contention on ProcArrayLock. */ static bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, @@ -1459,7 +1459,7 @@ HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, * HeapTupleSatisfiesMVCC) and, therefore, any hint bits that can be set * should already be set. We assume that if no hint bits are set, the xmin * or xmax transaction is still running. This is therefore faster than - * HeapTupleSatisfiesVacuum, because we don't consult PGXACT nor CLOG. + * HeapTupleSatisfiesVacuum, because we consult neither procarray nor CLOG. * It's okay to return false when in doubt, but we must return true only * if the tuple is removable. */ diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index eab8edd20ec2..c5f09667ba15 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -251,10 +251,10 @@ enforce, and it assists with some other issues as explained below.) The implementation of this is that GetSnapshotData takes the ProcArrayLock in shared mode (so that multiple backends can take snapshots in parallel), but ProcArrayEndTransaction must take the ProcArrayLock in exclusive mode -while clearing MyPgXact->xid at transaction end (either commit or abort). -(To reduce context switching, when multiple transactions commit nearly -simultaneously, we have one backend take ProcArrayLock and clear the XIDs -of multiple processes at once.) +while clearing the ProcGlobal->xids[] entry at transaction end (either +commit or abort). 
(To reduce context switching, when multiple transactions +commit nearly simultaneously, we have one backend take ProcArrayLock and +clear the XIDs of multiple processes at once.) ProcArrayEndTransaction also holds the lock while advancing the shared latestCompletedXid variable. This allows GetSnapshotData to use @@ -278,12 +278,12 @@ present in the ProcArray, or not running anymore. (This guarantee doesn't apply to subtransaction XIDs, because of the possibility that there's not room for them in the subxid array; instead we guarantee that they are present or the overflow flag is set.) If a backend released XidGenLock -before storing its XID into MyPgXact, then it would be possible for another -backend to allocate and commit a later XID, causing latestCompletedXid to -pass the first backend's XID, before that value became visible in the +before storing its XID into ProcGlobal->xids[], then it would be possible for +another backend to allocate and commit a later XID, causing latestCompletedXid +to pass the first backend's XID, before that value became visible in the ProcArray. That would break ComputeXidHorizons, as discussed below. -We allow GetNewTransactionId to store the XID into MyPgXact->xid (or the +We allow GetNewTransactionId to store the XID into ProcGlobal->xids[] (or the subxid array) without taking ProcArrayLock. This was once necessary to avoid deadlock; while that is no longer the case, it's still beneficial for performance. We are thereby relying on fetch/store of an XID to be atomic, @@ -382,12 +382,13 @@ Top-level transactions do not have a parent, so they leave their pg_subtrans entries set to the default value of zero (InvalidTransactionId). pg_subtrans is used to check whether the transaction in question is still -running --- the main Xid of a transaction is recorded in the PGXACT struct, -but since we allow arbitrary nesting of subtransactions, we can't fit all Xids -in shared memory, so we have to store them on disk. 
Note, however, that for -each transaction we keep a "cache" of Xids that are known to be part of the -transaction tree, so we can skip looking at pg_subtrans unless we know the -cache has been overflowed. See storage/ipc/procarray.c for the gory details. +running --- the main Xid of a transaction is recorded in ProcGlobal->xids[], +with a copy in PGPROC->xid, but since we allow arbitrary nesting of +subtransactions, we can't fit all Xids in shared memory, so we have to store +them on disk. Note, however, that for each transaction we keep a "cache" of +Xids that are known to be part of the transaction tree, so we can skip looking +at pg_subtrans unless we know the cache has been overflowed. See +storage/ipc/procarray.c for the gory details. slru.c is the supporting mechanism for both pg_xact and pg_subtrans. It implements the LRU policy for in-memory buffer pages. The high-level routines diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index dd2f4d5bc7e7..a4599e966106 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -285,15 +285,15 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, * updates for multiple backends so that the number of times XactSLRULock * needs to be acquired is reduced. * - * For this optimization to be safe, the XID in MyPgXact and the subxids - * in MyProc must be the same as the ones for which we're setting the - * status. Check that this is the case. + * For this optimization to be safe, the XID and subxids in MyProc must be + * the same as the ones for which we're setting the status. Check that + * this is the case. * * For this optimization to be efficient, we shouldn't have too many * sub-XIDs and all of the XIDs for which we're adjusting clog should be * on the same page. Check those conditions, too. 
*/ - if (all_xact_same_page && xid == MyPgXact->xid && + if (all_xact_same_page && xid == MyProc->xid && nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT && nsubxids == MyPgXact->nxids && memcmp(subxids, MyProc->subxids.xids, diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index eb5f4680a3d9..a0398bf3a3e8 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -351,7 +351,7 @@ AtAbort_Twophase(void) /* * This is called after we have finished transferring state to the prepared - * PGXACT entry. + * PGPROC entry. */ void PostPrepare_Twophase(void) @@ -463,7 +463,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->waitStatus = PROC_WAIT_STATUS_OK; /* We set up the gxact's VXID as InvalidBackendId/XID */ proc->lxid = (LocalTransactionId) xid; - pgxact->xid = xid; + proc->xid = xid; Assert(proc->xmin == InvalidTransactionId); proc->delayChkpt = false; pgxact->vacuumFlags = 0; @@ -768,7 +768,6 @@ pg_prepared_xact(PG_FUNCTION_ARGS) { GlobalTransaction gxact = &status->array[status->currIdx++]; PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; - PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; Datum values[5]; bool nulls[5]; HeapTuple tuple; @@ -783,7 +782,7 @@ pg_prepared_xact(PG_FUNCTION_ARGS) MemSet(values, 0, sizeof(values)); MemSet(nulls, 0, sizeof(nulls)); - values[0] = TransactionIdGetDatum(pgxact->xid); + values[0] = TransactionIdGetDatum(proc->xid); values[1] = CStringGetTextDatum(gxact->gid); values[2] = TimestampTzGetDatum(gxact->prepared_at); values[3] = ObjectIdGetDatum(gxact->owner); @@ -829,9 +828,8 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; - PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; - if (pgxact->xid == xid) + if (gxact->xid == xid) { result = gxact; break; @@ -987,8 +985,7 @@ void 
StartPrepare(GlobalTransaction gxact) { PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; - PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; - TransactionId xid = pgxact->xid; + TransactionId xid = gxact->xid; TwoPhaseFileHeader hdr; TransactionId *children; RelFileNode *commitrels; @@ -1140,15 +1137,15 @@ EndPrepare(GlobalTransaction gxact) /* * Mark the prepared transaction as valid. As soon as xact.c marks - * MyPgXact as not running our XID (which it will do immediately after + * MyProc as not running our XID (which it will do immediately after * this function returns), others can commit/rollback the xact. * * NB: a side effect of this is to make a dummy ProcArray entry for the - * prepared XID. This must happen before we clear the XID from MyPgXact, - * else there is a window where the XID is not running according to - * TransactionIdIsInProgress, and onlookers would be entitled to assume - * the xact crashed. Instead we have a window where the same XID appears - * twice in ProcArray, which is OK. + * prepared XID. This must happen before we clear the XID from MyProc / + * ProcGlobal->xids[], else there is a window where the XID is not running + * according to TransactionIdIsInProgress, and onlookers would be entitled + * to assume the xact crashed. Instead we have a window where the same + * XID appears twice in ProcArray, which is OK. */ MarkAsPrepared(gxact, false); @@ -1404,7 +1401,6 @@ FinishPreparedTransaction(const char *gid, bool isCommit) { GlobalTransaction gxact; PGPROC *proc; - PGXACT *pgxact; TransactionId xid; char *buf; char *bufptr; @@ -1423,8 +1419,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) */ gxact = LockGXact(gid, GetUserId()); proc = &ProcGlobal->allProcs[gxact->pgprocno]; - pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; - xid = pgxact->xid; + xid = gxact->xid; /* * Read and validate 2PC state data. 
State data will typically be stored @@ -1726,7 +1721,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { /* - * Note that we are using gxact not pgxact so this works in recovery + * Note that we are using gxact not PGPROC so this works in recovery * also */ GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 2ef0f4991caf..4c91b343ecd2 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -38,7 +38,8 @@ VariableCache ShmemVariableCache = NULL; * Allocate the next FullTransactionId for a new transaction or * subtransaction. * - * The new XID is also stored into MyPgXact before returning. + * The new XID is also stored into MyProc->xid/ProcGlobal->xids[] before + * returning. * * Note: when this is called, we are actually already inside a valid * transaction, since XIDs are now not allocated until the transaction @@ -65,7 +66,8 @@ GetNewTransactionId(bool isSubXact) if (IsBootstrapProcessingMode()) { Assert(!isSubXact); - MyPgXact->xid = BootstrapTransactionId; + MyProc->xid = BootstrapTransactionId; + ProcGlobal->xids[MyProc->pgxactoff] = BootstrapTransactionId; return FullTransactionIdFromEpochAndXid(0, BootstrapTransactionId); } @@ -190,10 +192,10 @@ GetNewTransactionId(bool isSubXact) * latestCompletedXid is present in the ProcArray, which is essential for * correct OldestXmin tracking; see src/backend/access/transam/README. * - * Note that readers of PGXACT xid fields should be careful to fetch the - * value only once, rather than assume they can read a value multiple - * times and get the same answer each time. Note we are assuming that - * TransactionId and int fetch/store are atomic. 
+ * Note that readers of ProcGlobal->xids/PGPROC->xid should be careful + * to fetch the value for each proc only once, rather than assume they can + * read a value multiple times and get the same answer each time. Note we + * are assuming that TransactionId and int fetch/store are atomic. * * The same comments apply to the subxact xid count and overflow fields. * @@ -219,7 +221,11 @@ GetNewTransactionId(bool isSubXact) * answer later on when someone does have a reason to inquire.) */ if (!isSubXact) - MyPgXact->xid = xid; /* LWLockRelease acts as barrier */ + { + /* LWLockRelease acts as barrier */ + MyProc->xid = xid; + ProcGlobal->xids[MyProc->pgxactoff] = xid; + } else { int nxids = MyPgXact->nxids; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 22228f5684f0..648e12c78d84 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1724,7 +1724,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params) * * Note: these flags remain set until CommitTransaction or * AbortTransaction. We don't want to clear them until we reset - * MyPgXact->xid/xmin, otherwise GetOldestNonRemovableTransactionId() + * MyProc->xid/xmin, otherwise GetOldestNonRemovableTransactionId() * might appear to go backwards, which is probably Not Good. */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 185f581c8b6f..0bf20a49375d 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -9,8 +9,9 @@ * one is as a means of determining the set of currently running transactions. * * Because of various subtle race conditions it is critical that a backend - * hold the correct locks while setting or clearing its MyPgXact->xid field. - * See notes in src/backend/access/transam/README. + * hold the correct locks while setting or clearing its xid (in + * ProcGlobal->xids[]/MyProc->xid). 
See notes in + * src/backend/access/transam/README. * * The process arrays now also include structures representing prepared * transactions. The xid and subxids fields of these are valid, as are the @@ -436,7 +437,9 @@ ProcArrayAdd(PGPROC *proc) ProcArrayStruct *arrayP = procArray; int index; + /* See ProcGlobal comment explaining why both locks are held */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); if (arrayP->numProcs >= arrayP->maxProcs) { @@ -445,7 +448,6 @@ ProcArrayAdd(PGPROC *proc) * fixed supply of PGPROC structs too, and so we should have failed * earlier.) */ - LWLockRelease(ProcArrayLock); ereport(FATAL, (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("sorry, too many clients already"))); @@ -471,10 +473,25 @@ ProcArrayAdd(PGPROC *proc) } memmove(&arrayP->pgprocnos[index + 1], &arrayP->pgprocnos[index], - (arrayP->numProcs - index) * sizeof(int)); + (arrayP->numProcs - index) * sizeof(*arrayP->pgprocnos)); + memmove(&ProcGlobal->xids[index + 1], &ProcGlobal->xids[index], + (arrayP->numProcs - index) * sizeof(*ProcGlobal->xids)); + arrayP->pgprocnos[index] = proc->pgprocno; + ProcGlobal->xids[index] = proc->xid; + arrayP->numProcs++; + for (; index < arrayP->numProcs; index++) + { + allProcs[arrayP->pgprocnos[index]].pgxactoff = index; + } + + /* + * Release in reversed acquisition order, to reduce frequency of having to + * wait for XidGenLock while holding ProcArrayLock. 
+ */ + LWLockRelease(XidGenLock); LWLockRelease(ProcArrayLock); } @@ -500,36 +517,58 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) DisplayXidCache(); #endif + /* See ProcGlobal comment explaining why both locks are held */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + Assert(ProcGlobal->allProcs[arrayP->pgprocnos[proc->pgxactoff]].pgxactoff == proc->pgxactoff); if (TransactionIdIsValid(latestXid)) { - Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); + Assert(TransactionIdIsValid(ProcGlobal->xids[proc->pgxactoff])); /* Advance global latestCompletedXid while holding the lock */ MaintainLatestCompletedXid(latestXid); + + ProcGlobal->xids[proc->pgxactoff] = 0; } else { /* Shouldn't be trying to remove a live transaction here */ - Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); + Assert(!TransactionIdIsValid(ProcGlobal->xids[proc->pgxactoff])); } + Assert(TransactionIdIsValid(ProcGlobal->xids[proc->pgxactoff] == 0)); + for (index = 0; index < arrayP->numProcs; index++) { if (arrayP->pgprocnos[index] == proc->pgprocno) { /* Keep the PGPROC array sorted. See notes above */ memmove(&arrayP->pgprocnos[index], &arrayP->pgprocnos[index + 1], - (arrayP->numProcs - index - 1) * sizeof(int)); + (arrayP->numProcs - index - 1) * sizeof(*arrayP->pgprocnos)); + memmove(&ProcGlobal->xids[index], &ProcGlobal->xids[index + 1], + (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->xids)); + arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */ arrayP->numProcs--; + + /* adjust for removed PGPROC */ + for (; index < arrayP->numProcs; index++) + allProcs[arrayP->pgprocnos[index]].pgxactoff--; + + /* + * Release in reversed acquisition order, to reduce frequency of + * having to wait for XidGenLock while holding ProcArrayLock. 
+ */ + LWLockRelease(XidGenLock); LWLockRelease(ProcArrayLock); return; } } /* Oops */ + LWLockRelease(XidGenLock); LWLockRelease(ProcArrayLock); elog(LOG, "failed to find proc %p in ProcArray", proc); @@ -562,7 +601,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * else is taking a snapshot. See discussion in * src/backend/access/transam/README. */ - Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); + Assert(TransactionIdIsValid(proc->xid)); /* * If we can immediately acquire ProcArrayLock, we clear our own XID @@ -584,7 +623,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * anyone else's calculation of a snapshot. We might change their * estimate of global xmin, but that's OK. */ - Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); + Assert(!TransactionIdIsValid(proc->xid)); proc->lxid = InvalidLocalTransactionId; /* must be cleared with xid/xmin: */ @@ -607,7 +646,13 @@ static inline void ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, TransactionId latestXid) { - pgxact->xid = InvalidTransactionId; + size_t pgxactoff = proc->pgxactoff; + + Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff])); + Assert(ProcGlobal->xids[pgxactoff] == proc->xid); + + ProcGlobal->xids[pgxactoff] = InvalidTransactionId; + proc->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; /* must be cleared with xid/xmin: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; @@ -643,7 +688,7 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) uint32 wakeidx; /* We should definitely have an XID to clear. */ - Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); + Assert(TransactionIdIsValid(proc->xid)); /* Add ourselves to the list of processes needing a group XID clear. */ proc->procArrayGroupMember = true; @@ -748,20 +793,28 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) * This is used after successfully preparing a 2-phase transaction. 
We are * not actually reporting the transaction's XID as no longer running --- it * will still appear as running because the 2PC's gxact is in the ProcArray - * too. We just have to clear out our own PGXACT. + * too. We just have to clear out our own PGPROC. */ void ProcArrayClearTransaction(PGPROC *proc) { PGXACT *pgxact = &allPgXact[proc->pgprocno]; + size_t pgxactoff; /* - * We can skip locking ProcArrayLock here, because this action does not - * actually change anyone's view of the set of running XIDs: our entry is - * duplicate with the gxact that has already been inserted into the - * ProcArray. + * We can skip locking ProcArrayLock exclusively here, because this action + * does not actually change anyone's view of the set of running XIDs: our + * entry is duplicate with the gxact that has already been inserted into + * the ProcArray. But need it in shared mode for pgproc->pgxactoff to stay + * the same. */ - pgxact->xid = InvalidTransactionId; + LWLockAcquire(ProcArrayLock, LW_SHARED); + + pgxactoff = proc->pgxactoff; + + ProcGlobal->xids[pgxactoff] = InvalidTransactionId; + proc->xid = InvalidTransactionId; + proc->lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; proc->recoveryConflictPending = false; @@ -773,6 +826,8 @@ ProcArrayClearTransaction(PGPROC *proc) /* Clear the subtransaction-XID cache too */ pgxact->nxids = 0; pgxact->overflowed = false; + + LWLockRelease(ProcArrayLock); } /* @@ -1167,7 +1222,7 @@ ProcArrayApplyXidAssignment(TransactionId topxid, * there are four possibilities for finding a running transaction: * * 1. The given Xid is a main transaction Id. We will find this out cheaply - * by looking at the PGXACT struct for each backend. + * by looking at ProcGlobal->xids. * * 2. The given Xid is one of the cached subxact Xids in the PGPROC array. * We can find this out cheaply too. @@ -1176,26 +1231,28 @@ ProcArrayApplyXidAssignment(TransactionId topxid, * if the Xid is running on the primary. * * 4. 
Search the SubTrans tree to find the Xid's topmost parent, and then see - * if that is running according to PGXACT or KnownAssignedXids. This is the - * slowest way, but sadly it has to be done always if the others failed, - * unless we see that the cached subxact sets are complete (none have + * if that is running according to ProcGlobal->xids[] or KnownAssignedXids. + * This is the slowest way, but sadly it has to be done always if the others + * failed, unless we see that the cached subxact sets are complete (none have * overflowed). * * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids * while doing 1 and 3, we can release the ProcArrayLock while we do 4. * This buys back some concurrency (and we can't retrieve the main Xids from - * PGXACT again anyway; see GetNewTransactionId). + * ProcGlobal->xids[] again anyway; see GetNewTransactionId). */ bool TransactionIdIsInProgress(TransactionId xid) { static TransactionId *xids = NULL; + static TransactionId *other_xids; int nxids = 0; ProcArrayStruct *arrayP = procArray; TransactionId topxid; TransactionId latestCompletedXid; - int i, - j; + int mypgxactoff; + size_t numProcs; + int j; /* * Don't bother checking a transaction older than RecentXmin; it could not @@ -1250,6 +1307,8 @@ TransactionIdIsInProgress(TransactionId xid) errmsg("out of memory"))); } + other_xids = ProcGlobal->xids; + LWLockAcquire(ProcArrayLock, LW_SHARED); /* @@ -1266,20 +1325,22 @@ TransactionIdIsInProgress(TransactionId xid) } /* No shortcuts, gotta grovel through the array */ - for (i = 0; i < arrayP->numProcs; i++) + mypgxactoff = MyProc->pgxactoff; + numProcs = arrayP->numProcs; + for (size_t pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) { - int pgprocno = arrayP->pgprocnos[i]; - PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; + int pgprocno; + PGXACT *pgxact; + PGPROC *proc; TransactionId pxid; int pxids; - /* Ignore my own proc --- dealt with it above */ - if (proc == MyProc) + 
/* Ignore ourselves --- dealt with it above */ + if (pgxactoff == mypgxactoff) continue; /* Fetch xid just once - see GetNewTransactionId */ - pxid = UINT32_ACCESS_ONCE(pgxact->xid); + pxid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); if (!TransactionIdIsValid(pxid)) continue; @@ -1304,8 +1365,12 @@ TransactionIdIsInProgress(TransactionId xid) /* * Step 2: check the cached child-Xids arrays */ + pgprocno = arrayP->pgprocnos[pgxactoff]; + pgxact = &allPgXact[pgprocno]; pxids = pgxact->nxids; pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */ + pgprocno = arrayP->pgprocnos[pgxactoff]; + proc = &allProcs[pgprocno]; for (j = pxids - 1; j >= 0; j--) { /* Fetch xid just once - see GetNewTransactionId */ @@ -1336,7 +1401,7 @@ TransactionIdIsInProgress(TransactionId xid) */ if (RecoveryInProgress()) { - /* none of the PGXACT entries should have XIDs in hot standby mode */ + /* none of the PGPROC entries should have XIDs in hot standby mode */ Assert(nxids == 0); if (KnownAssignedXidExists(xid)) @@ -1391,7 +1456,7 @@ TransactionIdIsInProgress(TransactionId xid) Assert(TransactionIdIsValid(topxid)); if (!TransactionIdEquals(topxid, xid)) { - for (i = 0; i < nxids; i++) + for (int i = 0; i < nxids; i++) { if (TransactionIdEquals(xids[i], topxid)) return true; @@ -1414,6 +1479,7 @@ TransactionIdIsActive(TransactionId xid) { bool result = false; ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; int i; /* @@ -1429,11 +1495,10 @@ TransactionIdIsActive(TransactionId xid) { int pgprocno = arrayP->pgprocnos[i]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId pxid; /* Fetch xid just once - see GetNewTransactionId */ - pxid = UINT32_ACCESS_ONCE(pgxact->xid); + pxid = UINT32_ACCESS_ONCE(other_xids[i]); if (!TransactionIdIsValid(pxid)) continue; @@ -1519,6 +1584,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) ProcArrayStruct *arrayP = procArray; TransactionId kaxmin; bool in_recovery = 
RecoveryInProgress(); + TransactionId *other_xids = ProcGlobal->xids; /* inferred after ProcArrayLock is released */ h->catalog_oldest_nonremovable = InvalidTransactionId; @@ -1562,7 +1628,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) TransactionId xmin; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(pgxact->xid); + xid = UINT32_ACCESS_ONCE(other_xids[pgprocno]); xmin = UINT32_ACCESS_ONCE(proc->xmin); /* @@ -1852,14 +1918,17 @@ Snapshot GetSnapshotData(Snapshot snapshot) { ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; TransactionId xmin; TransactionId xmax; - int index; - int count = 0; + size_t count = 0; int subcount = 0; bool suboverflowed = false; FullTransactionId latest_completed; TransactionId oldestxid; + int mypgxactoff; + TransactionId myxid; + TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; @@ -1904,6 +1973,10 @@ GetSnapshotData(Snapshot snapshot) LWLockAcquire(ProcArrayLock, LW_SHARED); latest_completed = ShmemVariableCache->latestCompletedXid; + mypgxactoff = MyProc->pgxactoff; + myxid = other_xids[mypgxactoff]; + Assert(myxid == MyProc->xid); + oldestxid = ShmemVariableCache->oldestXid; /* xmax is always latestCompletedXid + 1 */ @@ -1914,57 +1987,79 @@ GetSnapshotData(Snapshot snapshot) /* initialize xmin calculation with xmax */ xmin = xmax; + /* take own xid into account, saves a check inside the loop */ + if (TransactionIdIsNormal(myxid) && NormalTransactionIdPrecedes(myxid, xmin)) + xmin = myxid; + snapshot->takenDuringRecovery = RecoveryInProgress(); if (!snapshot->takenDuringRecovery) { + size_t numProcs = arrayP->numProcs; + TransactionId *xip = snapshot->xip; int *pgprocnos = arrayP->pgprocnos; - int numProcs; /* - * Spin over procArray checking xid, xmin, and subxids. The goal is - * to gather all active xids, find the lowest xmin, and try to record - * subxids. 
+ * First collect set of pgxactoff/xids that need to be included in the + * snapshot. */ - numProcs = arrayP->numProcs; - for (index = 0; index < numProcs; index++) + for (size_t pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) { - int pgprocno = pgprocnos[index]; - PGXACT *pgxact = &allPgXact[pgprocno]; - TransactionId xid; + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); + int pgprocno; + PGXACT *pgxact; + uint8 vacuumFlags; + + Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); /* - * Skip over backends doing logical decoding which manages xmin - * separately (check below) and ones running LAZY VACUUM. + * If the transaction has no XID assigned, we can skip it; it + * won't have sub-XIDs either. */ - if (pgxact->vacuumFlags & - (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) + if (likely(xid == InvalidTransactionId)) continue; - /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(pgxact->xid); + /* + * We don't include our own XIDs (if any) in the snapshot. It + * needs to be included in the xmin computation, but we did so + * outside the loop. + */ + if (pgxactoff == mypgxactoff) + continue; /* - * If the transaction has no XID assigned, we can skip it; it - * won't have sub-XIDs either. If the XID is >= xmax, we can also - * skip it; such transactions will be treated as running anyway - * (and any sub-XIDs will also be >= xmax). + * The only way we are able to get here with a non-normal xid + * is during bootstrap - with this backend using + * BootstrapTransactionId. But the above test should filter + * that out. */ - if (!TransactionIdIsNormal(xid) - || !NormalTransactionIdPrecedes(xid, xmax)) + Assert(TransactionIdIsNormal(xid)); + + /* + * If the XID is >= xmax, we can skip it; such transactions will + * be treated as running anyway (and any sub-XIDs will also be >= + * xmax). 
+ */ + if (!NormalTransactionIdPrecedes(xid, xmax)) continue; + pgprocno = pgprocnos[pgxactoff]; + pgxact = &allPgXact[pgprocno]; + vacuumFlags = pgxact->vacuumFlags; + /* - * We don't include our own XIDs (if any) in the snapshot, but we - * must include them in xmin. + * Skip over backends doing logical decoding which manages xmin + * separately (check below) and ones running LAZY VACUUM. */ + if (vacuumFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) + continue; + if (NormalTransactionIdPrecedes(xid, xmin)) xmin = xid; - if (pgxact == MyPgXact) - continue; /* Add XID to snapshot. */ - snapshot->xip[count++] = xid; + xip[count++] = xid; /* * Save subtransaction XIDs if possible (if we've already @@ -1987,9 +2082,9 @@ GetSnapshotData(Snapshot snapshot) suboverflowed = true; else { - int nxids = pgxact->nxids; + int nsubxids = pgxact->nxids; - if (nxids > 0) + if (nsubxids > 0) { PGPROC *proc = &allProcs[pgprocno]; @@ -1997,8 +2092,8 @@ GetSnapshotData(Snapshot snapshot) memcpy(snapshot->subxip + subcount, (void *) proc->subxids.xids, - nxids * sizeof(TransactionId)); - subcount += nxids; + nsubxids * sizeof(TransactionId)); + subcount += nsubxids; } } } @@ -2130,6 +2225,7 @@ GetSnapshotData(Snapshot snapshot) } RecentXmin = xmin; + Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin)); snapshot->xmin = xmin; snapshot->xmax = xmax; @@ -2292,7 +2388,7 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) * GetRunningTransactionData -- returns information about running transactions. * * Similar to GetSnapshotData but returns more information. We include - * all PGXACTs with an assigned TransactionId, even VACUUM processes and + * all PGPROCs with an assigned TransactionId, even VACUUM processes and * prepared transactions. 
* * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for @@ -2307,7 +2403,7 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) * This is never executed during recovery so there is no need to look at * KnownAssignedXids. * - * Dummy PGXACTs from prepared transaction are included, meaning that this + * Dummy PGPROCs from prepared transaction are included, meaning that this * may return entries with duplicated TransactionId values coming from * transaction finishing to prepare. Nothing is done about duplicated * entries here to not hold on ProcArrayLock more than necessary. @@ -2326,6 +2422,7 @@ GetRunningTransactionData(void) static RunningTransactionsData CurrentRunningXactsData; ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData; TransactionId latestCompletedXid; TransactionId oldestRunningXid; @@ -2386,7 +2483,7 @@ GetRunningTransactionData(void) TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(pgxact->xid); + xid = UINT32_ACCESS_ONCE(other_xids[index]); /* * We don't need to store transactions that don't have a TransactionId @@ -2483,7 +2580,7 @@ GetRunningTransactionData(void) * GetOldestActiveTransactionId() * * Similar to GetSnapshotData but returns just oldestActiveXid. We include - * all PGXACTs with an assigned TransactionId, even VACUUM processes. + * all PGPROCs with an assigned TransactionId, even VACUUM processes. * We look at all databases, though there is no need to include WALSender * since this has no effect on hot standby conflicts. 
* @@ -2498,6 +2595,7 @@ TransactionId GetOldestActiveTransactionId(void) { ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; TransactionId oldestRunningXid; int index; @@ -2520,12 +2618,10 @@ GetOldestActiveTransactionId(void) LWLockAcquire(ProcArrayLock, LW_SHARED); for (index = 0; index < arrayP->numProcs; index++) { - int pgprocno = arrayP->pgprocnos[index]; - PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(pgxact->xid); + xid = UINT32_ACCESS_ONCE(other_xids[index]); if (!TransactionIdIsNormal(xid)) continue; @@ -2603,8 +2699,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * If we're not in recovery, we walk over the procarray and collect the * lowest xid. Since we're called with ProcArrayLock held and have * acquired XidGenLock, no entries can vanish concurrently, since - * PGXACT->xid is only set with XidGenLock held and only cleared with - * ProcArrayLock held. + * ProcGlobal->xids[i] is only set with XidGenLock held and only cleared + * with ProcArrayLock held. * * In recovery we can't lower the safe value besides what we've computed * above, so we'll have to wait a bit longer there. 
We unfortunately can @@ -2613,17 +2709,17 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) */ if (!recovery_in_progress) { + TransactionId *other_xids = ProcGlobal->xids; + /* - * Spin over procArray collecting all min(PGXACT->xid) + * Spin over procArray collecting min(ProcGlobal->xids[i]) */ for (index = 0; index < arrayP->numProcs; index++) { - int pgprocno = arrayP->pgprocnos[index]; - PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(pgxact->xid); + xid = UINT32_ACCESS_ONCE(other_xids[index]); if (!TransactionIdIsNormal(xid)) continue; @@ -2811,6 +2907,7 @@ BackendXidGetPid(TransactionId xid) { int result = 0; ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; int index; if (xid == InvalidTransactionId) /* never match invalid xid */ @@ -2822,9 +2919,8 @@ BackendXidGetPid(TransactionId xid) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; - if (pgxact->xid == xid) + if (other_xids[index] == xid) { result = proc->pid; break; @@ -3104,7 +3200,6 @@ MinimumActiveBackends(int min) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; /* * Since we're not holding a lock, need to be prepared to deal with @@ -3121,7 +3216,7 @@ MinimumActiveBackends(int min) continue; /* do not count deleted entries */ if (proc == MyProc) continue; /* do not count myself */ - if (pgxact->xid == InvalidTransactionId) + if (proc->xid == InvalidTransactionId) continue; /* do not count if no XID assigned */ if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3547,8 +3642,8 @@ XidCacheRemoveRunningXids(TransactionId xid, * * Note that we do not have to be careful about memory ordering of our own * reads wrt. GetNewTransactionId() here - only this process can modify - * relevant fields of MyProc/MyPgXact. 
But we do have to be careful about - * our own writes being well ordered. + * relevant fields of MyProc/ProcGlobal->xids[]. But we do have to be + * careful about our own writes being well ordered. */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); @@ -3906,7 +4001,7 @@ FullXidRelativeTo(FullTransactionId rel, TransactionId xid) * In Hot Standby mode, we maintain a list of transactions that are (or were) * running on the primary at the current point in WAL. These XIDs must be * treated as running by standby transactions, even though they are not in - * the standby server's PGXACT array. + * the standby server's PGPROC array. * * We record all XIDs that we know have been assigned. That includes all the * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c index ad048bc85fab..a9477ccb4a30 100644 --- a/src/backend/storage/ipc/sinvaladt.c +++ b/src/backend/storage/ipc/sinvaladt.c @@ -417,9 +417,7 @@ BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmi if (proc != NULL) { - PGXACT *xact = &ProcGlobal->allPgXact[proc->pgprocno]; - - *xid = xact->xid; + *xid = proc->xid; *xmin = proc->xmin; } } diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 95989ce79bd6..d86566f4554b 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -3974,9 +3974,8 @@ GetRunningTransactionLocks(int *nlocks) proclock->tag.myLock->tag.locktag_type == LOCKTAG_RELATION) { PGPROC *proc = proclock->tag.myProc; - PGXACT *pgxact = &ProcGlobal->allPgXact[proc->pgprocno]; LOCK *lock = proclock->tag.myLock; - TransactionId xid = pgxact->xid; + TransactionId xid = proc->xid; /* * Don't record locks for transactions if we know they have diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index de346cd87fcd..7fad49544ce0 100644 --- a/src/backend/storage/lmgr/proc.c +++ 
b/src/backend/storage/lmgr/proc.c @@ -102,21 +102,18 @@ Size ProcGlobalShmemSize(void) { Size size = 0; + Size TotalProcs = + add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts)); /* ProcGlobal */ size = add_size(size, sizeof(PROC_HDR)); - /* MyProcs, including autovacuum workers and launcher */ - size = add_size(size, mul_size(MaxBackends, sizeof(PGPROC))); - /* AuxiliaryProcs */ - size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGPROC))); - /* Prepared xacts */ - size = add_size(size, mul_size(max_prepared_xacts, sizeof(PGPROC))); - /* ProcStructLock */ + size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC))); size = add_size(size, sizeof(slock_t)); size = add_size(size, mul_size(MaxBackends, sizeof(PGXACT))); size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGXACT))); size = add_size(size, mul_size(max_prepared_xacts, sizeof(PGXACT))); + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); return size; } @@ -216,6 +213,17 @@ InitProcGlobal(void) MemSet(pgxacts, 0, TotalProcs * sizeof(PGXACT)); ProcGlobal->allPgXact = pgxacts; + /* + * Allocate arrays mirroring PGPROC fields in a dense manner. See + * PROC_HDR. + * + * XXX: It might make sense to increase padding for these arrays, given + * how hotly they are accessed. + */ + ProcGlobal->xids = + (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids)); + MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids)); + for (i = 0; i < TotalProcs; i++) { /* Common initialization for all PGPROCs, regardless of type. 
*/ @@ -387,7 +395,7 @@ InitProcess(void) MyProc->lxid = InvalidLocalTransactionId; MyProc->fpVXIDLock = false; MyProc->fpLocalTransactionId = InvalidLocalTransactionId; - MyPgXact->xid = InvalidTransactionId; + MyProc->xid = InvalidTransactionId; MyProc->xmin = InvalidTransactionId; MyProc->pid = MyProcPid; /* backendId, databaseId and roleId will be filled in later */ @@ -571,7 +579,7 @@ InitAuxiliaryProcess(void) MyProc->lxid = InvalidLocalTransactionId; MyProc->fpVXIDLock = false; MyProc->fpLocalTransactionId = InvalidLocalTransactionId; - MyPgXact->xid = InvalidTransactionId; + MyProc->xid = InvalidTransactionId; MyProc->xmin = InvalidTransactionId; MyProc->backendId = InvalidBackendId; MyProc->databaseId = InvalidOid; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 5e4b028a5f98..e29ed85e53db 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -89,6 +89,17 @@ typedef enum * distinguished from a real one at need by the fact that it has pid == 0. * The semaphore and lock-activity fields in a prepared-xact PGPROC are unused, * but its myProcLocks[] lists are valid. + * + * Mirrored fields: + * + * Some fields in PGPROC (see "mirrored in ..." comment) are mirrored into an + * element of more densely packed ProcGlobal arrays. These arrays are indexed + * by PGPROC->pgxactoff. Both copies need to be maintained coherently. + * + * NB: The pgxactoff indexed value can *never* be accessed without holding + * locks. + * + * See PROC_HDR for details. */ struct PGPROC { @@ -101,6 +112,12 @@ struct PGPROC Latch procLatch; /* generic latch for process */ + + TransactionId xid; /* id of top-level transaction currently being + * executed by this proc, if running and XID + * is assigned; else InvalidTransactionId. 
+ * mirrored in ProcGlobal->xids[pgxactoff] */ + TransactionId xmin; /* minimal running XID as it was when we were * starting our xact, excluding LAZY VACUUM: * vacuum must not remove tuples deleted by @@ -110,6 +127,9 @@ struct PGPROC * being executed by this proc, if running; * else InvalidLocalTransactionId */ int pid; /* Backend's process ID; 0 if prepared xact */ + + int pgxactoff; /* offset into various ProcGlobal->arrays + * with data mirrored from this PGPROC */ int pgprocno; /* These fields are zero while a backend is still starting up: */ @@ -224,10 +244,6 @@ extern PGDLLIMPORT struct PGXACT *MyPgXact; */ typedef struct PGXACT { - TransactionId xid; /* id of top-level transaction currently being - * executed by this proc, if running and XID - * is assigned; else InvalidTransactionId */ - uint8 vacuumFlags; /* vacuum-related flags, see above */ bool overflowed; @@ -236,6 +252,57 @@ typedef struct PGXACT /* * There is one ProcGlobal struct for the whole database cluster. + * + * Adding/Removing an entry into the procarray requires holding *both* + * ProcArrayLock and XidGenLock in exclusive mode (in that order). Both are + * needed because the dense arrays (see below) are accessed from + * GetNewTransactionId() and GetSnapshotData(), and we don't want to add + * further contention by both using the same lock. Adding/Removing a procarray + * entry is much less frequent. + * + * Some fields in PGPROC are mirrored into more densely packed arrays (e.g. + * xids), with one entry for each backend. These arrays only contain entries + * for PGPROCs that have been added to the shared array with ProcArrayAdd() + * (in contrast to PGPROC array which has unused PGPROCs interspersed). + * + * The dense arrays are indexed by PGPROC->pgxactoff. Any concurrent + * ProcArrayAdd() / ProcArrayRemove() can lead to pgxactoff of a procarray + * member to change. 
Therefore it is only safe to use PGPROC->pgxactoff to + * access the dense array while holding either ProcArrayLock or XidGenLock. + * + * As long as a PGPROC is in the procarray, the mirrored values need to be + * maintained in both places in a coherent manner. + * + * The denser separate arrays are beneficial for three main reasons: First, to + * allow for as tight loops accessing the data as possible. Second, to prevent + * updates of frequently changing data (e.g. xmin) from invalidating + * cachelines also containing less frequently changing data (e.g. xid, + * vacuumFlags). Third to condense frequently accessed data into as few + * cachelines as possible. + * + * There are two main reasons to have the data mirrored between these dense + * arrays and PGPROC. First, as explained above, a PGPROC's array entries can + * only be accessed with either ProcArrayLock or XidGenLock held, whereas the + * PGPROC entries do not require that (obviously there may still be locking + * requirements around the individual field, separate from the concerns + * here). That is particularly important for a backend to efficiently checks + * it own values, which it often can safely do without locking. Second, the + * PGPROC fields allow to avoid unnecessary accesses and modification to the + * dense arrays. A backend's own PGPROC is more likely to be in a local cache, + * whereas the cachelines for the dense array will be modified by other + * backends (often removing it from the cache for other cores/sockets). At + * commit/abort time a check of the PGPROC value can avoid accessing/dirtying + * the corresponding array value. + * + * Basically it makes sense to access the PGPROC variable when checking a + * single backend's data, especially when already looking at the PGPROC for + * other reasons already. 
It makes sense to look at the "dense" arrays if we + * need to look at many / most entries, because we then benefit from the + * reduced indirection and better cross-process cache-ability. + * + * When entering a PGPROC for 2PC transactions with ProcArrayAdd(), the data + * in the dense arrays is initialized from the PGPROC while it already holds + * ProcArrayLock. */ typedef struct PROC_HDR { @@ -243,6 +310,10 @@ typedef struct PROC_HDR PGPROC *allProcs; /* Array of PGXACT structures (not including dummies for prepared txns) */ PGXACT *allPgXact; + + /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ + TransactionId *xids; + /* Length of allProcs array */ uint32 allProcCount; /* Head of list of free PGPROC structures */ From 5788e258bb26495fab65ff3aa486268d1c50b123 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 15 Jul 2020 15:35:07 -0700 Subject: [PATCH 298/334] snapshot scalability: Move PGXACT->vacuumFlags to ProcGlobal->vacuumFlags. Similar to the previous commit this increases the chance that data frequently needed by GetSnapshotData() stays in l2 cache. As we now take care to not unnecessarily write to ProcGlobal->vacuumFlags, there should be very few modifications to the ProcGlobal->vacuumFlags array. 
Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/access/transam/twophase.c | 2 +- src/backend/commands/vacuum.c | 5 +- src/backend/postmaster/autovacuum.c | 6 +-- src/backend/replication/logical/logical.c | 3 +- src/backend/replication/slot.c | 3 +- src/backend/storage/ipc/procarray.c | 66 +++++++++++++++-------- src/backend/storage/lmgr/deadlock.c | 4 +- src/backend/storage/lmgr/proc.c | 16 +++--- src/include/storage/proc.h | 12 ++++- 9 files changed, 75 insertions(+), 42 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index a0398bf3a3e8..744b8a7f3935 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -466,7 +466,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->xid = xid; Assert(proc->xmin == InvalidTransactionId); proc->delayChkpt = false; - pgxact->vacuumFlags = 0; + proc->vacuumFlags = 0; proc->pid = 0; proc->backendId = InvalidBackendId; proc->databaseId = databaseid; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 648e12c78d84..aba13c31d1bc 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1728,9 +1728,10 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params) * might appear to go backwards, which is probably Not Good. 
*/ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyPgXact->vacuumFlags |= PROC_IN_VACUUM; + MyProc->vacuumFlags |= PROC_IN_VACUUM; if (params->is_wraparound) - MyPgXact->vacuumFlags |= PROC_VACUUM_FOR_WRAPAROUND; + MyProc->vacuumFlags |= PROC_VACUUM_FOR_WRAPAROUND; + ProcGlobal->vacuumFlags[MyProc->pgxactoff] = MyProc->vacuumFlags; LWLockRelease(ProcArrayLock); } diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index ac97e28be19c..c6ec657a9367 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -2493,7 +2493,7 @@ do_autovacuum(void) tab->at_datname, tab->at_nspname, tab->at_relname); EmitErrorReport(); - /* this resets the PGXACT flags too */ + /* this resets ProcGlobal->vacuumFlags[i] too */ AbortOutOfAnyTransaction(); FlushErrorState(); MemoryContextResetAndDeleteChildren(PortalContext); @@ -2509,7 +2509,7 @@ do_autovacuum(void) did_vacuum = true; - /* the PGXACT flags are reset at the next end of transaction */ + /* ProcGlobal->vacuumFlags[i] are reset at the next end of xact */ /* be tidy */ deleted: @@ -2686,7 +2686,7 @@ perform_work_item(AutoVacuumWorkItem *workitem) cur_datname, cur_nspname, cur_relname); EmitErrorReport(); - /* this resets the PGXACT flags too */ + /* this resets ProcGlobal->vacuumFlags[i] too */ AbortOutOfAnyTransaction(); FlushErrorState(); MemoryContextResetAndDeleteChildren(PortalContext); diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 57c5b513ccf8..0f6af952f939 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -181,7 +181,8 @@ StartupDecodingContext(List *output_plugin_options, if (!IsTransactionOrTransactionBlock()) { LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyPgXact->vacuumFlags |= PROC_IN_LOGICAL_DECODING; + MyProc->vacuumFlags |= PROC_IN_LOGICAL_DECODING; + ProcGlobal->vacuumFlags[MyProc->pgxactoff] = MyProc->vacuumFlags; 
LWLockRelease(ProcArrayLock); } diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 3dc01b6df22a..42c78eabd4eb 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -520,7 +520,8 @@ ReplicationSlotRelease(void) /* might not have been set when we've been a plain slot */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyPgXact->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING; + MyProc->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING; + ProcGlobal->vacuumFlags[MyProc->pgxactoff] = MyProc->vacuumFlags; LWLockRelease(ProcArrayLock); } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 0bf20a49375d..224da4f9510b 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -476,9 +476,12 @@ ProcArrayAdd(PGPROC *proc) (arrayP->numProcs - index) * sizeof(*arrayP->pgprocnos)); memmove(&ProcGlobal->xids[index + 1], &ProcGlobal->xids[index], (arrayP->numProcs - index) * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->vacuumFlags[index + 1], &ProcGlobal->vacuumFlags[index], + (arrayP->numProcs - index) * sizeof(*ProcGlobal->vacuumFlags)); arrayP->pgprocnos[index] = proc->pgprocno; ProcGlobal->xids[index] = proc->xid; + ProcGlobal->vacuumFlags[index] = proc->vacuumFlags; arrayP->numProcs++; @@ -539,6 +542,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) } Assert(TransactionIdIsValid(ProcGlobal->xids[proc->pgxactoff] == 0)); + ProcGlobal->vacuumFlags[proc->pgxactoff] = 0; for (index = 0; index < arrayP->numProcs; index++) { @@ -549,6 +553,8 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) (arrayP->numProcs - index - 1) * sizeof(*arrayP->pgprocnos)); memmove(&ProcGlobal->xids[index], &ProcGlobal->xids[index + 1], (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->vacuumFlags[index], &ProcGlobal->vacuumFlags[index + 1], + (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->vacuumFlags)); 
arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */ arrayP->numProcs--; @@ -626,14 +632,24 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) Assert(!TransactionIdIsValid(proc->xid)); proc->lxid = InvalidLocalTransactionId; - /* must be cleared with xid/xmin: */ - pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; proc->xmin = InvalidTransactionId; proc->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; Assert(pgxact->nxids == 0); Assert(pgxact->overflowed == false); + + /* must be cleared with xid/xmin: */ + /* avoid unnecessarily dirtying shared cachelines */ + if (proc->vacuumFlags & PROC_VACUUM_STATE_MASK) + { + Assert(!LWLockHeldByMe(ProcArrayLock)); + LWLockAcquire(ProcArrayLock, LW_SHARED); + Assert(proc->vacuumFlags == ProcGlobal->vacuumFlags[proc->pgxactoff]); + proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; + ProcGlobal->vacuumFlags[proc->pgxactoff] = proc->vacuumFlags; + LWLockRelease(ProcArrayLock); + } } } @@ -654,12 +670,18 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, ProcGlobal->xids[pgxactoff] = InvalidTransactionId; proc->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; - /* must be cleared with xid/xmin: */ - pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; proc->xmin = InvalidTransactionId; proc->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; + /* must be cleared with xid/xmin: */ + /* avoid unnecessarily dirtying shared cachelines */ + if (proc->vacuumFlags & PROC_VACUUM_STATE_MASK) + { + proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; + ProcGlobal->vacuumFlags[proc->pgxactoff] = proc->vacuumFlags; + } + /* Clear the subtransaction-XID cache too while holding the lock */ pgxact->nxids = 0; pgxact->overflowed = false; @@ -819,9 +841,8 @@ ProcArrayClearTransaction(PGPROC *proc) proc->xmin = InvalidTransactionId; proc->recoveryConflictPending = false; - /* redundant, but just in case */ - 
pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; - proc->delayChkpt = false; + Assert(!(proc->vacuumFlags & PROC_VACUUM_STATE_MASK)); + Assert(!proc->delayChkpt); /* Clear the subtransaction-XID cache too */ pgxact->nxids = 0; @@ -1623,7 +1644,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; + int8 vacuumFlags = ProcGlobal->vacuumFlags[index]; TransactionId xid; TransactionId xmin; @@ -1640,8 +1661,8 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) */ xmin = TransactionIdOlder(xmin, xid); - /* if neither is set, this proc doesn't influence the horizon */ - if (!TransactionIdIsValid(xmin)) + /* if neither is set, this proc doesn't influence the horizon */ + if (!TransactionIdIsValid(xmin)) continue; /* @@ -1658,7 +1679,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) * removed, as long as pg_subtrans is not truncated) or doing logical * decoding (which manages xmin separately, check below). 
*/ - if (pgxact->vacuumFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING)) + if (vacuumFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING)) continue; /* shared tables need to take backends in all database into account */ @@ -1998,6 +2019,7 @@ GetSnapshotData(Snapshot snapshot) size_t numProcs = arrayP->numProcs; TransactionId *xip = snapshot->xip; int *pgprocnos = arrayP->pgprocnos; + uint8 *allVacuumFlags = ProcGlobal->vacuumFlags; /* * First collect set of pgxactoff/xids that need to be included in the @@ -2007,8 +2029,6 @@ GetSnapshotData(Snapshot snapshot) { /* Fetch xid just once - see GetNewTransactionId */ TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); - int pgprocno; - PGXACT *pgxact; uint8 vacuumFlags; Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); @@ -2044,14 +2064,11 @@ GetSnapshotData(Snapshot snapshot) if (!NormalTransactionIdPrecedes(xid, xmax)) continue; - pgprocno = pgprocnos[pgxactoff]; - pgxact = &allPgXact[pgprocno]; - vacuumFlags = pgxact->vacuumFlags; - /* * Skip over backends doing logical decoding which manages xmin * separately (check below) and ones running LAZY VACUUM. */ + vacuumFlags = allVacuumFlags[pgxactoff]; if (vacuumFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) continue; @@ -2078,6 +2095,9 @@ GetSnapshotData(Snapshot snapshot) */ if (!suboverflowed) { + int pgprocno = pgprocnos[pgxactoff]; + PGXACT *pgxact = &allPgXact[pgprocno]; + if (pgxact->overflowed) suboverflowed = true; else @@ -2296,11 +2316,11 @@ ProcArrayInstallImportedXmin(TransactionId xmin, { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; + int vacuumFlags = ProcGlobal->vacuumFlags[index]; TransactionId xid; /* Ignore procs running LAZY VACUUM */ - if (pgxact->vacuumFlags & PROC_IN_VACUUM) + if (vacuumFlags & PROC_IN_VACUUM) continue; /* We are only interested in the specific virtual transaction. 
*/ @@ -2990,12 +3010,12 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; + uint8 vacuumFlags = ProcGlobal->vacuumFlags[index]; if (proc == MyProc) continue; - if (excludeVacuum & pgxact->vacuumFlags) + if (excludeVacuum & vacuumFlags) continue; if (allDbs || proc->databaseId == MyDatabaseId) @@ -3410,7 +3430,7 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; + uint8 vacuumFlags = ProcGlobal->vacuumFlags[index]; if (proc->databaseId != databaseId) continue; @@ -3424,7 +3444,7 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) else { (*nbackends)++; - if ((pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) && + if ((vacuumFlags & PROC_IS_AUTOVACUUM) && nautovacs < MAXAUTOVACPIDS) autovac_pids[nautovacs++] = proc->pid; } diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c index beedc7947db9..e1246b8a4da1 100644 --- a/src/backend/storage/lmgr/deadlock.c +++ b/src/backend/storage/lmgr/deadlock.c @@ -544,7 +544,6 @@ FindLockCycleRecurseMember(PGPROC *checkProc, { PGPROC *proc; LOCK *lock = checkProc->waitLock; - PGXACT *pgxact; PROCLOCK *proclock; SHM_QUEUE *procLocks; LockMethod lockMethodTable; @@ -582,7 +581,6 @@ FindLockCycleRecurseMember(PGPROC *checkProc, PGPROC *leader; proc = proclock->tag.myProc; - pgxact = &ProcGlobal->allPgXact[proc->pgprocno]; leader = proc->lockGroupLeader == NULL ? proc : proc->lockGroupLeader; /* A proc never blocks itself or any other lock group member */ @@ -630,7 +628,7 @@ FindLockCycleRecurseMember(PGPROC *checkProc, * ProcArrayLock. 
*/ if (checkProc == MyProc && - pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) + proc->vacuumFlags & PROC_IS_AUTOVACUUM) blocking_autovacuum_proc = proc; /* We're done looking at this proclock */ diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 7fad49544ce0..f6113b2d2432 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -114,6 +114,7 @@ ProcGlobalShmemSize(void) size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGXACT))); size = add_size(size, mul_size(max_prepared_xacts, sizeof(PGXACT))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->vacuumFlags))); return size; } @@ -223,6 +224,8 @@ InitProcGlobal(void) ProcGlobal->xids = (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids)); MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids)); + ProcGlobal->vacuumFlags = (uint8 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->vacuumFlags)); + MemSet(ProcGlobal->vacuumFlags, 0, TotalProcs * sizeof(*ProcGlobal->vacuumFlags)); for (i = 0; i < TotalProcs; i++) { @@ -405,10 +408,10 @@ InitProcess(void) MyProc->tempNamespaceId = InvalidOid; MyProc->isBackgroundWorker = IsBackgroundWorker; MyProc->delayChkpt = false; - MyPgXact->vacuumFlags = 0; + MyProc->vacuumFlags = 0; /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */ if (IsAutoVacuumWorkerProcess()) - MyPgXact->vacuumFlags |= PROC_IS_AUTOVACUUM; + MyProc->vacuumFlags |= PROC_IS_AUTOVACUUM; MyProc->lwWaiting = false; MyProc->lwWaitMode = 0; MyProc->waitLock = NULL; @@ -587,7 +590,7 @@ InitAuxiliaryProcess(void) MyProc->tempNamespaceId = InvalidOid; MyProc->isBackgroundWorker = IsBackgroundWorker; MyProc->delayChkpt = false; - MyPgXact->vacuumFlags = 0; + MyProc->vacuumFlags = 0; MyProc->lwWaiting = false; MyProc->lwWaitMode = 0; MyProc->waitLock = NULL; @@ -1323,7 +1326,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod 
lockMethodTable) if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel) { PGPROC *autovac = GetBlockingAutoVacuumPgproc(); - PGXACT *autovac_pgxact = &ProcGlobal->allPgXact[autovac->pgprocno]; + uint8 vacuumFlags; LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); @@ -1331,8 +1334,9 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable) * Only do it if the worker is not working to protect against Xid * wraparound. */ - if ((autovac_pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) && - !(autovac_pgxact->vacuumFlags & PROC_VACUUM_FOR_WRAPAROUND)) + vacuumFlags = ProcGlobal->vacuumFlags[proc->pgxactoff]; + if ((vacuumFlags & PROC_IS_AUTOVACUUM) && + !(vacuumFlags & PROC_VACUUM_FOR_WRAPAROUND)) { int pid = autovac->pid; StringInfoData locktagbuf; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index e29ed85e53db..9f3a8b518eb2 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -41,7 +41,7 @@ struct XidCache }; /* - * Flags for PGXACT->vacuumFlags + * Flags for ProcGlobal->vacuumFlags[] */ #define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? */ #define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */ @@ -167,6 +167,9 @@ struct PGPROC bool delayChkpt; /* true if this proc delays checkpoint start */ + uint8 vacuumFlags; /* this backend's vacuum flags, see PROC_* + * above. mirrored in + * ProcGlobal->vacuumFlags[pgxactoff] */ /* * Info to allow us to wait for synchronous replication, if needed. * waitLSN is InvalidXLogRecPtr if not waiting; set only by user backend. @@ -244,7 +247,6 @@ extern PGDLLIMPORT struct PGXACT *MyPgXact; */ typedef struct PGXACT { - uint8 vacuumFlags; /* vacuum-related flags, see above */ bool overflowed; uint8 nxids; @@ -314,6 +316,12 @@ typedef struct PROC_HDR /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ TransactionId *xids; + /* + * Array mirroring PGPROC.vacuumFlags for each PGPROC currently in the + * procarray. 
+ */ + uint8 *vacuumFlags; + /* Length of allProcs array */ uint32 allProcCount; /* Head of list of free PGPROC structures */ From 73487a60fc1063ba4b5178b69aee4ee210c182c4 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 14 Aug 2020 14:30:38 -0700 Subject: [PATCH 299/334] snapshot scalability: Move subxact info to ProcGlobal, remove PGXACT. Similar to the previous changes this increases the chance that data frequently needed by GetSnapshotData() stays in l2 cache. In many workloads subtransactions are very rare, and this makes the check for that considerably cheaper. As this removes the last member of PGXACT, there is no need to keep it around anymore. On a larger 2 socket machine this and the two preceding commits result in a ~1.07x performance increase in read-only pgbench. For read-heavy mixed r/w workloads without row level contention, I see about 1.1x. Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/access/transam/clog.c | 7 +- src/backend/access/transam/twophase.c | 17 ++-- src/backend/access/transam/varsup.c | 15 ++- src/backend/storage/ipc/procarray.c | 128 ++++++++++++++------------ src/backend/storage/lmgr/proc.c | 24 +---- src/include/storage/proc.h | 34 ++++--- src/tools/pgindent/typedefs.list | 1 - 7 files changed, 113 insertions(+), 113 deletions(-) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index a4599e966106..65aa8841f7ce 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -295,7 +295,7 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, */ if (all_xact_same_page && xid == MyProc->xid && nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT && - nsubxids == MyPgXact->nxids && + nsubxids == MyProc->subxidStatus.count && memcmp(subxids, MyProc->subxids.xids, nsubxids * sizeof(TransactionId)) == 0) { @@ -510,16 +510,15 @@ 
TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, while (nextidx != INVALID_PGPROCNO) { PGPROC *proc = &ProcGlobal->allProcs[nextidx]; - PGXACT *pgxact = &ProcGlobal->allPgXact[nextidx]; /* * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs * should not use group XID status update mechanism. */ - Assert(pgxact->nxids <= THRESHOLD_SUBTRANS_CLOG_OPT); + Assert(proc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT); TransactionIdSetPageStatusInternal(proc->clogGroupMemberXid, - pgxact->nxids, + proc->subxidStatus.count, proc->subxids.xids, proc->clogGroupMemberXidStatus, proc->clogGroupMemberLsn, diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 744b8a7f3935..ef4f9981e359 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -21,9 +21,9 @@ * GIDs and aborts the transaction if there already is a global * transaction in prepared state with the same GID. * - * A global transaction (gxact) also has dummy PGXACT and PGPROC; this is - * what keeps the XID considered running by TransactionIdIsInProgress. - * It is also convenient as a PGPROC to hook the gxact's locks to. + * A global transaction (gxact) also has dummy PGPROC; this is what keeps + * the XID considered running by TransactionIdIsInProgress. It is also + * convenient as a PGPROC to hook the gxact's locks to. * * Information to recover prepared transactions in case of crash is * now stored in WAL for the common case. 
In some cases there will be @@ -447,14 +447,12 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid) { PGPROC *proc; - PGXACT *pgxact; int i; Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); Assert(gxact != NULL); proc = &ProcGlobal->allProcs[gxact->pgprocno]; - pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; /* Initialize the PGPROC entry */ MemSet(proc, 0, sizeof(PGPROC)); @@ -480,8 +478,8 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, for (i = 0; i < NUM_LOCK_PARTITIONS; i++) SHMQueueInit(&(proc->myProcLocks[i])); /* subxid data must be filled later by GXactLoadSubxactData */ - pgxact->overflowed = false; - pgxact->nxids = 0; + proc->subxidStatus.overflowed = false; + proc->subxidStatus.count = 0; gxact->prepared_at = prepared_at; gxact->xid = xid; @@ -510,19 +508,18 @@ GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts, TransactionId *children) { PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; - PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; /* We need no extra lock since the GXACT isn't valid yet */ if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS) { - pgxact->overflowed = true; + proc->subxidStatus.overflowed = true; nsubxacts = PGPROC_MAX_CACHED_SUBXIDS; } if (nsubxacts > 0) { memcpy(proc->subxids.xids, children, nsubxacts * sizeof(TransactionId)); - pgxact->nxids = nsubxacts; + proc->subxidStatus.count = nsubxacts; } } diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 4c91b343ecd2..2d2b05be36c4 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -222,22 +222,31 @@ GetNewTransactionId(bool isSubXact) */ if (!isSubXact) { + Assert(ProcGlobal->subxidStates[MyProc->pgxactoff].count == 0); + Assert(!ProcGlobal->subxidStates[MyProc->pgxactoff].overflowed); + Assert(MyProc->subxidStatus.count == 0); + 
Assert(!MyProc->subxidStatus.overflowed); + /* LWLockRelease acts as barrier */ MyProc->xid = xid; ProcGlobal->xids[MyProc->pgxactoff] = xid; } else { - int nxids = MyPgXact->nxids; + XidCacheStatus *substat = &ProcGlobal->subxidStates[MyProc->pgxactoff]; + int nxids = MyProc->subxidStatus.count; + + Assert(substat->count == MyProc->subxidStatus.count); + Assert(substat->overflowed == MyProc->subxidStatus.overflowed); if (nxids < PGPROC_MAX_CACHED_SUBXIDS) { MyProc->subxids.xids[nxids] = xid; pg_write_barrier(); - MyPgXact->nxids = nxids + 1; + MyProc->subxidStatus.count = substat->count = nxids + 1; } else - MyPgXact->overflowed = true; + MyProc->subxidStatus.overflowed = substat->overflowed = true; } LWLockRelease(XidGenLock); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 224da4f9510b..8262abd42e6b 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -4,9 +4,10 @@ * POSTGRES process array code. * * - * This module maintains arrays of the PGPROC and PGXACT structures for all - * active backends. Although there are several uses for this, the principal - * one is as a means of determining the set of currently running transactions. + * This module maintains arrays of PGPROC substructures, as well as associated + * arrays in ProcGlobal, for all active backends. Although there are several + * uses for this, the principal one is as a means of determining the set of + * currently running transactions. * * Because of various subtle race conditions it is critical that a backend * hold the correct locks while setting or clearing its xid (in @@ -85,7 +86,7 @@ typedef struct ProcArrayStruct /* * Highest subxid that has been removed from KnownAssignedXids array to * prevent overflow; or InvalidTransactionId if none. We track this for - * similar reasons to tracking overflowing cached subxids in PGXACT + * similar reasons to tracking overflowing cached subxids in PGPROC * entries. 
Must hold exclusive ProcArrayLock to change this, and shared * lock to read it. */ @@ -96,7 +97,7 @@ typedef struct ProcArrayStruct /* oldest catalog xmin of any replication slot */ TransactionId replication_slot_catalog_xmin; - /* indexes into allPgXact[], has PROCARRAY_MAXPROCS entries */ + /* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */ int pgprocnos[FLEXIBLE_ARRAY_MEMBER]; } ProcArrayStruct; @@ -239,7 +240,6 @@ typedef struct ComputeXidHorizonsResult static ProcArrayStruct *procArray; static PGPROC *allProcs; -static PGXACT *allPgXact; /* * Bookkeeping for tracking emulated transactions in recovery @@ -325,8 +325,7 @@ static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, static TransactionId KnownAssignedXidsGetOldestXmin(void); static void KnownAssignedXidsDisplay(int trace_level); static void KnownAssignedXidsReset(void); -static inline void ProcArrayEndTransactionInternal(PGPROC *proc, - PGXACT *pgxact, TransactionId latestXid); +static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid); static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); static void MaintainLatestCompletedXid(TransactionId latestXid); static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); @@ -411,7 +410,6 @@ CreateSharedProcArray(void) } allProcs = ProcGlobal->allProcs; - allPgXact = ProcGlobal->allPgXact; /* Create or attach to the KnownAssignedXids arrays too, if needed */ if (EnableHotStandby) @@ -476,11 +474,14 @@ ProcArrayAdd(PGPROC *proc) (arrayP->numProcs - index) * sizeof(*arrayP->pgprocnos)); memmove(&ProcGlobal->xids[index + 1], &ProcGlobal->xids[index], (arrayP->numProcs - index) * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->subxidStates[index + 1], &ProcGlobal->subxidStates[index], + (arrayP->numProcs - index) * sizeof(*ProcGlobal->subxidStates)); memmove(&ProcGlobal->vacuumFlags[index + 1], &ProcGlobal->vacuumFlags[index], (arrayP->numProcs - index) * 
sizeof(*ProcGlobal->vacuumFlags)); arrayP->pgprocnos[index] = proc->pgprocno; ProcGlobal->xids[index] = proc->xid; + ProcGlobal->subxidStates[index] = proc->subxidStatus; ProcGlobal->vacuumFlags[index] = proc->vacuumFlags; arrayP->numProcs++; @@ -534,6 +535,8 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) MaintainLatestCompletedXid(latestXid); ProcGlobal->xids[proc->pgxactoff] = 0; + ProcGlobal->subxidStates[proc->pgxactoff].overflowed = false; + ProcGlobal->subxidStates[proc->pgxactoff].count = 0; } else { @@ -542,6 +545,8 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) } Assert(TransactionIdIsValid(ProcGlobal->xids[proc->pgxactoff] == 0)); + Assert(TransactionIdIsValid(ProcGlobal->subxidStates[proc->pgxactoff].count == 0)); + Assert(TransactionIdIsValid(ProcGlobal->subxidStates[proc->pgxactoff].overflowed == false)); ProcGlobal->vacuumFlags[proc->pgxactoff] = 0; for (index = 0; index < arrayP->numProcs; index++) @@ -553,6 +558,8 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) (arrayP->numProcs - index - 1) * sizeof(*arrayP->pgprocnos)); memmove(&ProcGlobal->xids[index], &ProcGlobal->xids[index + 1], (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->subxidStates[index], &ProcGlobal->subxidStates[index + 1], + (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->subxidStates)); memmove(&ProcGlobal->vacuumFlags[index], &ProcGlobal->vacuumFlags[index + 1], (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->vacuumFlags)); @@ -597,8 +604,6 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) { - PGXACT *pgxact = &allPgXact[proc->pgprocno]; - if (TransactionIdIsValid(latestXid)) { /* @@ -616,7 +621,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) */ if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE)) { - ProcArrayEndTransactionInternal(proc, pgxact, latestXid); + ProcArrayEndTransactionInternal(proc, 
latestXid); LWLockRelease(ProcArrayLock); } else @@ -630,15 +635,14 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * estimate of global xmin, but that's OK. */ Assert(!TransactionIdIsValid(proc->xid)); + Assert(proc->subxidStatus.count == 0); + Assert(!proc->subxidStatus.overflowed); proc->lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; proc->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; - Assert(pgxact->nxids == 0); - Assert(pgxact->overflowed == false); - /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ if (proc->vacuumFlags & PROC_VACUUM_STATE_MASK) @@ -659,8 +663,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * We don't do any locking here; caller must handle that. */ static inline void -ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, - TransactionId latestXid) +ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) { size_t pgxactoff = proc->pgxactoff; @@ -683,8 +686,15 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, } /* Clear the subtransaction-XID cache too while holding the lock */ - pgxact->nxids = 0; - pgxact->overflowed = false; + Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count && + ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed); + if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed) + { + ProcGlobal->subxidStates[pgxactoff].count = 0; + ProcGlobal->subxidStates[pgxactoff].overflowed = false; + proc->subxidStatus.count = 0; + proc->subxidStatus.overflowed = false; + } /* Also advance global latestCompletedXid while holding the lock */ MaintainLatestCompletedXid(latestXid); @@ -774,9 +784,8 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) while (nextidx != INVALID_PGPROCNO) { PGPROC *proc = &allProcs[nextidx]; - PGXACT *pgxact = &allPgXact[nextidx]; - 
ProcArrayEndTransactionInternal(proc, pgxact, proc->procArrayGroupMemberXid); + ProcArrayEndTransactionInternal(proc, proc->procArrayGroupMemberXid); /* Move to next proc in list. */ nextidx = pg_atomic_read_u32(&proc->procArrayGroupNext); @@ -820,7 +829,6 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) void ProcArrayClearTransaction(PGPROC *proc) { - PGXACT *pgxact = &allPgXact[proc->pgprocno]; size_t pgxactoff; /* @@ -845,8 +853,15 @@ ProcArrayClearTransaction(PGPROC *proc) Assert(!proc->delayChkpt); /* Clear the subtransaction-XID cache too */ - pgxact->nxids = 0; - pgxact->overflowed = false; + Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count && + ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed); + if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed) + { + ProcGlobal->subxidStates[pgxactoff].count = 0; + ProcGlobal->subxidStates[pgxactoff].overflowed = false; + proc->subxidStatus.count = 0; + proc->subxidStatus.overflowed = false; + } LWLockRelease(ProcArrayLock); } @@ -1267,6 +1282,7 @@ TransactionIdIsInProgress(TransactionId xid) { static TransactionId *xids = NULL; static TransactionId *other_xids; + XidCacheStatus *other_subxidstates; int nxids = 0; ProcArrayStruct *arrayP = procArray; TransactionId topxid; @@ -1329,6 +1345,7 @@ TransactionIdIsInProgress(TransactionId xid) } other_xids = ProcGlobal->xids; + other_subxidstates = ProcGlobal->subxidStates; LWLockAcquire(ProcArrayLock, LW_SHARED); @@ -1351,7 +1368,6 @@ TransactionIdIsInProgress(TransactionId xid) for (size_t pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) { int pgprocno; - PGXACT *pgxact; PGPROC *proc; TransactionId pxid; int pxids; @@ -1386,9 +1402,7 @@ TransactionIdIsInProgress(TransactionId xid) /* * Step 2: check the cached child-Xids arrays */ - pgprocno = arrayP->pgprocnos[pgxactoff]; - pgxact = &allPgXact[pgprocno]; - pxids = pgxact->nxids; + pxids = other_subxidstates[pgxactoff].count; 
pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */ pgprocno = arrayP->pgprocnos[pgxactoff]; proc = &allProcs[pgprocno]; @@ -1412,7 +1426,7 @@ TransactionIdIsInProgress(TransactionId xid) * we hold ProcArrayLock. So we can't miss an Xid that we need to * worry about.) */ - if (pgxact->overflowed) + if (other_subxidstates[pgxactoff].overflowed) xids[nxids++] = pxid; } @@ -2019,6 +2033,7 @@ GetSnapshotData(Snapshot snapshot) size_t numProcs = arrayP->numProcs; TransactionId *xip = snapshot->xip; int *pgprocnos = arrayP->pgprocnos; + XidCacheStatus *subxidStates = ProcGlobal->subxidStates; uint8 *allVacuumFlags = ProcGlobal->vacuumFlags; /* @@ -2095,17 +2110,16 @@ GetSnapshotData(Snapshot snapshot) */ if (!suboverflowed) { - int pgprocno = pgprocnos[pgxactoff]; - PGXACT *pgxact = &allPgXact[pgprocno]; - if (pgxact->overflowed) + if (subxidStates[pgxactoff].overflowed) suboverflowed = true; else { - int nsubxids = pgxact->nxids; + int nsubxids = subxidStates[pgxactoff].count; if (nsubxids > 0) { + int pgprocno = pgprocnos[pgxactoff]; PGPROC *proc = &allProcs[pgprocno]; pg_read_barrier(); /* pairs with GetNewTransactionId */ @@ -2498,8 +2512,6 @@ GetRunningTransactionData(void) */ for (index = 0; index < arrayP->numProcs; index++) { - int pgprocno = arrayP->pgprocnos[index]; - PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ @@ -2520,7 +2532,7 @@ GetRunningTransactionData(void) if (TransactionIdPrecedes(xid, oldestRunningXid)) oldestRunningXid = xid; - if (pgxact->overflowed) + if (ProcGlobal->subxidStates[index].overflowed) suboverflowed = true; /* @@ -2540,27 +2552,28 @@ GetRunningTransactionData(void) */ if (!suboverflowed) { + XidCacheStatus *other_subxidstates = ProcGlobal->subxidStates; + for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; - int nxids; + int nsubxids; /* 
* Save subtransaction XIDs. Other backends can't add or remove * entries while we're holding XidGenLock. */ - nxids = pgxact->nxids; - if (nxids > 0) + nsubxids = other_subxidstates[index].count; + if (nsubxids > 0) { /* barrier not really required, as XidGenLock is held, but ... */ pg_read_barrier(); /* pairs with GetNewTransactionId */ memcpy(&xids[count], (void *) proc->subxids.xids, - nxids * sizeof(TransactionId)); - count += nxids; - subcount += nxids; + nsubxids * sizeof(TransactionId)); + count += nsubxids; + subcount += nsubxids; /* * Top-level XID of a transaction is always less than any of @@ -3627,14 +3640,6 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin, LWLockRelease(ProcArrayLock); } - -#define XidCacheRemove(i) \ - do { \ - MyProc->subxids.xids[i] = MyProc->subxids.xids[MyPgXact->nxids - 1]; \ - pg_write_barrier(); \ - MyPgXact->nxids--; \ - } while (0) - /* * XidCacheRemoveRunningXids * @@ -3650,6 +3655,7 @@ XidCacheRemoveRunningXids(TransactionId xid, { int i, j; + XidCacheStatus *mysubxidstat; Assert(TransactionIdIsValid(xid)); @@ -3667,6 +3673,8 @@ XidCacheRemoveRunningXids(TransactionId xid, */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + mysubxidstat = &ProcGlobal->subxidStates[MyProc->pgxactoff]; + /* * Under normal circumstances xid and xids[] will be in increasing order, * as will be the entries in subxids. Scan backwards to avoid O(N^2) @@ -3676,11 +3684,14 @@ XidCacheRemoveRunningXids(TransactionId xid, { TransactionId anxid = xids[i]; - for (j = MyPgXact->nxids - 1; j >= 0; j--) + for (j = MyProc->subxidStatus.count - 1; j >= 0; j--) { if (TransactionIdEquals(MyProc->subxids.xids[j], anxid)) { - XidCacheRemove(j); + MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1]; + pg_write_barrier(); + mysubxidstat->count--; + MyProc->subxidStatus.count--; break; } } @@ -3692,20 +3703,23 @@ XidCacheRemoveRunningXids(TransactionId xid, * error during AbortSubTransaction. 
So instead of Assert, emit a * debug warning. */ - if (j < 0 && !MyPgXact->overflowed) + if (j < 0 && !MyProc->subxidStatus.overflowed) elog(WARNING, "did not find subXID %u in MyProc", anxid); } - for (j = MyPgXact->nxids - 1; j >= 0; j--) + for (j = MyProc->subxidStatus.count - 1; j >= 0; j--) { if (TransactionIdEquals(MyProc->subxids.xids[j], xid)) { - XidCacheRemove(j); + MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1]; + pg_write_barrier(); + mysubxidstat->count--; + MyProc->subxidStatus.count--; break; } } /* Ordinarily we should have found it, unless the cache has overflowed */ - if (j < 0 && !MyPgXact->overflowed) + if (j < 0 && !MyProc->subxidStatus.overflowed) elog(WARNING, "did not find subXID %u in MyProc", xid); /* Also advance global latestCompletedXid while holding the lock */ diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index f6113b2d2432..aa9fbd80545b 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -63,9 +63,8 @@ int LockTimeout = 0; int IdleInTransactionSessionTimeout = 0; bool log_lock_waits = false; -/* Pointer to this process's PGPROC and PGXACT structs, if any */ +/* Pointer to this process's PGPROC struct, if any */ PGPROC *MyProc = NULL; -PGXACT *MyPgXact = NULL; /* * This spinlock protects the freelist of recycled PGPROC structures. 
@@ -110,10 +109,8 @@ ProcGlobalShmemSize(void) size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC))); size = add_size(size, sizeof(slock_t)); - size = add_size(size, mul_size(MaxBackends, sizeof(PGXACT))); - size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGXACT))); - size = add_size(size, mul_size(max_prepared_xacts, sizeof(PGXACT))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->vacuumFlags))); return size; @@ -161,7 +158,6 @@ void InitProcGlobal(void) { PGPROC *procs; - PGXACT *pgxacts; int i, j; bool found; @@ -202,18 +198,6 @@ InitProcGlobal(void) /* XXX allProcCount isn't really all of them; it excludes prepared xacts */ ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS; - /* - * Also allocate a separate array of PGXACT structures. This is separate - * from the main PGPROC array so that the most heavily accessed data is - * stored contiguously in memory in as few cache lines as possible. This - * provides significant performance benefits, especially on a - * multiprocessor system. There is one PGXACT structure for every PGPROC - * structure. - */ - pgxacts = (PGXACT *) ShmemAlloc(TotalProcs * sizeof(PGXACT)); - MemSet(pgxacts, 0, TotalProcs * sizeof(PGXACT)); - ProcGlobal->allPgXact = pgxacts; - /* * Allocate arrays mirroring PGPROC fields in a dense manner. See * PROC_HDR. 
@@ -224,6 +208,8 @@ InitProcGlobal(void) ProcGlobal->xids = (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids)); MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids)); + ProcGlobal->subxidStates = (XidCacheStatus *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->subxidStates)); + MemSet(ProcGlobal->subxidStates, 0, TotalProcs * sizeof(*ProcGlobal->subxidStates)); ProcGlobal->vacuumFlags = (uint8 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->vacuumFlags)); MemSet(ProcGlobal->vacuumFlags, 0, TotalProcs * sizeof(*ProcGlobal->vacuumFlags)); @@ -372,7 +358,6 @@ InitProcess(void) (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("sorry, too many clients already"))); } - MyPgXact = &ProcGlobal->allPgXact[MyProc->pgprocno]; /* * Cross-check that the PGPROC is of the type we expect; if this were not @@ -569,7 +554,6 @@ InitAuxiliaryProcess(void) ((volatile PGPROC *) auxproc)->pid = MyProcPid; MyProc = auxproc; - MyPgXact = &ProcGlobal->allPgXact[auxproc->pgprocno]; SpinLockRelease(ProcStructLock); diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 9f3a8b518eb2..9c9a50ae457f 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -35,6 +35,14 @@ */ #define PGPROC_MAX_CACHED_SUBXIDS 64 /* XXX guessed-at value */ +typedef struct XidCacheStatus +{ + /* number of cached subxids, never more than PGPROC_MAX_CACHED_SUBXIDS */ + uint8 count; + /* has PGPROC->subxids overflowed */ + bool overflowed; +} XidCacheStatus; + struct XidCache { TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS]; @@ -187,6 +195,8 @@ struct PGPROC */ SHM_QUEUE myProcLocks[NUM_LOCK_PARTITIONS]; + XidCacheStatus subxidStatus; /* mirrored with + * ProcGlobal->subxidStates[i] */ struct XidCache subxids; /* cache for subtransaction XIDs */ /* Support for group XID clearing. 
*/ @@ -235,22 +245,6 @@ struct PGPROC extern PGDLLIMPORT PGPROC *MyProc; -extern PGDLLIMPORT struct PGXACT *MyPgXact; - -/* - * Prior to PostgreSQL 9.2, the fields below were stored as part of the - * PGPROC. However, benchmarking revealed that packing these particular - * members into a separate array as tightly as possible sped up GetSnapshotData - * considerably on systems with many CPU cores, by reducing the number of - * cache lines needing to be fetched. Thus, think very carefully before adding - * anything else here. - */ -typedef struct PGXACT -{ - bool overflowed; - - uint8 nxids; -} PGXACT; /* * There is one ProcGlobal struct for the whole database cluster. @@ -310,12 +304,16 @@ typedef struct PROC_HDR { /* Array of PGPROC structures (not including dummies for prepared txns) */ PGPROC *allProcs; - /* Array of PGXACT structures (not including dummies for prepared txns) */ - PGXACT *allPgXact; /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ TransactionId *xids; + /* + * Array mirroring PGPROC.subxidStatus for each PGPROC currently in the + * procarray. + */ + XidCacheStatus *subxidStates; + /* * Array mirroring PGPROC.vacuumFlags for each PGPROC currently in the * procarray. diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b4948ac675f7..3d990463ce9c 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1536,7 +1536,6 @@ PGSetenvStatusType PGShmemHeader PGTransactionStatusType PGVerbosity -PGXACT PG_Locale_Strategy PG_Lock_Status PG_init_t From 1e7629d2c95ffd290ab0e18d7618ca9d9da94265 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 14 Aug 2020 22:14:03 -0400 Subject: [PATCH 300/334] Be more careful about the shape of hashable subplan clauses. 
nodeSubplan.c expects that the testexpr for a hashable ANY SubPlan has the form of one or more OpExprs whose LHS is an expression of the outer query's, while the RHS is an expression over Params representing output columns of the subquery. However, the planner only went as far as verifying that the clauses were all binary OpExprs. This works 99.99% of the time, because the clauses have the right shape when emitted by the parser --- but it's possible for function inlining to break that, as reported by PegoraroF10. To fix, teach the planner to check that the LHS and RHS contain the right things, or more accurately don't contain the wrong things. Given that this has been broken for years without anyone noticing, it seems sufficient to just give up hashing when it happens, rather than go to the trouble of commuting the clauses back again (which wouldn't necessarily work anyway). While poking at that, I also noticed that nodeSubplan.c had a baked-in assumption that the number of hash clauses is identical to the number of subquery output columns. Again, that's fine as far as parser output goes, but it's not hard to break it via function inlining. There seems little reason for that assumption though --- AFAICS, the only thing it's buying us is not having to store the number of hash clauses explicitly. Adding code to the planner to reject such cases would take more code than getting nodeSubplan.c to cope, so I fixed it that way. This has been broken for as long as we've had hashable SubPlans, so back-patch to all supported branches. 
Discussion: https://postgr.es/m/1549209182255-0.post@n3.nabble.com --- src/backend/executor/nodeSubplan.c | 16 ++--- src/backend/optimizer/plan/subselect.c | 77 ++++++++++++++++++------- src/backend/optimizer/util/clauses.c | 35 +++++++++++ src/include/nodes/execnodes.h | 2 + src/include/optimizer/clauses.h | 1 + src/test/regress/expected/subselect.out | 77 +++++++++++++++++++++++++ src/test/regress/sql/subselect.sql | 41 +++++++++++++ 7 files changed, 219 insertions(+), 30 deletions(-) diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index 38c2fc0b50b6..9a7962518ee6 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -471,7 +471,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) { SubPlan *subplan = node->subplan; PlanState *planstate = node->planstate; - int ncols = list_length(subplan->paramIds); + int ncols = node->numCols; ExprContext *innerecontext = node->innerecontext; MemoryContext oldcontext; long nbuckets; @@ -878,11 +878,6 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) ALLOCSET_SMALL_SIZES); /* and a short-lived exprcontext for function evaluation */ sstate->innerecontext = CreateExprContext(estate); - /* Silly little array of column numbers 1..n */ - ncols = list_length(subplan->paramIds); - sstate->keyColIdx = (AttrNumber *) palloc(ncols * sizeof(AttrNumber)); - for (i = 0; i < ncols; i++) - sstate->keyColIdx[i] = i + 1; /* * We use ExecProject to evaluate the lefthand and righthand @@ -914,13 +909,15 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) (int) nodeTag(subplan->testexpr)); oplist = NIL; /* keep compiler quiet */ } - Assert(list_length(oplist) == ncols); + ncols = list_length(oplist); lefttlist = righttlist = NIL; + sstate->numCols = ncols; + sstate->keyColIdx = (AttrNumber *) palloc(ncols * sizeof(AttrNumber)); sstate->tab_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid)); + sstate->tab_collations = (Oid *) palloc(ncols * sizeof(Oid)); 
sstate->tab_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); sstate->tab_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); - sstate->tab_collations = (Oid *) palloc(ncols * sizeof(Oid)); sstate->lhs_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); sstate->cur_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); /* we'll need the cross-type equality fns below, but not in sstate */ @@ -979,6 +976,9 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) /* Set collation */ sstate->tab_collations[i - 1] = opexpr->inputcollid; + /* keyColIdx is just column numbers 1..n */ + sstate->keyColIdx[i - 1] = i; + i++; } diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 9a8f738c9d05..6eb794669fe3 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -69,7 +69,7 @@ typedef struct inline_cte_walker_context static Node *build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, List *plan_params, SubLinkType subLinkType, int subLinkId, - Node *testexpr, bool adjust_testexpr, + Node *testexpr, List *testexpr_paramids, bool unknownEqFalse); static List *generate_subquery_params(PlannerInfo *root, List *tlist, List **paramIds); @@ -81,7 +81,8 @@ static Node *convert_testexpr(PlannerInfo *root, static Node *convert_testexpr_mutator(Node *node, convert_testexpr_context *context); static bool subplan_is_hashable(Plan *plan); -static bool testexpr_is_hashable(Node *testexpr); +static bool testexpr_is_hashable(Node *testexpr, List *param_ids); +static bool test_opexpr_is_hashable(OpExpr *testexpr, List *param_ids); static bool hash_ok_operator(OpExpr *expr); static bool contain_dml(Node *node); static bool contain_dml_walker(Node *node, void *context); @@ -237,7 +238,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery, /* And convert to SubPlan or InitPlan format. 
*/ result = build_subplan(root, plan, subroot, plan_params, subLinkType, subLinkId, - testexpr, true, isTopQual); + testexpr, NIL, isTopQual); /* * If it's a correlated EXISTS with an unimportant targetlist, we might be @@ -291,12 +292,11 @@ make_subplan(PlannerInfo *root, Query *orig_subquery, plan_params, ANY_SUBLINK, 0, newtestexpr, - false, true)); + paramIds, + true)); /* Check we got what we expected */ Assert(hashplan->parParam == NIL); Assert(hashplan->useHashTable); - /* build_subplan won't have filled in paramIds */ - hashplan->paramIds = paramIds; /* Leave it to the executor to decide which plan to use */ asplan = makeNode(AlternativeSubPlan); @@ -319,7 +319,7 @@ static Node * build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, List *plan_params, SubLinkType subLinkType, int subLinkId, - Node *testexpr, bool adjust_testexpr, + Node *testexpr, List *testexpr_paramids, bool unknownEqFalse) { Node *result; @@ -484,10 +484,10 @@ build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, else { /* - * Adjust the Params in the testexpr, unless caller said it's not - * needed. + * Adjust the Params in the testexpr, unless caller already took care + * of it (as indicated by passing a list of Param IDs). 
*/ - if (testexpr && adjust_testexpr) + if (testexpr && testexpr_paramids == NIL) { List *params; @@ -499,7 +499,10 @@ build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, params); } else + { splan->testexpr = testexpr; + splan->paramIds = testexpr_paramids; + } /* * We can't convert subplans of ALL_SUBLINK or ANY_SUBLINK types to @@ -511,7 +514,7 @@ build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, if (subLinkType == ANY_SUBLINK && splan->parParam == NIL && subplan_is_hashable(plan) && - testexpr_is_hashable(splan->testexpr)) + testexpr_is_hashable(splan->testexpr, splan->paramIds)) splan->useHashTable = true; /* @@ -734,24 +737,20 @@ subplan_is_hashable(Plan *plan) /* * testexpr_is_hashable: is an ANY SubLink's test expression hashable? + * + * To identify LHS vs RHS of the hash expression, we must be given the + * list of output Param IDs of the SubLink's subquery. */ static bool -testexpr_is_hashable(Node *testexpr) +testexpr_is_hashable(Node *testexpr, List *param_ids) { /* * The testexpr must be a single OpExpr, or an AND-clause containing only - * OpExprs. - * - * The combining operators must be hashable and strict. The need for - * hashability is obvious, since we want to use hashing. Without - * strictness, behavior in the presence of nulls is too unpredictable. We - * actually must assume even more than plain strictness: they can't yield - * NULL for non-null inputs, either (see nodeSubplan.c). However, hash - * indexes and hash joins assume that too. + * OpExprs, each of which satisfy test_opexpr_is_hashable(). 
*/ if (testexpr && IsA(testexpr, OpExpr)) { - if (hash_ok_operator((OpExpr *) testexpr)) + if (test_opexpr_is_hashable((OpExpr *) testexpr, param_ids)) return true; } else if (is_andclause(testexpr)) @@ -764,7 +763,7 @@ testexpr_is_hashable(Node *testexpr) if (!IsA(andarg, OpExpr)) return false; - if (!hash_ok_operator((OpExpr *) andarg)) + if (!test_opexpr_is_hashable((OpExpr *) andarg, param_ids)) return false; } return true; @@ -773,6 +772,40 @@ testexpr_is_hashable(Node *testexpr) return false; } +static bool +test_opexpr_is_hashable(OpExpr *testexpr, List *param_ids) +{ + /* + * The combining operator must be hashable and strict. The need for + * hashability is obvious, since we want to use hashing. Without + * strictness, behavior in the presence of nulls is too unpredictable. We + * actually must assume even more than plain strictness: it can't yield + * NULL for non-null inputs, either (see nodeSubplan.c). However, hash + * indexes and hash joins assume that too. + */ + if (!hash_ok_operator(testexpr)) + return false; + + /* + * The left and right inputs must belong to the outer and inner queries + * respectively; hence Params that will be supplied by the subquery must + * not appear in the LHS, and Vars of the outer query must not appear in + * the RHS. (Ordinarily, this must be true because of the way that the + * parser builds an ANY SubLink's testexpr ... but inlining of functions + * could have changed the expression's structure, so we have to check. + * Such cases do not occur often enough to be worth trying to optimize, so + * we don't worry about trying to commute the clause or anything like + * that; we just need to be sure not to build an invalid plan.) 
+ */ + if (list_length(testexpr->args) != 2) + return false; + if (contain_exec_param((Node *) linitial(testexpr->args), param_ids)) + return false; + if (contain_var_clause((Node *) lsecond(testexpr->args))) + return false; + return true; +} + /* * Check expression is hashable + strict * diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index e04b14407236..7105d0a2db9a 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -108,6 +108,7 @@ static bool contain_volatile_functions_not_nextval_walker(Node *node, void *cont static bool max_parallel_hazard_walker(Node *node, max_parallel_hazard_context *context); static bool contain_nonstrict_functions_walker(Node *node, void *context); +static bool contain_exec_param_walker(Node *node, List *param_ids); static bool contain_context_dependent_node(Node *clause); static bool contain_context_dependent_node_walker(Node *node, int *flags); static bool contain_leaked_vars_walker(Node *node, void *context); @@ -1221,6 +1222,40 @@ contain_nonstrict_functions_walker(Node *node, void *context) context); } +/***************************************************************************** + * Check clauses for Params + *****************************************************************************/ + +/* + * contain_exec_param + * Recursively search for PARAM_EXEC Params within a clause. + * + * Returns true if the clause contains any PARAM_EXEC Param with a paramid + * appearing in the given list of Param IDs. Does not descend into + * subqueries! 
+ */ +bool +contain_exec_param(Node *clause, List *param_ids) +{ + return contain_exec_param_walker(clause, param_ids); +} + +static bool +contain_exec_param_walker(Node *node, List *param_ids) +{ + if (node == NULL) + return false; + if (IsA(node, Param)) + { + Param *p = (Param *) node; + + if (p->paramkind == PARAM_EXEC && + list_member_int(param_ids, p->paramid)) + return true; + } + return expression_tree_walker(node, contain_exec_param_walker, param_ids); +} + /***************************************************************************** * Check clauses for context-dependent nodes *****************************************************************************/ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index cf832d7f9097..0b42dd6f9441 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -867,6 +867,8 @@ typedef struct SubPlanState MemoryContext hashtablecxt; /* memory context containing hash tables */ MemoryContext hashtempcxt; /* temp memory context for hash tables */ ExprContext *innerecontext; /* econtext for computing inner tuples */ + int numCols; /* number of columns being hashed */ + /* each of the remaining fields is an array of length numCols: */ AttrNumber *keyColIdx; /* control data for hash tables */ Oid *tab_eq_funcoids; /* equality func oids for table * datatype(s) */ diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index b7456e3e595b..7ef8cce79eec 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -38,6 +38,7 @@ extern bool contain_subplans(Node *clause); extern char max_parallel_hazard(Query *parse); extern bool is_parallel_safe(PlannerInfo *root, Node *node); extern bool contain_nonstrict_functions(Node *clause); +extern bool contain_exec_param(Node *clause, List *param_ids); extern bool contain_leaked_vars(Node *clause); extern Relids find_nonnullable_rels(Node *clause); diff --git 
a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 1c5d80da323e..b81923f2e741 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -757,6 +757,7 @@ insert into outer_text values ('a', null); insert into outer_text values ('b', null); create temp table inner_text (c1 text, c2 text); insert into inner_text values ('a', null); +insert into inner_text values ('123', '456'); select * from outer_text where (f1, f2) not in (select * from inner_text); f1 | f2 ----+---- @@ -797,6 +798,82 @@ select '1'::text in (select '1'::name union all select '1'::name); t (1 row) +-- +-- Test that we don't try to use a hashed subplan if the simplified +-- testexpr isn't of the right shape +-- +-- this fails by default, of course +select * from int8_tbl where q1 in (select c1 from inner_text); +ERROR: operator does not exist: bigint = text +LINE 1: select * from int8_tbl where q1 in (select c1 from inner_tex... + ^ +HINT: No operator matches the given name and argument types. You might need to add explicit type casts. 
+begin; +-- make an operator to allow it to succeed +create function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $1::text = $2'; +create operator = (procedure=bogus_int8_text_eq, leftarg=int8, rightarg=text); +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); + QUERY PLAN +-------------------------------- + Seq Scan on int8_tbl + Filter: (hashed SubPlan 1) + SubPlan 1 + -> Seq Scan on inner_text +(4 rows) + +select * from int8_tbl where q1 in (select c1 from inner_text); + q1 | q2 +-----+------------------ + 123 | 456 + 123 | 4567890123456789 +(2 rows) + +-- inlining of this function results in unusual number of hash clauses, +-- which we can still cope with +create or replace function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $1::text = $2 and $1::text = $2'; +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); + QUERY PLAN +-------------------------------- + Seq Scan on int8_tbl + Filter: (hashed SubPlan 1) + SubPlan 1 + -> Seq Scan on inner_text +(4 rows) + +select * from int8_tbl where q1 in (select c1 from inner_text); + q1 | q2 +-----+------------------ + 123 | 456 + 123 | 4567890123456789 +(2 rows) + +-- inlining of this function causes LHS and RHS to be switched, +-- which we can't cope with, so hashing should be abandoned +create or replace function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $2 = $1::text'; +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); + QUERY PLAN +-------------------------------------- + Seq Scan on int8_tbl + Filter: (SubPlan 1) + SubPlan 1 + -> Materialize + -> Seq Scan on inner_text +(5 rows) + +select * from int8_tbl where q1 in (select c1 from inner_text); + q1 | q2 +-----+------------------ + 123 | 456 + 123 | 4567890123456789 +(2 rows) + +rollback; -- to get rid of the bogus operator -- -- Test case for planner bug with nested EXISTS 
handling -- diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index a56057bd4fad..cce8ebdb3d9f 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -449,6 +449,7 @@ insert into outer_text values ('b', null); create temp table inner_text (c1 text, c2 text); insert into inner_text values ('a', null); +insert into inner_text values ('123', '456'); select * from outer_text where (f1, f2) not in (select * from inner_text); @@ -468,6 +469,46 @@ select 'foo'::text in (select 'bar'::name union all select 'bar'::name); select '1'::text in (select '1'::name union all select '1'::name); +-- +-- Test that we don't try to use a hashed subplan if the simplified +-- testexpr isn't of the right shape +-- + +-- this fails by default, of course +select * from int8_tbl where q1 in (select c1 from inner_text); + +begin; + +-- make an operator to allow it to succeed +create function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $1::text = $2'; + +create operator = (procedure=bogus_int8_text_eq, leftarg=int8, rightarg=text); + +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); +select * from int8_tbl where q1 in (select c1 from inner_text); + +-- inlining of this function results in unusual number of hash clauses, +-- which we can still cope with +create or replace function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $1::text = $2 and $1::text = $2'; + +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); +select * from int8_tbl where q1 in (select c1 from inner_text); + +-- inlining of this function causes LHS and RHS to be switched, +-- which we can't cope with, so hashing should be abandoned +create or replace function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $2 = $1::text'; + +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); 
+select * from int8_tbl where q1 in (select c1 from inner_text); + +rollback; -- to get rid of the bogus operator + -- -- Test case for planner bug with nested EXISTS handling -- From b48cac3b10a02fea2bed684469dd4d36a6616405 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Sat, 15 Aug 2020 08:34:48 +0530 Subject: [PATCH 301/334] Mark a few logical decoding related variables with PGDLLIMPORT. Commit 7259736a6e added two variables CheckXidAlive and bsysscan to detect concurrent aborts and used these in inline functions that are part of the API that can be used by extensions. So it is better to mark them with PGDLLIMPORT. Reported-by: Thomas Munro Discussion: https://postgr.es/m/CA+hUKGJ7+HYupd=Pz9+QrXa-C_YnbC4rAYu8V+=OKg=UgdzMeg@mail.gmail.com --- src/include/access/xact.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/include/access/xact.h b/src/include/access/xact.h index c18554bae2c2..c59de9bebaf8 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -82,8 +82,8 @@ typedef enum extern int synchronous_commit; /* used during logical streaming of a transaction */ -extern TransactionId CheckXidAlive; -extern bool bsysscan; +extern PGDLLIMPORT TransactionId CheckXidAlive; +extern PGDLLIMPORT bool bsysscan; /* * Miscellaneous flag bits to record events which occur on the top level From bacda6a327efb820d0e9f3262b81e803b2d5702b Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sat, 15 Aug 2020 11:23:18 +0200 Subject: [PATCH 302/334] Remove obsolete HAVE_BUGGY_SOLARIS_STRTOD Fixed more than 10 years ago. 
Reviewed-by: Noah Misch Discussion: https://www.postgresql.org/message-id/flat/aa266ede-baaa-f4e6-06cf-5b1737610e9a%402ndquadrant.com --- src/backend/utils/adt/float.c | 24 ------------------------ src/include/port/solaris.h | 12 ------------ 2 files changed, 36 deletions(-) diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c index ffd1ce8c7610..429c9280c0cf 100644 --- a/src/backend/utils/adt/float.c +++ b/src/backend/utils/adt/float.c @@ -271,18 +271,6 @@ float4in(PG_FUNCTION_ARGS) errmsg("invalid input syntax for type %s: \"%s\"", "real", orig_num))); } -#ifdef HAVE_BUGGY_SOLARIS_STRTOD - else - { - /* - * Many versions of Solaris have a bug wherein strtod sets endptr to - * point one byte beyond the end of the string when given "inf" or - * "infinity". - */ - if (endptr != num && endptr[-1] == '\0') - endptr--; - } -#endif /* HAVE_BUGGY_SOLARIS_STRTOD */ /* skip trailing whitespace */ while (*endptr != '\0' && isspace((unsigned char) *endptr)) @@ -499,18 +487,6 @@ float8in_internal_opt_error(char *num, char **endptr_p, type_name, orig_string))), have_error); } -#ifdef HAVE_BUGGY_SOLARIS_STRTOD - else - { - /* - * Many versions of Solaris have a bug wherein strtod sets endptr to - * point one byte beyond the end of the string when given "inf" or - * "infinity". - */ - if (endptr != num && endptr[-1] == '\0') - endptr--; - } -#endif /* HAVE_BUGGY_SOLARIS_STRTOD */ /* skip trailing whitespace */ while (*endptr != '\0' && isspace((unsigned char) *endptr)) diff --git a/src/include/port/solaris.h b/src/include/port/solaris.h index eeb1a320bd5b..e63a3bd824d6 100644 --- a/src/include/port/solaris.h +++ b/src/include/port/solaris.h @@ -24,15 +24,3 @@ #if defined(__i386__) #include #endif - -/* - * Many versions of Solaris have broken strtod() --- see bug #4751182. 
- * This has been fixed in current versions of Solaris: - * - * http://sunsolve.sun.com/search/document.do?assetkey=1-21-108993-62-1&searchclause=108993-62 - * http://sunsolve.sun.com/search/document.do?assetkey=1-21-112874-34-1&searchclause=112874-34 - * - * However, many people might not have patched versions, so - * still use our own fix for the buggy version. - */ -#define HAVE_BUGGY_SOLARIS_STRTOD From 53095b5fe650270118bc2ab77416d08e19472cd3 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sat, 15 Aug 2020 11:23:18 +0200 Subject: [PATCH 303/334] Remove obsolete cygwin.h hack The version being checked for is 20 years old. Reviewed-by: Marco Atzeri Discussion: https://www.postgresql.org/message-id/flat/aa266ede-baaa-f4e6-06cf-5b1737610e9a%402ndquadrant.com --- src/include/port/cygwin.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/include/port/cygwin.h b/src/include/port/cygwin.h index f1fc1a93d76c..64d69936e5e0 100644 --- a/src/include/port/cygwin.h +++ b/src/include/port/cygwin.h @@ -1,14 +1,5 @@ /* src/include/port/cygwin.h */ -#include - -/* - * Check for b20.1 and disable AF_UNIX family socket support. - */ -#if CYGWIN_VERSION_DLL_MAJOR < 1001 -#undef HAVE_UNIX_SOCKETS -#endif - #ifdef BUILDING_DLL #define PGDLLIMPORT __declspec (dllexport) #else From d4d443b3bbbb3eb9cdc511564ef3c57fde7dd3ac Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 15 Aug 2020 12:04:19 -0400 Subject: [PATCH 304/334] Remove no-longer-usable hstore--1.0--1.1.sql update script. Since commit 865f14a2d made "=>" unusable as an operator name, it's been impossible either to install hstore 1.0 or to execute this update script. There's not much point in continuing to ship it. 
Discussion: https://postgr.es/m/653936.1597431032@sss.pgh.pa.us --- contrib/hstore/Makefile | 2 +- contrib/hstore/hstore--1.0--1.1.sql | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 contrib/hstore/hstore--1.0--1.1.sql diff --git a/contrib/hstore/Makefile b/contrib/hstore/Makefile index 872ca03cd1fb..72376d900763 100644 --- a/contrib/hstore/Makefile +++ b/contrib/hstore/Makefile @@ -15,7 +15,7 @@ DATA = hstore--1.4.sql \ hstore--1.5--1.6.sql \ hstore--1.4--1.5.sql \ hstore--1.3--1.4.sql hstore--1.2--1.3.sql \ - hstore--1.1--1.2.sql hstore--1.0--1.1.sql + hstore--1.1--1.2.sql PGFILEDESC = "hstore - key/value pair data type" HEADERS = hstore.h diff --git a/contrib/hstore/hstore--1.0--1.1.sql b/contrib/hstore/hstore--1.0--1.1.sql deleted file mode 100644 index 4e32a575c5f6..000000000000 --- a/contrib/hstore/hstore--1.0--1.1.sql +++ /dev/null @@ -1,7 +0,0 @@ -/* contrib/hstore/hstore--1.0--1.1.sql */ - --- complain if script is sourced in psql, rather than via ALTER EXTENSION -\echo Use "ALTER EXTENSION hstore UPDATE TO '1.1'" to load this file. \quit - -ALTER EXTENSION hstore DROP OPERATOR => (text, text); -DROP OPERATOR => (text, text); From 566372b3d6435639e4cc4476d79b8505a0297c87 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Sat, 15 Aug 2020 10:15:53 -0700 Subject: [PATCH 305/334] Prevent concurrent SimpleLruTruncate() for any given SLRU. The SimpleLruTruncate() header comment states the new coding rule. To achieve this, add locktype "frozenid" and two LWLocks. This closes a rare opportunity for data loss, which manifested as "apparent wraparound" or "could not access status of transaction" errors. Data loss is more likely in pg_multixact, due to released branches' thin margin between multiStopLimit and multiWrapLimit. If a user's physical replication primary logged ": apparent wraparound" messages, the user should rebuild standbys of that primary regardless of symptoms. 
At less risk is a cluster having emitted "not accepting commands" errors or "must be vacuumed" warnings at some point. One can test a cluster for this data loss by running VACUUM FREEZE in every database. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/20190218073103.GA1434723@rfd.leadboat.com --- doc/src/sgml/catalogs.sgml | 4 ++- doc/src/sgml/monitoring.sgml | 16 ++++++++++ src/backend/access/transam/slru.c | 8 +++++ src/backend/access/transam/subtrans.c | 4 +-- src/backend/commands/async.c | 37 +++++++++++++++++------- src/backend/commands/vacuum.c | 13 +++++++++ src/backend/storage/lmgr/lmgr.c | 20 +++++++++++++ src/backend/storage/lmgr/lwlocknames.txt | 3 ++ src/backend/utils/adt/lockfuncs.c | 12 ++++++++ src/include/storage/lmgr.h | 3 ++ src/include/storage/lock.h | 10 +++++++ 11 files changed, 117 insertions(+), 13 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 26fda20d1939..fc329c5cff96 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -10226,7 +10226,8 @@ SCRAM-SHA-256$<iteration count>:&l and general database objects (identified by class OID and object OID, in the same way as in pg_description or pg_depend). Also, the right to extend a - relation is represented as a separate lockable object. + relation is represented as a separate lockable object, as is the right to + update pg_database.datfrozenxid. Also, advisory locks can be taken on numbers that have user-defined meanings. @@ -10254,6 +10255,7 @@ SCRAM-SHA-256$<iteration count>:&l Type of the lockable object: relation, extend, + frozenid, page, tuple, transactionid, diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 7dcddf478a11..304c49f07b76 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1742,6 +1742,12 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser extend Waiting to extend a relation. 
+ + frozenid + Waiting to + update pg_database.datfrozenxid + and pg_database.datminmxid. + object Waiting to acquire a lock on a non-relation database object. @@ -1910,6 +1916,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser NotifyQueue Waiting to read or update NOTIFY messages. + + NotifyQueueTail + Waiting to update limit on NOTIFY message + storage. + NotifySLRU Waiting to access the NOTIFY message SLRU @@ -2086,6 +2097,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser WALWrite Waiting for WAL buffers to be written to disk. + + WrapLimitsVacuum + Waiting to update limits on transaction id and multixact + consumption. + XactBuffer Waiting for I/O on a transaction status SLRU buffer. diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index d1dbb43e096c..7640f153c227 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -1191,6 +1191,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) /* * Remove all segments before the one holding the passed page number + * + * All SLRUs prevent concurrent calls to this function, either with an LWLock + * or by calling it only as part of a checkpoint. Mutual exclusion must begin + * before computing cutoffPage. Mutual exclusion must end after any limit + * update that would permit other backends to write fresh data into the + * segment immediately preceding the one containing cutoffPage. Otherwise, + * when the SLRU is quite full, SimpleLruTruncate() might delete that segment + * after it has accrued freshly-written data. 
*/ void SimpleLruTruncate(SlruCtl ctl, int cutoffPage) diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index a087a5554210..a50f60b99af2 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -349,8 +349,8 @@ ExtendSUBTRANS(TransactionId newestXact) /* * Remove all SUBTRANS segments before the one holding the passed transaction ID * - * This is normally called during checkpoint, with oldestXact being the - * oldest TransactionXmin of any running transaction. + * oldestXact is the oldest TransactionXmin of any running transaction. This + * is called only during checkpoint. */ void TruncateSUBTRANS(TransactionId oldestXact) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 71b7577afc06..4c1286eb988e 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -244,19 +244,22 @@ typedef struct QueueBackendStatus /* * Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff) * - * The AsyncQueueControl structure is protected by the NotifyQueueLock. + * The AsyncQueueControl structure is protected by the NotifyQueueLock and + * NotifyQueueTailLock. * - * When holding the lock in SHARED mode, backends may only inspect their own - * entries as well as the head and tail pointers. Consequently we can allow a - * backend to update its own record while holding only SHARED lock (since no - * other backend will inspect it). + * When holding NotifyQueueLock in SHARED mode, backends may only inspect + * their own entries as well as the head and tail pointers. Consequently we + * can allow a backend to update its own record while holding only SHARED lock + * (since no other backend will inspect it). * - * When holding the lock in EXCLUSIVE mode, backends can inspect the entries - * of other backends and also change the head and tail pointers. 
+ * When holding NotifyQueueLock in EXCLUSIVE mode, backends can inspect the + * entries of other backends and also change the head pointer. When holding + * both NotifyQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends + * can change the tail pointer. * * NotifySLRULock is used as the control lock for the pg_notify SLRU buffers. - * In order to avoid deadlocks, whenever we need both locks, we always first - * get NotifyQueueLock and then NotifySLRULock. + * In order to avoid deadlocks, whenever we need multiple locks, we first get + * NotifyQueueTailLock, then NotifyQueueLock, and lastly NotifySLRULock. * * Each backend uses the backend[] array entry with index equal to its * BackendId (which can range from 1 to MaxBackends). We rely on this to make @@ -2177,6 +2180,10 @@ asyncQueueAdvanceTail(void) int newtailpage; int boundary; + /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */ + LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE); + + /* Compute the new tail. */ LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE); min = QUEUE_HEAD; for (BackendId i = QUEUE_FIRST_LISTENER; i > 0; i = QUEUE_NEXT_LISTENER(i)) @@ -2185,7 +2192,6 @@ asyncQueueAdvanceTail(void) min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i)); } oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL); - QUEUE_TAIL = min; LWLockRelease(NotifyQueueLock); /* @@ -2205,6 +2211,17 @@ asyncQueueAdvanceTail(void) */ SimpleLruTruncate(NotifyCtl, newtailpage); } + + /* + * Advertise the new tail. This changes asyncQueueIsFull()'s verdict for + * the segment immediately prior to the new tail, allowing fresh data into + * that segment. 
+ */ + LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE); + QUEUE_TAIL = min; + LWLockRelease(NotifyQueueLock); + + LWLockRelease(NotifyQueueTailLock); } /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index aba13c31d1bc..5189a5ad5e37 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1361,6 +1361,14 @@ vac_update_datfrozenxid(void) bool bogus = false; bool dirty = false; + /* + * Restrict this task to one backend per database. This avoids race + * conditions that would move datfrozenxid or datminmxid backward. It + * avoids calling vac_truncate_clog() with a datfrozenxid preceding a + * datfrozenxid passed to an earlier vac_truncate_clog() call. + */ + LockDatabaseFrozenIds(ExclusiveLock); + /* * Initialize the "min" calculation with * GetOldestNonRemovableTransactionId(), which is a reasonable @@ -1551,6 +1559,9 @@ vac_truncate_clog(TransactionId frozenXID, bool bogus = false; bool frozenAlreadyWrapped = false; + /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */ + LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE); + /* init oldest datoids to sync with my frozenXID/minMulti values */ oldestxid_datoid = MyDatabaseId; minmulti_datoid = MyDatabaseId; @@ -1660,6 +1671,8 @@ vac_truncate_clog(TransactionId frozenXID, */ SetTransactionIdLimit(frozenXID, oldestxid_datoid); SetMultiXactIdLimit(minMulti, minmulti_datoid, false); + + LWLockRelease(WrapLimitsVacuumLock); } diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index 20103200952e..7409de940592 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -460,6 +460,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode) LockRelease(&tag, lockmode, false); } +/* + * LockDatabaseFrozenIds + * + * This allows one backend per database to execute vac_update_datfrozenxid(). 
+ */ +void +LockDatabaseFrozenIds(LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId); + + (void) LockAcquire(&tag, lockmode, false, false); +} + /* * LockPage * @@ -1098,6 +1113,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag) tag->locktag_field2, tag->locktag_field1); break; + case LOCKTAG_DATABASE_FROZEN_IDS: + appendStringInfo(buf, + _("pg_database.datfrozenxid of database %u"), + tag->locktag_field1); + break; case LOCKTAG_PAGE: appendStringInfo(buf, _("page %u of relation %u of database %u"), diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index e6985e8eedfb..774292fd9427 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -50,3 +50,6 @@ MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 LogicalRepWorkerLock 43 XactTruncationLock 44 +# 45 was XactTruncationLock until removal of BackendRandomLock +WrapLimitsVacuumLock 46 +NotifyQueueTailLock 47 diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index e992d1bbfced..f592292d067b 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -29,6 +29,7 @@ const char *const LockTagTypeNames[] = { "relation", "extend", + "frozenid", "page", "tuple", "transactionid", @@ -254,6 +255,17 @@ pg_lock_status(PG_FUNCTION_ARGS) nulls[8] = true; nulls[9] = true; break; + case LOCKTAG_DATABASE_FROZEN_IDS: + values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + nulls[7] = true; + nulls[8] = true; + nulls[9] = true; + break; case LOCKTAG_PAGE: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index 3acc11aa5a3b..f7cabcbbf550 100644 --- a/src/include/storage/lmgr.h +++ 
b/src/include/storage/lmgr.h @@ -59,6 +59,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode); extern int RelationExtensionLockWaiterCount(Relation relation); +/* Lock to recompute pg_database.datfrozenxid in the current database */ +extern void LockDatabaseFrozenIds(LOCKMODE lockmode); + /* Lock a page (currently only used within indexes) */ extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index fdabf427210a..1c3e9c1999f5 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -138,6 +138,7 @@ typedef enum LockTagType { LOCKTAG_RELATION, /* whole relation */ LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ + LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */ LOCKTAG_PAGE, /* one page of a relation */ LOCKTAG_TUPLE, /* one physical tuple */ LOCKTAG_TRANSACTION, /* transaction (for waiting for xact done) */ @@ -194,6 +195,15 @@ typedef struct LOCKTAG (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) +/* ID info for frozen IDs is DB OID */ +#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = 0, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + /* ID info for a page is RELATION info + BlockNumber */ #define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \ ((locktag).locktag_field1 = (dboid), \ From db659a3416b967d716806e558efbb9d1ec610cd1 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 15 Aug 2020 15:43:34 -0400 Subject: [PATCH 306/334] Doc: various improvements for pg_basebackup reference page. 
Put the -r option in the right section (it certainly isn't an option controlling "the location and format of the output"). Clarify the behavior of the tablespace and waldir options (that part per gripe from robert@interactive.co.uk). Make a large number of small copy-editing fixes in text that visibly wasn't written by native speakers, and try to avoid grammatical inconsistencies between the descriptions of the different options. Back-patch to v13, since HEAD hasn't meaningfully diverged yet. Discussion: https://postgr.es/m/159749418850.14322.216503677134569752@wrigleys.postgresql.org --- doc/src/sgml/ref/pg_basebackup.sgml | 324 +++++++++++++++------------- 1 file changed, 171 insertions(+), 153 deletions(-) diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml index e246efbdb520..aa0b27c9f300 100644 --- a/doc/src/sgml/ref/pg_basebackup.sgml +++ b/doc/src/sgml/ref/pg_basebackup.sgml @@ -29,51 +29,51 @@ PostgreSQL documentation Description - pg_basebackup is used to take base backups of - a running PostgreSQL database cluster. These - are taken without affecting other clients to the database, and can be used + pg_basebackup is used to take a base backup of + a running PostgreSQL database cluster. The backup + is taken without affecting other clients of the database, and can be used both for point-in-time recovery (see ) - and as the starting point for a log shipping or streaming replication standby - servers (see ). + and as the starting point for a log-shipping or streaming-replication standby + server (see ). - pg_basebackup makes a binary copy of the database - cluster files, while making sure the system is put in and + pg_basebackup makes an exact copy of the database + cluster's files, while making sure the server is put into and out of backup mode automatically. Backups are always taken of the entire database cluster; it is not possible to back up individual databases or - database objects. 
For individual database backups, a tool such as + database objects. For selective backups, another tool such as must be used. The backup is made over a regular PostgreSQL - connection, and uses the replication protocol. The connection must be made - with a user having REPLICATION permissions - (see ) or a superuser, - and pg_hba.conf must explicitly permit the replication - connection. The server must also be configured - with set high enough to leave at least - one session available for the backup and one for WAL streaming (if used). + connection that uses the replication protocol. The connection must be made + with a user ID that has REPLICATION permissions + (see ) or is a superuser, + and pg_hba.conf + must permit the replication connection. The server must also be configured + with set high enough to provide at + least one walsender for the backup plus one for WAL streaming (if used). - There can be multiple pg_basebackups running at the same time, but it is + There can be multiple pg_basebackups running at the same time, but it is usually better from a performance point of view to take only one backup, and copy the result. pg_basebackup can make a base backup from - not only the primary but also the standby. To take a backup from the standby, + not only a primary server but also a standby. To take a backup from a standby, set up the standby so that it can accept replication connections (that is, set max_wal_senders and , - and configure host-based authentication). + and configure its pg_hba.conf appropriately). You will also need to enable on the primary. - Note that there are some limitations in an online backup from the standby: + Note that there are some limitations in taking a backup from a standby: @@ -89,7 +89,7 @@ PostgreSQL documentation - If the standby is promoted to the primary during online backup, the backup fails. + If the standby is promoted to be primary during backup, the backup fails. 
@@ -105,7 +105,7 @@ PostgreSQL documentation Whenever pg_basebackup is taking a base - backup, the pg_stat_progress_basebackup + backup, the server's pg_stat_progress_basebackup view will report the progress of the backup. See for details. @@ -116,7 +116,7 @@ PostgreSQL documentation The following command-line options control the location and format of the - output. + output: @@ -124,15 +124,15 @@ PostgreSQL documentation - Directory to write the output to. - pg_basebackup will create the directory and - any parent directories if necessary. The directory may already exist, - but it is an error if the directory already exists and is not empty. + Sets the target directory to write the output to. + pg_basebackup will create this directory + (and any missing parent directories) if it does not exist. If it + already exists, it must be empty. - When the backup is in tar mode, and the directory is specified as - - (dash), the tar file will be written to - stdout. + When the backup is in tar format, the target directory may be + specified as - (dash), causing the tar file to be + written to stdout. This option is required. @@ -155,12 +155,12 @@ PostgreSQL documentation Write the output as plain files, with the same layout as the - current data directory and tablespaces. When the cluster has + source server's data directory and tablespaces. When the cluster has no additional tablespaces, the whole database will be placed in the target directory. If the cluster contains additional tablespaces, the main data directory will be placed in the target directory, but all other tablespaces will be placed - in the same absolute path as they have on the server. + in the same absolute path as they have on the source server. This is the default format. @@ -174,15 +174,15 @@ PostgreSQL documentation Write the output as tar files in the target directory. 
The main - data directory will be written to a file named - base.tar, and all other tablespaces will - be named after the tablespace OID. - + data directory's contents will be written to a file named + base.tar, and each other tablespace will be + written to a separate tar file named after that tablespace's OID. + - If the value - (dash) is specified as - target directory, the tar contents will be written to - standard output, suitable for piping to for example - gzip. This is only possible if + If the target directory is specified as - + (dash), the tar contents will be written to + standard output, suitable for piping to (for example) + gzip. This is only allowed if the cluster has no additional tablespaces and WAL streaming is not used. @@ -192,40 +192,22 @@ PostgreSQL documentation - - - - - - The maximum transfer rate of data transferred from the server. Values are - in kilobytes per second. Use a suffix of M to indicate megabytes - per second. A suffix of k is also accepted, and has no effect. - Valid values are between 32 kilobytes per second and 1024 megabytes per second. - - - The purpose is to limit the impact of pg_basebackup - on the running server. - - - This option always affects transfer of the data directory. Transfer of - WAL files is only affected if the collection method is fetch. - - - - - Create standby.signal and append connection settings - to postgresql.auto.conf in the output - directory (or into the base archive file when using tar format) to - ease setting up a standby server. + Creates a standby.signal file and appends + connection settings to the postgresql.auto.conf + file in the target directory (or within the base archive file when + using tar format). This eases setting up a standby server using the + results of the backup. 
+ + The postgresql.auto.conf file will record the connection settings and, if specified, the replication slot - that pg_basebackup is using, so that the + that pg_basebackup is using, so that streaming replication will use the same settings later on. @@ -237,17 +219,21 @@ PostgreSQL documentation - Relocate the tablespace in directory olddir + Relocates the tablespace in directory olddir to newdir during the backup. To be effective, olddir must exactly match the - path specification of the tablespace as it is currently defined. (But - it is not an error if there is no tablespace - in olddir contained in the backup.) + path specification of the tablespace as it is defined on the source + server. (But it is not an error if there is no tablespace + in olddir on the source server.) + Meanwhile newdir is a directory in the + receiving host's filesystem. As with the main target directory, + newdir need not exist already, but if + it does exist it must be empty. Both olddir - and newdir must be absolute paths. If a - path happens to contain a = sign, escape it with a - backslash. This option can be specified multiple times for multiple - tablespaces. See examples below. + and newdir must be absolute paths. If + either path needs to contain an equal sign (=), + precede that with a backslash. This option can be specified multiple + times for multiple tablespaces. @@ -263,10 +249,16 @@ PostgreSQL documentation - Specifies the location for the write-ahead log directory. + Sets the directory to write WAL (write-ahead log) files to. + By default WAL files will be placed in + the pg_wal subdirectory of the target + directory, but this option can be used to place them elsewhere. waldir must be an absolute path. - The write-ahead log directory can only be specified when - the backup is in plain mode. + As with the main target directory, + waldir need not exist already, but if + it does exist it must be empty. 
+ This option can only be specified when + the backup is in plain format. @@ -276,16 +268,16 @@ PostgreSQL documentation - Includes the required write-ahead log files (WAL files) in the + Includes the required WAL (write-ahead log) files in the backup. This will include all write-ahead logs generated during the backup. Unless the method none is specified, - it is possible to start a postmaster directly in the extracted + it is possible to start a postmaster in the target directory without the need to consult the log archive, thus - making this a completely standalone backup. + making the output a completely standalone backup. - The following methods for collecting the write-ahead logs are - supported: + The following methods for collecting the + write-ahead logs are supported: @@ -293,7 +285,7 @@ PostgreSQL documentation none - Don't include write-ahead log in the backup. + Don't include write-ahead logs in the backup. @@ -304,15 +296,16 @@ PostgreSQL documentation The write-ahead log files are collected at the end of the backup. - Therefore, it is necessary for the + Therefore, it is necessary for the source server's parameter to be set high - enough that the log is not removed before the end of the backup. - If the log has been rotated when it's time to transfer it, the - backup will fail and be unusable. + enough that the required log data is not removed before the end + of the backup. If the required log data has been recycled + before it's time to transfer it, the backup will fail and be + unusable. - When tar format mode is used, the write-ahead log files will be - written to the base.tar file. + When tar format is used, the write-ahead log files will be + included in the base.tar file. @@ -322,16 +315,16 @@ PostgreSQL documentation stream - Stream the write-ahead log while the backup is created. This will - open a second connection to the server and start streaming the - write-ahead log in parallel while running the backup. 
Therefore, - it will use up two connections configured by the - parameter. As long as the - client can keep up with write-ahead log received, using this mode - requires no extra write-ahead logs to be saved on the primary. + Stream write-ahead log data while the backup is being taken. + This method will open a second connection to the server and + start streaming the write-ahead log in parallel while running + the backup. Therefore, it will require two replication + connections not just one. As long as the client can keep up + with the write-ahead log data, using this method requires no + extra write-ahead logs to be saved on the source server. - When tar format mode is used, the write-ahead log files will be + When tar format is used, the write-ahead log files will be written to a separate file named pg_wal.tar (if the server is a version earlier than 10, the file will be named pg_xlog.tar). @@ -375,7 +368,7 @@ PostgreSQL documentation The following command-line options control the generation of the - backup and the running of the program. + backup and the running of the program: @@ -383,7 +376,8 @@ PostgreSQL documentation - Sets checkpoint mode to fast (immediate) or spread (default) (see ). + Sets checkpoint mode to fast (immediate) or spread (the default) + (see ). @@ -393,9 +387,9 @@ PostgreSQL documentation - This option causes creation of a replication slot named by the - --slot option before starting the backup. - An error is raised if the slot already exists. + Specifies that the replication slot named by the + --slot option should be created before starting + the backup. An error is raised if the slot already exists. @@ -418,9 +412,9 @@ PostgreSQL documentation By default, when pg_basebackup aborts with an error, it removes any directories it might have created before - discovering that it cannot finish the job (for example, data directory - and write-ahead log directory). This option inhibits tidying-up and is - thus useful for debugging. 
+ discovering that it cannot finish the job (for example, the target + directory and write-ahead log directory). This option inhibits + tidying-up and is thus useful for debugging. @@ -460,19 +454,41 @@ PostgreSQL documentation + + + + + + Sets the maximum transfer rate at which data is collected from the + source server. This can be useful to limit the impact + of pg_basebackup on the server. Values + are in kilobytes per second. Use a suffix of M + to indicate megabytes per second. A suffix of k + is also accepted, and has no effect. Valid values are between 32 + kilobytes per second and 1024 megabytes per second. + + + This option always affects transfer of the data directory. Transfer of + WAL files is only affected if the collection method + is fetch. + + + + This option can only be used together with -X - stream. It causes the WAL streaming to use the specified + stream. It causes WAL streaming to use the specified replication slot. If the base backup is intended to be used as a - streaming replication standby using replication slots, it should then - use the same replication slot name - in . That way, it is ensured that - the server does not remove any necessary WAL data in the time between - the end of the base backup and the start of streaming replication. + streaming-replication standby using a replication slot, the standby + should then use the same replication slot name as + . This ensures that the + primary server does not remove any necessary WAL data in the time + between the end of the base backup and the start of streaming + replication on the new standby. 
The specified replication slot has to exist unless the @@ -522,15 +538,15 @@ PostgreSQL documentation Using a SHA hash function provides a cryptographically secure digest of each file for users who wish to verify that the backup has not been - tampered with, while the CRC32C algorithm provides a checksum which is - much faster to calculate and good at catching errors due to accidental + tampered with, while the CRC32C algorithm provides a checksum that is + much faster to calculate; it is good at catching errors due to accidental changes but is not resistant to targeted modifications. Note that, to be useful against an adversary who has access to the backup, the backup manifest would need to be stored securely elsewhere or otherwise verified not to have been modified since the backup was taken. - can be used to check the + can be used to check the integrity of a backup against the backup manifest. @@ -552,11 +568,11 @@ PostgreSQL documentation - This option prevents the server from estimating the total + Prevents the server from estimating the total amount of backup data that will be streamed, resulting in the - backup_total column in the - pg_stat_progress_basebackup - to be NULL. + backup_total column in the + pg_stat_progress_basebackup view + always being NULL. Without this option, the backup will start by enumerating @@ -578,7 +594,7 @@ PostgreSQL documentation Disables generation of a backup manifest. If this option is not specified, the server will generate and send a backup manifest - which can be verified using . + which can be verified using . The manifest is a list of every file present in the backup with the exception of any WAL files that may be included. It also stores the size, last modification time, and an optional checksum for each file. @@ -590,16 +606,17 @@ PostgreSQL documentation - This option prevents the creation of a temporary replication slot - during the backup even if it's supported by the server. 
+ Prevents the creation of a temporary replication slot + for the backup. - Temporary replication slots are created by default if no slot name - is given with the option when using log streaming. + By default, if log streaming is selected but no slot name is given + with the option, then a temporary replication + slot is created (if supported by the source server). The main purpose of this option is to allow taking a base backup when - the server is out of free replication slots. Using replication slots + the server has no free replication slots. Using a replication slot is almost always preferred, because it prevents needed WAL from being removed by the server during the backup. @@ -617,7 +634,7 @@ PostgreSQL documentation By default, checksums are verified and checksum failures will result in a non-zero exit status. However, the base backup will not be removed in such a case, as if the option - had been used. Checksum verifications failures will also be reported + had been used. Checksum verification failures will also be reported in the pg_stat_database view. @@ -627,7 +644,8 @@ PostgreSQL documentation - The following command-line options control the database connection parameters. + The following command-line options control the connection to the source + server: @@ -641,7 +659,7 @@ PostgreSQL documentation The option is called --dbname for consistency with other client applications, but because pg_basebackup - doesn't connect to any particular database in the cluster, database + doesn't connect to any particular database in the cluster, any database name in the connection string will be ignored. @@ -654,7 +672,7 @@ PostgreSQL documentation Specifies the host name of the machine on which the server is running. If the value begins with a slash, it is used as the - directory for the Unix domain socket. The default is taken + directory for a Unix domain socket. 
The default is taken from the PGHOST environment variable, if set, else a Unix domain socket connection is attempted. @@ -679,11 +697,12 @@ PostgreSQL documentation - Specifies the number of seconds between status packets sent back to the - server. This allows for easier monitoring of the progress from server. - A value of zero disables the periodic status updates completely, + Specifies the number of seconds between status packets sent back to + the source server. Smaller values allow more accurate monitoring of + backup progress from the server. + A value of zero disables periodic status updates completely, although an update will still be sent when requested by the server, to - avoid timeout disconnect. The default value is 10 seconds. + avoid timeout-based disconnects. The default value is 10 seconds. @@ -693,7 +712,7 @@ PostgreSQL documentation - User name to connect as. + Specifies the user name to connect as. @@ -703,7 +722,7 @@ PostgreSQL documentation - Never issue a password prompt. If the server requires + Prevents issuing a password prompt. If the server requires password authentication and a password is not available by other means such as a .pgpass file, the connection attempt will fail. This option can be useful in @@ -718,8 +737,8 @@ PostgreSQL documentation - Force pg_basebackup to prompt for a - password before connecting to a database. + Forces pg_basebackup to prompt for a + password before connecting to the source server. @@ -745,7 +764,7 @@ PostgreSQL documentation - Print the pg_basebackup version and exit. + Prints the pg_basebackup version and exits. @@ -755,8 +774,8 @@ PostgreSQL documentation - Show help about pg_basebackup command line - arguments, and exit. + Shows help about pg_basebackup command line + arguments, and exits. @@ -787,11 +806,10 @@ PostgreSQL documentation Notes - At the beginning of the backup, a checkpoint needs to be written on the - server the backup is taken from. 
Especially if the option - --checkpoint=fast is not used, this can take some time - during which pg_basebackup will be appear - to be idle. + At the beginning of the backup, a checkpoint needs to be performed on the + source server. This can take some time (especially if the option + --checkpoint=fast is not used), during + which pg_basebackup will appear to be idle. @@ -806,8 +824,8 @@ PostgreSQL documentation - Tablespaces will in plain format by default be backed up to the same path - they have on the server, unless the + In plain format, tablespaces will be backed up to the same path + they have on the source server, unless the option --tablespace-mapping is used. Without this option, running a plain format base backup on the same host as the server will not work if tablespaces are in use, because the backup would @@ -816,8 +834,9 @@ PostgreSQL documentation - When tar format mode is used, it is the user's responsibility to unpack each - tar file before starting the PostgreSQL server. If there are additional tablespaces, the + When tar format is used, it is the user's responsibility to unpack each + tar file before starting a PostgreSQL server that uses the data. If there + are additional tablespaces, the tar files for them need to be unpacked in the correct locations. In this case the symbolic links for those tablespaces will be created by the server according to the contents of the tablespace_map file that is @@ -827,15 +846,14 @@ PostgreSQL documentation pg_basebackup works with servers of the same or an older major version, down to 9.1. However, WAL streaming mode (-X - stream) only works with server version 9.3 and later, and tar format mode - (--format=tar) of the current version only works with server version 9.5 - or later. + stream) only works with server version 9.3 and later, and tar format + (--format=tar) only works with server version 9.5 + and later. 
- pg_basebackup will preserve group permissions in - both the plain and tar formats if group - permissions are enabled on the source cluster. + pg_basebackup will preserve group permissions + for data files if group permissions are enabled on the source cluster. From 676a9c3cc4b5f1d262c29de318868948513f0fa0 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Sat, 15 Aug 2020 20:21:52 -0700 Subject: [PATCH 307/334] Correct several behavior descriptions in comments. Reuse cautionary language from src/test/ssl/README in src/test/kerberos/README. SLRUs have had access to six-character segments names since commit 73c986adde5d73a5e2555da9b5c8facedb146dcd, and recovery stopped calling HeapTupleHeaderAdvanceLatestRemovedXid() in commit 558a9165e081d1936573e5a7d576f5febd7fb55a. The other corrections are more self-evident. --- src/backend/access/heap/heapam.c | 2 -- src/backend/access/transam/README | 11 +++++------ src/backend/access/transam/varsup.c | 13 +++++++------ src/backend/commands/async.c | 11 ++++------- src/backend/commands/vacuum.c | 10 +++++----- src/backend/storage/buffer/bufmgr.c | 2 +- src/bin/pg_waldump/pg_waldump.c | 11 +++-------- src/include/access/xlog_internal.h | 7 ++----- src/test/kerberos/README | 10 ++++++---- src/test/perl/PostgresNode.pm | 6 ++---- .../recovery/t/010_logical_decoding_timelines.pl | 2 +- src/test/ssl/t/SSLServer.pm | 1 - 12 files changed, 36 insertions(+), 50 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index f75e1cf0e7b0..9b5f417eac44 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -6920,8 +6920,6 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, * updated/deleted by the inserting transaction. * * Look for a committed hint bit, or if no xmin bit is set, check clog. - * This needs to work on both primary and standby, where it is used to - * assess btree delete records. 
*/ if (HeapTupleHeaderXminCommitted(tuple) || (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin))) diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index c5f09667ba15..1edc8180c128 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -635,12 +635,11 @@ be reconstructed later following a crash and the action is simply a way of optimising for performance. When a hint is written we use MarkBufferDirtyHint() to mark the block dirty. -If the buffer is clean and checksums are in use then -MarkBufferDirtyHint() inserts an XLOG_FPI record to ensure that we -take a full page image that includes the hint. We do this to avoid -a partial page write, when we write the dirtied page. WAL is not -written during recovery, so we simply skip dirtying blocks because -of hints when in recovery. +If the buffer is clean and checksums are in use then MarkBufferDirtyHint() +inserts an XLOG_FPI_FOR_HINT record to ensure that we take a full page image +that includes the hint. We do this to avoid a partial page write, when we +write the dirtied page. WAL is not written during recovery, so we simply skip +dirtying blocks because of hints when in recovery. If you do decide to optimise away a WAL record, then any calls to MarkBufferDirty() must be replaced by MarkBufferDirtyHint(), diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 2d2b05be36c4..a4944faa32e3 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -367,12 +367,13 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) * We'll refuse to continue assigning XIDs in interactive mode once we get * within 3M transactions of data loss. This leaves lots of room for the * DBA to fool around fixing things in a standalone backend, while not - * being significant compared to total XID space. 
(Note that since - * vacuuming requires one transaction per table cleaned, we had better be - * sure there's lots of XIDs left...) Also, at default BLCKSZ, this - * leaves two completely-idle segments. In the event of edge-case bugs - * involving page or segment arithmetic, idle segments render the bugs - * unreachable outside of single-user mode. + * being significant compared to total XID space. (VACUUM requires an XID + * if it truncates at wal_level!=minimal. "VACUUM (ANALYZE)", which a DBA + * might do by reflex, assigns an XID. Hence, we had better be sure + * there's lots of XIDs left...) Also, at default BLCKSZ, this leaves two + * completely-idle segments. In the event of edge-case bugs involving + * page or segment arithmetic, idle segments render the bugs unreachable + * outside of single-user mode. */ xidStopLimit = xidWrapLimit - 3000000; if (xidStopLimit < FirstNormalTransactionId) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 4c1286eb988e..774b26fd2c4d 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -302,13 +302,10 @@ static SlruCtlData NotifyCtlData; #define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */ /* - * slru.c currently assumes that all filenames are four characters of hex - * digits. That means that we can use segments 0000 through FFFF. - * Each segment contains SLRU_PAGES_PER_SEGMENT pages which gives us - * the pages from 0 to SLRU_PAGES_PER_SEGMENT * 0x10000 - 1. - * - * It's of course possible to enhance slru.c, but this gives us so much - * space already that it doesn't seem worth the trouble. + * Use segments 0000 through FFFF. Each contains SLRU_PAGES_PER_SEGMENT pages + * which gives us the pages from 0 to SLRU_PAGES_PER_SEGMENT * 0x10000 - 1. + * We could use as many segments as SlruScanDirectory() allows, but this gives + * us so much space already that it doesn't seem worth the trouble. 
* * The most data we can have in the queue at a time is QUEUE_MAX_PAGE/2 * pages, because more than that would confuse slru.c into thinking there diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 5189a5ad5e37..23eb605d4cb2 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -949,11 +949,11 @@ vacuum_set_xid_limits(Relation rel, /* * We can always ignore processes running lazy vacuum. This is because we * use these values only for deciding which tuples we must keep in the - * tables. Since lazy vacuum doesn't write its XID anywhere, it's safe to - * ignore it. In theory it could be problematic to ignore lazy vacuums in - * a full vacuum, but keep in mind that only one vacuum process can be - * working on a particular table at any time, and that each vacuum is - * always an independent transaction. + * tables. Since lazy vacuum doesn't write its XID anywhere (usually no + * XID assigned), it's safe to ignore it. In theory it could be + * problematic to ignore lazy vacuums in a full vacuum, but keep in mind + * that only one vacuum process can be working on a particular table at + * any time, and that each vacuum is always an independent transaction. */ *oldestXmin = GetOldestNonRemovableTransactionId(rel); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f1ae6f9f8443..a2a963bd5b41 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3578,7 +3578,7 @@ IncrBufferRefCount(Buffer buffer) * This is essentially the same as MarkBufferDirty, except: * * 1. The caller does not write WAL; so if checksums are enabled, we may need - * to write an XLOG_FPI WAL record to protect against torn pages. + * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. * 2. The caller might have only share-lock instead of exclusive-lock on the * buffer's content lock. * 3. 
This function does not guarantee that the buffer is always marked dirty diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index d1a067893539..31e99c2a6da5 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -611,14 +611,9 @@ XLogDumpDisplayStats(XLogDumpConfig *config, XLogDumpStats *stats) double rec_len_pct, fpi_len_pct; - /* --- - * Make a first pass to calculate column totals: - * count(*), - * sum(xl_len+SizeOfXLogRecord), - * sum(xl_tot_len-xl_len-SizeOfXLogRecord), and - * sum(xl_tot_len). - * These are used to calculate percentages for each record type. - * --- + /* + * Each row shows its percentages of the total, so make a first pass to + * calculate column totals. */ for (ri = 0; ri < RM_NEXT_ID; ri++) diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 9b2da56379e1..4146753d4765 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -43,11 +43,8 @@ typedef struct XLogPageHeaderData /* * When there is not enough space on current page for whole record, we * continue on the next page. xlp_rem_len is the number of bytes - * remaining from a previous page. - * - * Note that xlp_rem_len includes backup-block data; that is, it tracks - * xl_tot_len not xl_len in the initial header. Also note that the - * continuation data isn't necessarily aligned. + * remaining from a previous page; it tracks xl_tot_len in the initial + * header. Note that the continuation data isn't necessarily aligned. */ uint32 xlp_rem_len; /* total len of remaining data for record */ } XLogPageHeaderData; diff --git a/src/test/kerberos/README b/src/test/kerberos/README index 93af72e16367..fa9c03e78291 100644 --- a/src/test/kerberos/README +++ b/src/test/kerberos/README @@ -8,10 +8,12 @@ functionality. This requires a full MIT Kerberos installation, including server and client tools, and is therefore kept separate and not run by default. 
-Also, this test suite creates a KDC server that listens for TCP/IP -connections on localhost without any real access control, so it is not -safe to run this on a system where there might be untrusted local -users. +CAUTION: The test server run by this test is configured to listen for TCP +connections on localhost. Any user on the same host is able to log in to the +test server while the tests are running. Do not run this suite on a multi-user +system where you don't trust all local users! Also, this test suite creates a +KDC server that listens for TCP/IP connections on localhost without any real +access control. Running the tests ================= diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm index 8c1b77376fb0..1488bffa2ba3 100644 --- a/src/test/perl/PostgresNode.pm +++ b/src/test/perl/PostgresNode.pm @@ -1234,10 +1234,8 @@ sub can_bind return $ret; } -# Automatically shut down any still-running nodes when the test script exits. -# Note that this just stops the postmasters (in the same order the nodes were -# created in). Any temporary directories are deleted, in an unspecified -# order, later when the File::Temp objects are destroyed. +# Automatically shut down any still-running nodes (in the same order the nodes +# were created in) when the test script exits. 
END { diff --git a/src/test/recovery/t/010_logical_decoding_timelines.pl b/src/test/recovery/t/010_logical_decoding_timelines.pl index 09aaefa9f032..329500f0ae5b 100644 --- a/src/test/recovery/t/010_logical_decoding_timelines.pl +++ b/src/test/recovery/t/010_logical_decoding_timelines.pl @@ -111,7 +111,7 @@ # Examine the physical slot the replica uses to stream changes # from the primary to make sure its hot_standby_feedback # has locked in a catalog_xmin on the physical slot, and that -# any xmin is < the catalog_xmin +# any xmin is >= the catalog_xmin $node_primary->poll_query_until( 'postgres', q[ SELECT catalog_xmin IS NOT NULL diff --git a/src/test/ssl/t/SSLServer.pm b/src/test/ssl/t/SSLServer.pm index 1e392b8fbf61..f5987a003efd 100644 --- a/src/test/ssl/t/SSLServer.pm +++ b/src/test/ssl/t/SSLServer.pm @@ -9,7 +9,6 @@ # - a database called trustdb that lets anyone in # - another database called certdb that uses certificate authentication, ie. # the client must present a valid certificate signed by the client CA -# - two users, called ssltestuser and anotheruser. # # The server is configured to only accept connections from localhost. If you # want to run the client from another host, you'll have to configure that From 49967da65aec970fcda123acc681f1df5d70bfc6 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 16 Aug 2020 12:57:01 -0700 Subject: [PATCH 308/334] Make vacuum a bit more verbose to debug BF failure. This is temporary. While possibly some more error checking / debugging in this path would be a good thing, it'll not look exactly like this. 
Discussion: https://postgr.es/m/20200816181604.l54m6kss5ntd6xow@alap3.anarazel.de --- src/backend/access/heap/heapam.c | 11 ++++++++++- src/backend/access/heap/vacuumlazy.c | 7 +++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 9b5f417eac44..8eb276e46449 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -6048,7 +6048,16 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, TransactionIdIsInProgress(members[i].xid)) { /* running locker cannot possibly be older than the cutoff */ - Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid)); + if (TransactionIdPrecedes(members[i].xid, cutoff_xid)) + { + /* temporary on-bf debugging */ + elog(PANIC, "too old alive locker: multi: %u, member xid: %u, memb-current: %d, memb-progress: %d, cutoff: %u, cutoff-multi: %u, relfrozenxid: %u, relminmxid: %u", + multi, members[i].xid, + TransactionIdIsCurrentTransactionId(members[i].xid), + TransactionIdIsInProgress(members[i].xid), + cutoff_xid, cutoff_multi, + relfrozenxid, relminmxid); + } newmembers[nnewmembers++] = members[i]; has_lockers = true; } diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 44e2224dd557..03c8e1ff7ea9 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1350,7 +1350,14 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple) || params->index_cleanup == VACOPT_TERNARY_DISABLED) + { + /* temporary on-bf debugging */ + elog(LOG, "treating dead HOT tuple (updated %d, heap only: %d, index cleanup: %d) as alive", + HeapTupleIsHotUpdated(&tuple), HeapTupleIsHeapOnly(&tuple), + params->index_cleanup == VACOPT_TERNARY_DISABLED); + nkeep += 1; + } else tupgone = true; /* we can delete the tuple */ all_visible = false; From 
f6661d3df228dbbf50efb04f2b760774a6f2bfff Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 16 Aug 2020 14:21:37 -0700 Subject: [PATCH 309/334] Fix use of wrong index in ComputeXidHorizons(). This bug, recently introduced in 941697c3c1a, at least lead to vacuum failing because it found tuples inserted by a running transaction, but below the freeze limit. The freeze limit in turn is directly affected by the aforementioned bug. Thanks to Tom Lane figuring how to make the bug reproducible. We should add a few more assertions to make sure this type of bug isn't as hard to notice, but it's not yet clear how to best do so. Co-Diagnosed-By: Tom Lane Author: Andres Freund Discussion: https://postgr.es/m/1013484.1597609043@sss.pgh.pa.us --- src/backend/storage/ipc/procarray.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 8262abd42e6b..96e4a8785760 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1663,7 +1663,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) TransactionId xmin; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(other_xids[pgprocno]); + xid = UINT32_ACCESS_ONCE(other_xids[index]); xmin = UINT32_ACCESS_ONCE(proc->xmin); /* From b4f16397af460d9d6ead31b86cb3e7f562806866 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 17 Aug 2020 10:23:17 +0900 Subject: [PATCH 310/334] doc: Fix description about bgwriter and checkpoint in HA section Since 806a2ae, the work of the bgwriter is split the checkpointer, but a portion of the documentation did not get the message. 
Author: Masahiko Sawada Discussion: https://postgr.es/m/CA+fd4k6jXxjAtjMVC=wG3=QGpauZBtcgN3Jhw+oV7zXGKVLKzQ@mail.gmail.com Backpatch-through: 9.5 --- doc/src/sgml/high-availability.sgml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml index a824d383f2d8..d6f79fc435ea 100644 --- a/doc/src/sgml/high-availability.sgml +++ b/doc/src/sgml/high-availability.sgml @@ -2380,9 +2380,10 @@ LOG: database system is ready to accept read only connections - The background writer is active during recovery and will perform - restartpoints (similar to checkpoints on the primary) and normal block - cleaning activities. This can include updates of the hint bit + The checkpointer process and the background writer process are active during + recovery. The checkpointer process will perform restartpoints (similar to + checkpoints on the primary) and the background writer process will perform + normal block cleaning activities. This can include updates of the hint bit information stored on the standby server. The CHECKPOINT command is accepted during recovery, though it performs a restartpoint rather than a new checkpoint. From d7ec8337f9093b097f08f94e5ecec36303ad73fd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Aug 2020 09:27:29 +0300 Subject: [PATCH 311/334] Fix printing last progress report line in client programs. A number of client programs have a "--progress" option that when printing to a TTY, updates the current line by printing a '\r' and overwriting it. After the last line, '\n' needs to be printed to move the cursor to the next line. pg_basebackup and pgbench got this right, but pg_rewind and pg_checksums were slightly wrong. pg_rewind printed the newline to stdout instead of stderr, and pg_checksums printed the newline even when not printing to a TTY. 
Fix them, and also add a 'finished' argument to pg_basebackup's progress_report() function, to keep it consistent with the other programs. Backpatch to v12. pg_rewind's newline was broken with the logging changes in commit cc8d415117 in v12, and pg_checksums was introduced in v12. Discussion: https://www.postgresql.org/message-id/82b539e5-ae33-34b0-1aee-22b3379fd3eb@iki.fi --- src/bin/pg_basebackup/pg_basebackup.c | 38 ++++++++++++++------------- src/bin/pg_checksums/pg_checksums.c | 14 +++++----- src/bin/pg_rewind/pg_rewind.c | 22 +++++++++------- src/bin/pg_rewind/pg_rewind.h | 2 +- 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 4f29671d0cdc..8158c8e41957 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -188,7 +188,8 @@ static PQExpBuffer recoveryconfcontents = NULL; /* Function headers */ static void usage(void); static void verify_dir_is_empty_or_create(char *dirname, bool *created, bool *found); -static void progress_report(int tablespacenum, const char *filename, bool force); +static void progress_report(int tablespacenum, const char *filename, bool force, + bool finished); static void ReceiveTarFile(PGconn *conn, PGresult *res, int rownum); static void ReceiveTarCopyChunk(size_t r, char *copybuf, void *callback_data); @@ -765,11 +766,15 @@ verify_dir_is_empty_or_create(char *dirname, bool *created, bool *found) * Print a progress report based on the global variables. If verbose output * is enabled, also print the current file name. * - * Progress report is written at maximum once per second, unless the - * force parameter is set to true. + * Progress report is written at maximum once per second, unless the force + * parameter is set to true. + * + * If finished is set to true, this is the last progress report. The cursor + * is moved to the next line. 
*/ static void -progress_report(int tablespacenum, const char *filename, bool force) +progress_report(int tablespacenum, const char *filename, + bool force, bool finished) { int percent; char totaldone_str[32]; @@ -780,7 +785,7 @@ progress_report(int tablespacenum, const char *filename, bool force) return; now = time(NULL); - if (now == last_progress_report && !force) + if (now == last_progress_report && !force && !finished) return; /* Max once per second */ last_progress_report = now; @@ -851,10 +856,11 @@ progress_report(int tablespacenum, const char *filename, bool force) totaldone_str, totalsize_str, percent, tablespacenum, tablespacecount); - if (isatty(fileno(stderr))) - fprintf(stderr, "\r"); - else - fprintf(stderr, "\n"); + /* + * Stay on the same line if reporting to a terminal and we're not done + * yet. + */ + fprintf(stderr, (!finished && isatty(fileno(stderr))) ? "\r" : "\n"); } static int32 @@ -1277,7 +1283,7 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum) } } - progress_report(rownum, state.filename, true); + progress_report(rownum, state.filename, true, false); /* * Do not sync the resulting tar file yet, all files are synced once at @@ -1470,7 +1476,7 @@ ReceiveTarCopyChunk(size_t r, char *copybuf, void *callback_data) } } totaldone += r; - progress_report(state->tablespacenum, state->filename, false); + progress_report(state->tablespacenum, state->filename, false, false); } @@ -1528,7 +1534,7 @@ ReceiveAndUnpackTarFile(PGconn *conn, PGresult *res, int rownum) if (state.file) fclose(state.file); - progress_report(rownum, state.filename, true); + progress_report(rownum, state.filename, true, false); if (state.file != NULL) { @@ -1709,7 +1715,7 @@ ReceiveTarAndUnpackCopyChunk(size_t r, char *copybuf, void *callback_data) exit(1); } totaldone += r; - progress_report(state->tablespacenum, state->filename, false); + progress_report(state->tablespacenum, state->filename, false, false); state->current_len_left -= r; if (state->current_len_left 
== 0 && state->current_padding == 0) @@ -2027,11 +2033,7 @@ BaseBackup(void) ReceiveBackupManifest(conn); if (showprogress) - { - progress_report(PQntuples(res), NULL, true); - if (isatty(fileno(stderr))) - fprintf(stderr, "\n"); /* Need to move to next line */ - } + progress_report(PQntuples(res), NULL, true, true); PQclear(res); diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 1daa5aed0e0f..0696db69bbd5 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -125,7 +125,7 @@ static const struct exclude_list_item skip[] = { * src/bin/pg_basebackup/pg_basebackup.c. */ static void -progress_report(bool force) +progress_report(bool finished) { int percent; char total_size_str[32]; @@ -135,7 +135,7 @@ progress_report(bool force) Assert(showprogress); now = time(NULL); - if (now == last_progress_report && !force) + if (now == last_progress_report && !finished) return; /* Max once per second */ /* Save current time */ @@ -162,8 +162,11 @@ progress_report(bool force) (int) strlen(current_size_str), current_size_str, total_size_str, percent); - /* Stay on the same line if reporting to a terminal */ - fprintf(stderr, isatty(fileno(stderr)) ? "\r" : "\n"); + /* + * Stay on the same line if reporting to a terminal and we're not done + * yet. + */ + fprintf(stderr, (!finished && isatty(fileno(stderr))) ? 
"\r" : "\n"); } static bool @@ -624,10 +627,7 @@ main(int argc, char *argv[]) (void) scan_directory(DataDir, "pg_tblspc", false); if (showprogress) - { progress_report(true); - fprintf(stderr, "\n"); /* Need to move to next line */ - } printf(_("Checksum operation completed\n")); printf(_("Files scanned: %s\n"), psprintf(INT64_FORMAT, files)); diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 0015d3b461a7..a9aecc790528 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -422,7 +422,6 @@ main(int argc, char **argv) executeFileMap(); progress_report(true); - printf("\n"); if (showprogress) pg_log_info("creating backup label and updating control file"); @@ -519,11 +518,14 @@ sanityChecks(void) /* * Print a progress report based on the fetch_size and fetch_done variables. * - * Progress report is written at maximum once per second, unless the - * force parameter is set to true. + * Progress report is written at maximum once per second, except that the + * last progress report is always printed. + * + * If finished is set to true, this is the last progress report. The cursor + * is moved to the next line. */ void -progress_report(bool force) +progress_report(bool finished) { static pg_time_t last_progress_report = 0; int percent; @@ -535,7 +537,7 @@ progress_report(bool force) return; now = time(NULL); - if (now == last_progress_report && !force) + if (now == last_progress_report && !finished) return; /* Max once per second */ last_progress_report = now; @@ -565,10 +567,12 @@ progress_report(bool force) fprintf(stderr, _("%*s/%s kB (%d%%) copied"), (int) strlen(fetch_size_str), fetch_done_str, fetch_size_str, percent); - if (isatty(fileno(stderr))) - fprintf(stderr, "\r"); - else - fprintf(stderr, "\n"); + + /* + * Stay on the same line if reporting to a terminal and we're not done + * yet. + */ + fprintf(stderr, (!finished && isatty(fileno(stderr))) ? 
"\r" : "\n"); } /* diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index 5cf5f17bb5f1..8a9319ed6759 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -53,7 +53,7 @@ extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, const char *restoreCommand); /* in pg_rewind.c */ -extern void progress_report(bool force); +extern void progress_report(bool finished); /* in timeline.c */ extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer, From 3941eb6341d8274dd63a26972042da6632533f2b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Aug 2020 10:50:13 +0300 Subject: [PATCH 312/334] Make xact.h usable in frontend. xact.h included utils/datetime.h, which cannot be used in the frontend (it includes fmgr.h, which needs Datum). But xact.h only needs the definition of TimestampTz from it, which is available directly in datatypes/timestamp.h. Change xact.h to include that instead of utils/datetime.h, so that it can be used in client programs. 
--- contrib/pg_prewarm/autoprewarm.c | 1 + contrib/postgres_fdw/connection.c | 1 + src/backend/nodes/params.c | 1 + src/backend/utils/time/snapmgr.c | 2 ++ src/include/access/xact.h | 2 +- 5 files changed, 6 insertions(+), 1 deletion(-) diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index d797095458a4..c32ddc56fdbc 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -46,6 +46,7 @@ #include "storage/smgr.h" #include "tcop/tcopprot.h" #include "utils/acl.h" +#include "utils/datetime.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/rel.h" diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c index 52d1fe356315..08daf26fdf08 100644 --- a/contrib/postgres_fdw/connection.c +++ b/contrib/postgres_fdw/connection.c @@ -22,6 +22,7 @@ #include "postgres_fdw.h" #include "storage/fd.h" #include "storage/latch.h" +#include "utils/datetime.h" #include "utils/hsearch.h" #include "utils/inval.h" #include "utils/memutils.h" diff --git a/src/backend/nodes/params.c b/src/backend/nodes/params.c index 1719119fc28f..bce0c7e72b2c 100644 --- a/src/backend/nodes/params.c +++ b/src/backend/nodes/params.c @@ -16,6 +16,7 @@ #include "postgres.h" #include "access/xact.h" +#include "fmgr.h" #include "mb/stringinfo_mb.h" #include "nodes/params.h" #include "parser/parse_node.h" diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 752af0c10dfc..c208538e2e5c 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -53,6 +53,7 @@ #include "access/xact.h" #include "access/xlog.h" #include "catalog/catalog.h" +#include "datatype/timestamp.h" #include "lib/pairingheap.h" #include "miscadmin.h" #include "storage/predicate.h" @@ -67,6 +68,7 @@ #include "utils/resowner_private.h" #include "utils/snapmgr.h" #include "utils/syscache.h" +#include "utils/timestamp.h" /* diff --git a/src/include/access/xact.h 
b/src/include/access/xact.h index c59de9bebaf8..df1b43a932e3 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -16,11 +16,11 @@ #include "access/transam.h" #include "access/xlogreader.h" +#include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" #include "storage/relfilenode.h" #include "storage/sinval.h" -#include "utils/datetime.h" /* * Maximum size of Global Transaction ID (including '\0'). From a28d731a1187e8d9d8c2b6319375fcbf0a8debd5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Aug 2020 10:52:58 +0300 Subject: [PATCH 313/334] Mark commit and abort WAL records with XLR_SPECIAL_REL_UPDATE. If a commit or abort record includes "dropped relfilenodes", then replaying the record will remove data files. That is surely a "special rel update", but the records were not marked as such. Fix that, teach pg_rewind to expect and ignore them, and add a test case to cover it. It's always been like this, but no backporting for fear of breaking existing applications. If an application parsed the WAL but was not handling commit/abort records, it would stop working. That might be a good thing if it really needed to handle the dropped rels, but it will be caught when the application is updated to work with PostgreSQL v14 anyway. 
Discussion: https://www.postgresql.org/message-id/07b33e2c-46a6-86a1-5f9e-a7da73fddb95%40iki.fi Reviewed-by: Amit Kapila, Michael Paquier --- src/backend/access/transam/xact.c | 2 ++ src/bin/pg_rewind/parsexlog.c | 13 +++++++++++++ src/bin/pg_rewind/t/001_basic.pl | 15 ++++++++++++++- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 7ccb7d68ed9a..af6afcebb133 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -5565,6 +5565,7 @@ XactLogCommitRecord(TimestampTz commit_time, { xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES; xl_relfilenodes.nrels = nrels; + info |= XLR_SPECIAL_REL_UPDATE; } if (nmsgs > 0) @@ -5697,6 +5698,7 @@ XactLogAbortRecord(TimestampTz abort_time, { xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES; xl_relfilenodes.nrels = nrels; + info |= XLR_SPECIAL_REL_UPDATE; } if (TransactionIdIsValid(twophase_xid)) diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 2325fb5d3021..2229c86f9afb 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -14,6 +14,7 @@ #include #include "access/rmgr.h" +#include "access/xact.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "catalog/pg_control.h" @@ -397,6 +398,18 @@ extractPageInfo(XLogReaderState *record) * source system. */ } + else if (rmid == RM_XACT_ID && + ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT || + (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED || + (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT || + (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT_PREPARED)) + { + /* + * These records can include "dropped rels". We can safely ignore + * them, we will see that they are missing and copy them from the + * source. 
+ */ + } else if (info & XLR_SPECIAL_REL_UPDATE) { /* diff --git a/src/bin/pg_rewind/t/001_basic.pl b/src/bin/pg_rewind/t/001_basic.pl index fb4a0acd965a..ba528e262f32 100644 --- a/src/bin/pg_rewind/t/001_basic.pl +++ b/src/bin/pg_rewind/t/001_basic.pl @@ -1,7 +1,7 @@ use strict; use warnings; use TestLib; -use Test::More tests => 20; +use Test::More tests => 23; use FindBin; use lib $FindBin::RealBin; @@ -29,6 +29,10 @@ sub run_test primary_psql("CREATE TABLE tail_tbl (id integer, d text)"); primary_psql("INSERT INTO tail_tbl VALUES (0, 'in primary')"); + # This test table is dropped in the old primary after promotion. + primary_psql("CREATE TABLE drop_tbl (d text)"); + primary_psql("INSERT INTO drop_tbl VALUES ('in primary')"); + primary_psql("CHECKPOINT"); RewindTest::create_standby($test_mode); @@ -66,6 +70,9 @@ sub run_test primary_psql("DELETE FROM tail_tbl WHERE id > 10"); primary_psql("VACUUM tail_tbl"); + # Drop drop_tbl. pg_rewind should copy it back. + primary_psql("DROP TABLE drop_tbl"); + # Before running pg_rewind, do a couple of extra tests with several # option combinations. As the code paths taken by those tests # do not change for the "local" and "remote" modes, just run them @@ -154,6 +161,12 @@ sub run_test ), 'tail-copy'); + check_query( + 'SELECT * FROM drop_tbl', + qq(in primary +), + 'drop'); + # Permissions on PGDATA should be default SKIP: { From 22e75a341ecc841bdc1db417d11a643b0a42df4f Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 17 Aug 2020 15:40:07 -0400 Subject: [PATCH 314/334] Doc: fix description of UNION/CASE/etc type unification. The description of what select_common_type() does was not terribly accurate. Improve it. 
David Johnston and Tom Lane Discussion: https://postgr.es/m/1019930.1597613200@sss.pgh.pa.us --- doc/src/sgml/typeconv.sgml | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/doc/src/sgml/typeconv.sgml b/doc/src/sgml/typeconv.sgml index 81dba7dacfed..8900d0eb3832 100644 --- a/doc/src/sgml/typeconv.sgml +++ b/doc/src/sgml/typeconv.sgml @@ -1069,7 +1069,7 @@ domain's base type for all subsequent steps. functions, this behavior allows a domain type to be preserved through a UNION or similar construct, so long as the user is careful to ensure that all inputs are implicitly or explicitly of that - exact type. Otherwise the domain's base type will be preferred. + exact type. Otherwise the domain's base type will be used. @@ -1092,24 +1092,29 @@ If the non-unknown inputs are not all of the same type category, fail. -Choose the first non-unknown input type which is a preferred type in -that category, if there is one. - - - - - -Otherwise, choose the last non-unknown input type that allows all the -preceding non-unknown inputs to be implicitly converted to it. (There -always is such a type, since at least the first type in the list must -satisfy this condition.) +Select the first non-unknown input type as the candidate type, +then consider each other non-unknown input type, left to right. + + + For historical reasons, CASE treats + its ELSE clause (if any) as the first + input, with the THEN clauses(s) considered after + that. In all other cases, left to right means the order + in which the expressions appear in the query text. + + +If the candidate type can be implicitly converted to the other type, +but not vice-versa, select the other type as the new candidate type. +Then continue considering the remaining inputs. If, at any stage of this +process, a preferred type is selected, stop considering additional +inputs. -Convert all inputs to the selected type. 
Fail if there is not a -conversion from a given input to the selected type. +Convert all inputs to the final candidate type. Fail if there is not an +implicit conversion from a given input type to the candidate type. From 6e70443edacfc86674995c0c10ade0aec7a4fddf Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 17 Aug 2020 16:20:06 -0400 Subject: [PATCH 315/334] Disable autovacuum for BRIN test table This should improve stability in the tests. Per buildfarm member hyrax (CLOBBER_CACHE_ALWAYS) via Tom Lane. Discussion: https://postgr.es/m/871534.1597503261@sss.pgh.pa.us --- src/test/regress/expected/brin.out | 2 +- src/test/regress/sql/brin.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out index 0b14c73fc645..18403498dfab 100644 --- a/src/test/regress/expected/brin.out +++ b/src/test/regress/expected/brin.out @@ -26,7 +26,7 @@ CREATE TABLE brintest (byteacol bytea, int4rangecol int4range, lsncol pg_lsn, boxcol box -) WITH (fillfactor=10); +) WITH (fillfactor=10, autovacuum_enabled=off); INSERT INTO brintest SELECT repeat(stringu1, 8)::bytea, substr(stringu1, 1, 1)::"char", diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql index 1289e76ecb9b..d1a82474f3f1 100644 --- a/src/test/regress/sql/brin.sql +++ b/src/test/regress/sql/brin.sql @@ -26,7 +26,7 @@ CREATE TABLE brintest (byteacol bytea, int4rangecol int4range, lsncol pg_lsn, boxcol box -) WITH (fillfactor=10); +) WITH (fillfactor=10, autovacuum_enabled=off); INSERT INTO brintest SELECT repeat(stringu1, 8)::bytea, From adbe62d04b360bbd408d97e447932d8078485972 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 18 Aug 2020 11:10:50 +0900 Subject: [PATCH 316/334] Add PL/Sample to src/test/modules/ PL/Sample is an example template of procedural-language handler. This can be used as a base to implement a custom PL, or as a facility to test APIs dedicated to PLs. 
Much more could be done in this module, like adding a simple validator, but this is left as future work. The documentation included originally some C code to understand the basics of PL handler implementation, but it was outdated, and not really helpful either if trying to implement a new procedural language, particularly when it came to the integration of a PL installation with CREATE EXTENSION. Author: Mark Wong Reviewed-by: Tom Lane, Michael Paquier Discussion: https://postgr.es/m/20200612172648.GA3327@2ndQuadrant.com --- doc/src/sgml/plhandler.sgml | 60 +----- src/test/modules/Makefile | 1 + src/test/modules/plsample/.gitignore | 3 + src/test/modules/plsample/Makefile | 20 ++ src/test/modules/plsample/README | 6 + .../modules/plsample/expected/plsample.out | 36 ++++ src/test/modules/plsample/plsample--1.0.sql | 14 ++ src/test/modules/plsample/plsample.c | 183 ++++++++++++++++++ src/test/modules/plsample/plsample.control | 8 + src/test/modules/plsample/sql/plsample.sql | 15 ++ 10 files changed, 290 insertions(+), 56 deletions(-) create mode 100644 src/test/modules/plsample/.gitignore create mode 100644 src/test/modules/plsample/Makefile create mode 100644 src/test/modules/plsample/README create mode 100644 src/test/modules/plsample/expected/plsample.out create mode 100644 src/test/modules/plsample/plsample--1.0.sql create mode 100644 src/test/modules/plsample/plsample.c create mode 100644 src/test/modules/plsample/plsample.control create mode 100644 src/test/modules/plsample/sql/plsample.sql diff --git a/doc/src/sgml/plhandler.sgml b/doc/src/sgml/plhandler.sgml index e1b0af7a60d1..40ee59de9f34 100644 --- a/doc/src/sgml/plhandler.sgml +++ b/doc/src/sgml/plhandler.sgml @@ -96,62 +96,10 @@ - This is a template for a procedural-language handler written in C: - -#include "postgres.h" -#include "executor/spi.h" -#include "commands/trigger.h" -#include "fmgr.h" -#include "access/heapam.h" -#include "utils/syscache.h" -#include "catalog/pg_proc.h" -#include 
"catalog/pg_type.h" - -PG_MODULE_MAGIC; - -PG_FUNCTION_INFO_V1(plsample_call_handler); - -Datum -plsample_call_handler(PG_FUNCTION_ARGS) -{ - Datum retval; - - if (CALLED_AS_TRIGGER(fcinfo)) - { - /* - * Called as a trigger function - */ - TriggerData *trigdata = (TriggerData *) fcinfo->context; - - retval = ... - } - else - { - /* - * Called as a function - */ - - retval = ... - } - - return retval; -} - - Only a few thousand lines of code have to be added instead of the - dots to complete the call handler. - - - - After having compiled the handler function into a loadable module - (see ), the following commands then - register the sample procedural language: - -CREATE FUNCTION plsample_call_handler() RETURNS language_handler - AS 'filename' - LANGUAGE C; -CREATE LANGUAGE plsample - HANDLER plsample_call_handler; - + A template for a procedural-language handler written as a C extension is + provided in src/test/modules/plsample. This is a + working sample demonstrating one way to create a procedural-language + handler, process parameters, and return a value. 
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 1428529b041a..a6d2ffbf9e0e 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -10,6 +10,7 @@ SUBDIRS = \ delay_execution \ dummy_index_am \ dummy_seclabel \ + plsample \ snapshot_too_old \ test_bloomfilter \ test_ddl_deparse \ diff --git a/src/test/modules/plsample/.gitignore b/src/test/modules/plsample/.gitignore new file mode 100644 index 000000000000..44d119cfcc24 --- /dev/null +++ b/src/test/modules/plsample/.gitignore @@ -0,0 +1,3 @@ +# Generated subdirectories +/log/ +/results/ diff --git a/src/test/modules/plsample/Makefile b/src/test/modules/plsample/Makefile new file mode 100644 index 000000000000..f1bc334bfc87 --- /dev/null +++ b/src/test/modules/plsample/Makefile @@ -0,0 +1,20 @@ +# src/test/modules/plsample/Makefile + +MODULES = plsample + +EXTENSION = plsample +DATA = plsample--1.0.sql +PGFILEDESC = "PL/Sample - template for procedural language" + +REGRESS = plsample + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/plsample +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/plsample/README b/src/test/modules/plsample/README new file mode 100644 index 000000000000..0ed319308d22 --- /dev/null +++ b/src/test/modules/plsample/README @@ -0,0 +1,6 @@ +PL/Sample +========= + +PL/Sample is an example template of procedural-language handler. It is +a simple implementation, yet demonstrates some of the things that can be done +to build a fully functional procedural-language handler. 
diff --git a/src/test/modules/plsample/expected/plsample.out b/src/test/modules/plsample/expected/plsample.out new file mode 100644 index 000000000000..a0c318b6df55 --- /dev/null +++ b/src/test/modules/plsample/expected/plsample.out @@ -0,0 +1,36 @@ +CREATE EXTENSION plsample; +-- Create and test some dummy functions +CREATE FUNCTION plsample_result_text(a1 numeric, a2 text, a3 integer[]) +RETURNS TEXT +AS $$ + Example of source with text result. +$$ LANGUAGE plsample; +SELECT plsample_result_text(1.23, 'abc', '{4, 5, 6}'); +NOTICE: source text of function "plsample_result_text": + Example of source with text result. + +NOTICE: argument: 0; name: a1; value: 1.23 +NOTICE: argument: 1; name: a2; value: abc +NOTICE: argument: 2; name: a3; value: {4,5,6} + plsample_result_text +--------------------------------------- + + + Example of source with text result.+ + +(1 row) + +CREATE FUNCTION plsample_result_void(a1 text[]) +RETURNS VOID +AS $$ + Example of source with void result. +$$ LANGUAGE plsample; +SELECT plsample_result_void('{foo, bar, hoge}'); +NOTICE: source text of function "plsample_result_void": + Example of source with void result. + +NOTICE: argument: 0; name: a1; value: {foo,bar,hoge} + plsample_result_void +---------------------- + +(1 row) + diff --git a/src/test/modules/plsample/plsample--1.0.sql b/src/test/modules/plsample/plsample--1.0.sql new file mode 100644 index 000000000000..fc5b280bd4fa --- /dev/null +++ b/src/test/modules/plsample/plsample--1.0.sql @@ -0,0 +1,14 @@ +/* src/test/modules/plsample/plsample--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION plsample" to load this file. 
\quit + +CREATE FUNCTION plsample_call_handler() RETURNS language_handler + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE TRUSTED LANGUAGE plsample + HANDLER plsample_call_handler; + +ALTER LANGUAGE plsample OWNER TO @extowner@; + +COMMENT ON LANGUAGE plsample IS 'PL/Sample procedural language'; diff --git a/src/test/modules/plsample/plsample.c b/src/test/modules/plsample/plsample.c new file mode 100644 index 000000000000..408366906697 --- /dev/null +++ b/src/test/modules/plsample/plsample.c @@ -0,0 +1,183 @@ +/*------------------------------------------------------------------------- + * + * plsample.c + * Handler for the PL/Sample procedural language + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/test/modules/plsample/plsample.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "commands/event_trigger.h" +#include "commands/trigger.h" +#include "funcapi.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(plsample_call_handler); + +static Datum plsample_func_handler(PG_FUNCTION_ARGS); + +/* + * Handle function, procedure, and trigger calls. + */ +Datum +plsample_call_handler(PG_FUNCTION_ARGS) +{ + Datum retval = (Datum) 0; + + PG_TRY(); + { + /* + * Determine if called as function or trigger and call appropriate + * subhandler. + */ + if (CALLED_AS_TRIGGER(fcinfo)) + { + /* + * This function has been called as a trigger function, where + * (TriggerData *) fcinfo->context includes the information of the + * context. 
+ */ + } + else if (CALLED_AS_EVENT_TRIGGER(fcinfo)) + { + /* + * This function is called as an event trigger function, where + * (EventTriggerData *) fcinfo->context includes the information + * of the context. + */ + } + else + { + /* Regular function handler */ + retval = plsample_func_handler(fcinfo); + } + } + PG_FINALLY(); + { + } + PG_END_TRY(); + + return retval; +} + +/* + * plsample_func_handler + * + * Function called by the call handler for function execution. + */ +static Datum +plsample_func_handler(PG_FUNCTION_ARGS) +{ + HeapTuple pl_tuple; + Datum ret; + char *source; + bool isnull; + FmgrInfo *arg_out_func; + Form_pg_type type_struct; + HeapTuple type_tuple; + Form_pg_proc pl_struct; + volatile MemoryContext proc_cxt = NULL; + Oid *argtypes; + char **argnames; + char *argmodes; + char *proname; + Form_pg_type pg_type_entry; + Oid result_typioparam; + FmgrInfo result_in_func; + int numargs; + + /* Fetch the source text of the function. */ + pl_tuple = SearchSysCache(PROCOID, + ObjectIdGetDatum(fcinfo->flinfo->fn_oid), 0, 0, 0); + if (!HeapTupleIsValid(pl_tuple)) + elog(ERROR, "cache lookup failed for function %u", + fcinfo->flinfo->fn_oid); + + /* + * Extract and print the source text of the function. This can be used as + * a base for the function validation and execution. + */ + pl_struct = (Form_pg_proc) GETSTRUCT(pl_tuple); + proname = pstrdup(NameStr(pl_struct->proname)); + ret = SysCacheGetAttr(PROCOID, pl_tuple, Anum_pg_proc_prosrc, &isnull); + if (isnull) + elog(ERROR, "could not find source text of function \"%s\"", + proname); + ReleaseSysCache(pl_tuple); + source = DatumGetCString(DirectFunctionCall1(textout, ret)); + ereport(NOTICE, + (errmsg("source text of function \"%s\": %s", + proname, source))); + + /* + * Allocate a context that will hold all the Postgres data for the + * procedure. 
+ */ + proc_cxt = AllocSetContextCreate(TopMemoryContext, + "PL/Sample function", + ALLOCSET_SMALL_SIZES); + + arg_out_func = (FmgrInfo *) palloc0(fcinfo->nargs * sizeof(FmgrInfo)); + numargs = get_func_arg_info(pl_tuple, &argtypes, &argnames, &argmodes); + + /* + * Iterate through all of the function arguments, printing each input + * value. + */ + for (int i = 0; i < numargs; i++) + { + Oid argtype = pl_struct->proargtypes.values[i]; + char *value; + + type_tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(argtype)); + if (!HeapTupleIsValid(type_tuple)) + elog(ERROR, "cache lookup failed for type %u", argtype); + + type_struct = (Form_pg_type) GETSTRUCT(type_tuple); + fmgr_info_cxt(type_struct->typoutput, &(arg_out_func[i]), proc_cxt); + ReleaseSysCache(type_tuple); + + value = OutputFunctionCall(&arg_out_func[i], fcinfo->args[i].value); + ereport(NOTICE, + (errmsg("argument: %d; name: %s; value: %s", + i, argnames[i], value))); + } + + /* + * Get the required information for input conversion of the return value. + * + * If the function uses VOID as result, it is better to return NULL. + * Anyway, let's be honest. This is just a template, so there is not much + * we can do here. This returns NULL except if the result type is text, + * where the result is the source text of the function. 
+ */ + if (pl_struct->prorettype != TEXTOID) + PG_RETURN_NULL(); + + type_tuple = SearchSysCache1(TYPEOID, + ObjectIdGetDatum(pl_struct->prorettype)); + if (!HeapTupleIsValid(type_tuple)) + elog(ERROR, "cache lookup failed for type %u", pl_struct->prorettype); + pg_type_entry = (Form_pg_type) GETSTRUCT(type_tuple); + result_typioparam = getTypeIOParam(type_tuple); + + fmgr_info_cxt(pg_type_entry->typinput, &result_in_func, proc_cxt); + ReleaseSysCache(type_tuple); + + ret = InputFunctionCall(&result_in_func, source, result_typioparam, -1); + PG_RETURN_DATUM(ret); +} diff --git a/src/test/modules/plsample/plsample.control b/src/test/modules/plsample/plsample.control new file mode 100644 index 000000000000..1e67251a1e03 --- /dev/null +++ b/src/test/modules/plsample/plsample.control @@ -0,0 +1,8 @@ +# plsample extension +comment = 'PL/Sample' +default_version = '1.0' +module_pathname = '$libdir/plsample' +relocatable = false +schema = pg_catalog +superuser = false +trusted = true diff --git a/src/test/modules/plsample/sql/plsample.sql b/src/test/modules/plsample/sql/plsample.sql new file mode 100644 index 000000000000..bf0fddac7fc8 --- /dev/null +++ b/src/test/modules/plsample/sql/plsample.sql @@ -0,0 +1,15 @@ +CREATE EXTENSION plsample; +-- Create and test some dummy functions +CREATE FUNCTION plsample_result_text(a1 numeric, a2 text, a3 integer[]) +RETURNS TEXT +AS $$ + Example of source with text result. +$$ LANGUAGE plsample; +SELECT plsample_result_text(1.23, 'abc', '{4, 5, 6}'); + +CREATE FUNCTION plsample_result_void(a1 text[]) +RETURNS VOID +AS $$ + Example of source with void result. +$$ LANGUAGE plsample; +SELECT plsample_result_void('{foo, bar, hoge}'); From 51300b45db95b6fd29f88534ab0739fdc9df1699 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 18 Aug 2020 12:24:22 +0900 Subject: [PATCH 317/334] Fix use-after-release issue in PL/Sample Introduced in adbe62d0. Per buildfarm member prion, when using RELCACHE_FORCE_RELEASE. 
--- src/test/modules/plsample/plsample.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/test/modules/plsample/plsample.c b/src/test/modules/plsample/plsample.c index 408366906697..80faef506b15 100644 --- a/src/test/modules/plsample/plsample.c +++ b/src/test/modules/plsample/plsample.c @@ -97,6 +97,7 @@ plsample_func_handler(PG_FUNCTION_ARGS) char *proname; Form_pg_type pg_type_entry; Oid result_typioparam; + Oid prorettype; FmgrInfo result_in_func; int numargs; @@ -117,7 +118,6 @@ plsample_func_handler(PG_FUNCTION_ARGS) if (isnull) elog(ERROR, "could not find source text of function \"%s\"", proname); - ReleaseSysCache(pl_tuple); source = DatumGetCString(DirectFunctionCall1(textout, ret)); ereport(NOTICE, (errmsg("source text of function \"%s\": %s", @@ -157,6 +157,10 @@ plsample_func_handler(PG_FUNCTION_ARGS) i, argnames[i], value))); } + /* Type of the result */ + prorettype = pl_struct->prorettype; + ReleaseSysCache(pl_tuple); + /* * Get the required information for input conversion of the return value. * @@ -165,13 +169,13 @@ plsample_func_handler(PG_FUNCTION_ARGS) * we can do here. This returns NULL except if the result type is text, * where the result is the source text of the function. */ - if (pl_struct->prorettype != TEXTOID) + if (prorettype != TEXTOID) PG_RETURN_NULL(); type_tuple = SearchSysCache1(TYPEOID, - ObjectIdGetDatum(pl_struct->prorettype)); + ObjectIdGetDatum(prorettype)); if (!HeapTupleIsValid(type_tuple)) - elog(ERROR, "cache lookup failed for type %u", pl_struct->prorettype); + elog(ERROR, "cache lookup failed for type %u", prorettype); pg_type_entry = (Form_pg_type) GETSTRUCT(type_tuple); result_typioparam = getTypeIOParam(type_tuple); From 623a9ba79bbdd11c5eccb30b8bd5c446130e521c Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 17 Aug 2020 21:07:10 -0700 Subject: [PATCH 318/334] snapshot scalability: cache snapshots using a xact completion counter. 
Previous commits made it faster/more scalable to compute snapshots. But not building a snapshot is still faster. Now that GetSnapshotData() does not maintain RecentGlobal* anymore, that is actually not too hard: This commit introduces xactCompletionCount, which tracks the number of top-level transactions with xids (i.e. which may have modified the database) that completed in some form since the start of the server. We can avoid rebuilding the snapshot's contents whenever the current xactCompletionCount is the same as it was when the snapshot was originally built. Currently this check happens while holding ProcArrayLock. While it's likely possible to perform the check without acquiring ProcArrayLock, it seems better to do that separately / later, some careful analysis is required. Even with the lock this is a significant win on its own. On a smaller two socket machine this gains another ~1.03x, on a larger machine the effect is roughly double (earlier patch version tested though). If we were able to safely avoid the lock there'd be another significant gain on top of that. 
Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/replication/logical/snapbuild.c | 1 + src/backend/storage/ipc/procarray.c | 125 ++++++++++++++++---- src/backend/utils/time/snapmgr.c | 4 + src/include/access/transam.h | 9 ++ src/include/utils/snapshot.h | 7 ++ 5 files changed, 126 insertions(+), 20 deletions(-) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index e9701ea72215..9d5d68f3fa78 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -524,6 +524,7 @@ SnapBuildBuildSnapshot(SnapBuild *builder) snapshot->curcid = FirstCommandId; snapshot->active_count = 0; snapshot->regd_count = 0; + snapshot->snapXactCompletionCount = 0; return snapshot; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 96e4a8785760..e687cde6f176 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -407,6 +407,7 @@ CreateSharedProcArray(void) procArray->lastOverflowedXid = InvalidTransactionId; procArray->replication_slot_xmin = InvalidTransactionId; procArray->replication_slot_catalog_xmin = InvalidTransactionId; + ShmemVariableCache->xactCompletionCount = 1; } allProcs = ProcGlobal->allProcs; @@ -534,6 +535,9 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) /* Advance global latestCompletedXid while holding the lock */ MaintainLatestCompletedXid(latestXid); + /* Same with xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + ProcGlobal->xids[proc->pgxactoff] = 0; ProcGlobal->subxidStates[proc->pgxactoff].overflowed = false; ProcGlobal->subxidStates[proc->pgxactoff].count = 0; @@ -667,6 +671,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) { size_t pgxactoff = proc->pgxactoff; + 
Assert(LWLockHeldByMe(ProcArrayLock)); Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff])); Assert(ProcGlobal->xids[pgxactoff] == proc->xid); @@ -698,6 +703,9 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) /* Also advance global latestCompletedXid while holding the lock */ MaintainLatestCompletedXid(latestXid); + + /* Same with xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; } /* @@ -1916,6 +1924,93 @@ GetMaxSnapshotSubxidCount(void) return TOTAL_MAX_CACHED_SUBXIDS; } +/* + * Initialize old_snapshot_threshold specific parts of a newly build snapshot. + */ +static void +GetSnapshotDataInitOldSnapshot(Snapshot snapshot) +{ + if (!OldSnapshotThresholdActive()) + { + /* + * If not using "snapshot too old" feature, fill related fields with + * dummy values that don't require any locking. + */ + snapshot->lsn = InvalidXLogRecPtr; + snapshot->whenTaken = 0; + } + else + { + /* + * Capture the current time and WAL stream location in case this + * snapshot becomes old enough to need to fall back on the special + * "old snapshot" logic. + */ + snapshot->lsn = GetXLogInsertRecPtr(); + snapshot->whenTaken = GetSnapshotCurrentTimestamp(); + MaintainOldSnapshotTimeMapping(snapshot->whenTaken, snapshot->xmin); + } +} + +/* + * Helper function for GetSnapshotData() that checks if the bulk of the + * visibility information in the snapshot is still valid. If so, it updates + * the fields that need to change and returns true. Otherwise it returns + * false. + * + * This very likely can be evolved to not need ProcArrayLock held (at very + * least in the case we already hold a snapshot), but that's for another day. 
+ */
+static bool
+GetSnapshotDataReuse(Snapshot snapshot)
+{
+	uint64		curXactCompletionCount;
+
+	Assert(LWLockHeldByMe(ProcArrayLock));
+
+	if (unlikely(snapshot->snapXactCompletionCount == 0))
+		return false;
+
+	curXactCompletionCount = ShmemVariableCache->xactCompletionCount;
+	if (curXactCompletionCount != snapshot->snapXactCompletionCount)
+		return false;
+
+	/*
+	 * If the current xactCompletionCount is still the same as it was at the
+	 * time the snapshot was built, we can be sure that rebuilding the
+	 * contents of the snapshot the hard way would result in the same snapshot
+	 * contents:
+	 *
+	 * As explained in transam/README, the set of xids considered running by
+	 * GetSnapshotData() cannot change while ProcArrayLock is held. Snapshot
+	 * contents only depend on transactions with xids and xactCompletionCount
+	 * is incremented whenever a transaction with an xid finishes (while
+	 * holding ProcArrayLock exclusively). Thus the xactCompletionCount check
+	 * ensures we would detect if the snapshot would have changed.
+	 *
+	 * As the snapshot contents are the same as it was before, it is safe
+	 * to re-enter the snapshot's xmin into the PGPROC array. None of the rows
+	 * visible under the snapshot could already have been removed (that'd
+	 * require the set of running transactions to change) and it fulfills the
+	 * requirement that concurrent GetSnapshotData() calls yield the same
+	 * xmin.
+	 */
+	if (!TransactionIdIsValid(MyProc->xmin))
+		MyProc->xmin = TransactionXmin = snapshot->xmin;
+
+	RecentXmin = snapshot->xmin;
+	Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin));
+
+	snapshot->curcid = GetCurrentCommandId(false);
+	snapshot->active_count = 0;
+	snapshot->regd_count = 0;
+	snapshot->copied = false;
+
+	GetSnapshotDataInitOldSnapshot(snapshot);
+
+	return true;
+}
+
 /*
  * GetSnapshotData -- returns information about running transactions.
* @@ -1963,6 +2058,7 @@ GetSnapshotData(Snapshot snapshot) TransactionId oldestxid; int mypgxactoff; TransactionId myxid; + uint64 curXactCompletionCount; TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; @@ -2007,12 +2103,19 @@ GetSnapshotData(Snapshot snapshot) */ LWLockAcquire(ProcArrayLock, LW_SHARED); + if (GetSnapshotDataReuse(snapshot)) + { + LWLockRelease(ProcArrayLock); + return snapshot; + } + latest_completed = ShmemVariableCache->latestCompletedXid; mypgxactoff = MyProc->pgxactoff; myxid = other_xids[mypgxactoff]; Assert(myxid == MyProc->xid); oldestxid = ShmemVariableCache->oldestXid; + curXactCompletionCount = ShmemVariableCache->xactCompletionCount; /* xmax is always latestCompletedXid + 1 */ xmax = XidFromFullTransactionId(latest_completed); @@ -2266,6 +2369,7 @@ GetSnapshotData(Snapshot snapshot) snapshot->xcnt = count; snapshot->subxcnt = subcount; snapshot->suboverflowed = suboverflowed; + snapshot->snapXactCompletionCount = curXactCompletionCount; snapshot->curcid = GetCurrentCommandId(false); @@ -2277,26 +2381,7 @@ GetSnapshotData(Snapshot snapshot) snapshot->regd_count = 0; snapshot->copied = false; - if (old_snapshot_threshold < 0) - { - /* - * If not using "snapshot too old" feature, fill related fields with - * dummy values that don't require any locking. - */ - snapshot->lsn = InvalidXLogRecPtr; - snapshot->whenTaken = 0; - } - else - { - /* - * Capture the current time and WAL stream location in case this - * snapshot becomes old enough to need to fall back on the special - * "old snapshot" logic. 
- */ - snapshot->lsn = GetXLogInsertRecPtr(); - snapshot->whenTaken = GetSnapshotCurrentTimestamp(); - MaintainOldSnapshotTimeMapping(snapshot->whenTaken, xmin); - } + GetSnapshotDataInitOldSnapshot(snapshot); return snapshot; } diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index c208538e2e5c..22cf3ebaf472 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -597,6 +597,8 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery; /* NB: curcid should NOT be copied, it's a local matter */ + CurrentSnapshot->snapXactCompletionCount = 0; + /* * Now we have to fix what GetSnapshotData did with MyProc->xmin and * TransactionXmin. There is a race condition: to make sure we are not @@ -672,6 +674,7 @@ CopySnapshot(Snapshot snapshot) newsnap->regd_count = 0; newsnap->active_count = 0; newsnap->copied = true; + newsnap->snapXactCompletionCount = 0; /* setup XID array */ if (snapshot->xcnt > 0) @@ -2209,6 +2212,7 @@ RestoreSnapshot(char *start_address) snapshot->curcid = serialized_snapshot.curcid; snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; + snapshot->snapXactCompletionCount = 0; /* Copy XIDs, if present. */ if (serialized_snapshot.xcnt > 0) diff --git a/src/include/access/transam.h b/src/include/access/transam.h index b32044153b09..2f1f144db4d0 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -231,6 +231,15 @@ typedef struct VariableCacheData FullTransactionId latestCompletedXid; /* newest full XID that has * committed or aborted */ + /* + * Number of top-level transactions with xids (i.e. which may have + * modified the database) that completed in some form since the start of + * the server. This currently is solely used to check whether + * GetSnapshotData() needs to recompute the contents of the snapshot, or + * not. 
There are likely other users of this. Always above 1. + */ + uint64 xactCompletionCount; + /* * These fields are protected by XactTruncationLock */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 35b1f05bea65..dea072e5edf5 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -207,6 +207,13 @@ typedef struct SnapshotData TimestampTz whenTaken; /* timestamp when snapshot was taken */ XLogRecPtr lsn; /* position in the WAL stream when taken */ + + /* + * The transaction completion count at the time GetSnapshotData() built + * this snapshot. Allows to avoid re-computing static snapshots when no + * transactions completed since the last GetSnapshotData(). + */ + uint64 snapXactCompletionCount; } SnapshotData; #endif /* SNAPSHOT_H */ From 734478200ababcbb328ec3f02a74047bc470cae2 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Aug 2020 13:13:09 +0300 Subject: [PATCH 319/334] Avoid non-constant format string argument to fprintf(). As Tom Lane pointed out, it could defeat the compiler's printf() format string verification. Backpatch to v12, like that patch that introduced it. Discussion: https://www.postgresql.org/message-id/1069283.1597672779%40sss.pgh.pa.us --- src/bin/pg_basebackup/pg_basebackup.c | 2 +- src/bin/pg_checksums/pg_checksums.c | 2 +- src/bin/pg_rewind/pg_rewind.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 8158c8e41957..7a5d4562f946 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -860,7 +860,7 @@ progress_report(int tablespacenum, const char *filename, * Stay on the same line if reporting to a terminal and we're not done * yet. */ - fprintf(stderr, (!finished && isatty(fileno(stderr))) ? "\r" : "\n"); + fputc((!finished && isatty(fileno(stderr))) ? 
'\r' : '\n', stderr); } static int32 diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 0696db69bbd5..ffdc23945c6d 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -166,7 +166,7 @@ progress_report(bool finished) * Stay on the same line if reporting to a terminal and we're not done * yet. */ - fprintf(stderr, (!finished && isatty(fileno(stderr))) ? "\r" : "\n"); + fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr); } static bool diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index a9aecc790528..23fc749e4451 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -572,7 +572,7 @@ progress_report(bool finished) * Stay on the same line if reporting to a terminal and we're not done * yet. */ - fprintf(stderr, (!finished && isatty(fileno(stderr))) ? "\r" : "\n"); + fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr); } /* From 07f32fcd23ac81898ed47f88beb569c631a2f223 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 18 Aug 2020 16:31:12 -0700 Subject: [PATCH 320/334] Fix race condition in snapshot caching when 2PC is used. When preparing a transaction xactCompletionCount needs to be incremented, even though the transaction has not committed yet. Otherwise the snapshot used within the transaction otherwise can get reused outside of the prepared transaction. As GetSnapshotData() does not include the current xid when building a snapshot, reuse would not be correct. Somewhat surprisingly the regression tests only rarely show incorrect results without the fix. The reason for that is that often the snapshot's xmax will be >= the backend xid, yielding a snapshot that is correct, despite the bug. I'm working on a reliable test for the bug, but it seems worth seeing whether this fixes all the BF failures while I do. 
Author: Andres Freund
Discussion: https://postgr.es/m/E1k7tGP-0005V0-5k@gemulon.postgresql.org
---
 src/backend/storage/ipc/procarray.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index e687cde6f176..51f8099cad2c 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -860,6 +860,15 @@ ProcArrayClearTransaction(PGPROC *proc)
 	Assert(!(proc->vacuumFlags & PROC_VACUUM_STATE_MASK));
 	Assert(!proc->delayChkpt);
 
+	/*
+	 * Need to increment completion count even though transaction hasn't
+	 * really committed yet. The reason for that is that GetSnapshotData()
+	 * omits the xid of the current transaction, thus without the increment we
+	 * otherwise could end up reusing the snapshot later. Which would be bad,
+	 * because it might not count the prepared transaction as running.
+	 */
+	ShmemVariableCache->xactCompletionCount++;
+
 	/* Clear the subtransaction-XID cache too */
 	Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count &&
 		   ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed);

From 3e98c0bafb28de87ae095b341687dc082371af54 Mon Sep 17 00:00:00 2001
From: Fujii Masao
Date: Wed, 19 Aug 2020 15:34:43 +0900
Subject: [PATCH 321/334] Add pg_backend_memory_contexts system view.

This view displays the usages of all the memory contexts of the server
process attached to the current session. This information is useful to
investigate the cause of backend-local memory bloat.

This information can be also collected by calling
MemoryContextStats(TopMemoryContext) via a debugger. But this technique
cannot be used in some environments because no debugger is available
there. And it outputs lots of text messages and it's not easy to analyze
them. So, pg_backend_memory_contexts view allows us to access to
backend-local memory contexts information more easily.

Bump catalog version.
Author: Atsushi Torikoshi, Fujii Masao Reviewed-by: Tatsuhito Kasahara, Andres Freund, Daniel Gustafsson, Robert Haas, Michael Paquier Discussion: https://postgr.es/m/72a656e0f71d0860161e0b3f67e4d771@oss.nttdata.com --- doc/src/sgml/catalogs.sgml | 122 +++++++++++++++++++++++ src/backend/catalog/system_views.sql | 3 + src/backend/utils/mmgr/mcxt.c | 138 +++++++++++++++++++++++++++ src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 9 ++ src/test/regress/expected/rules.out | 10 ++ 6 files changed, 283 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index fc329c5cff96..1232b24e74cf 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -9226,6 +9226,11 @@ SCRAM-SHA-256$<iteration count>:&l available versions of extensions + + pg_backend_memory_contexts + backend memory contexts + + pg_config compile-time configuration parameters @@ -9577,6 +9582,123 @@ SCRAM-SHA-256$<iteration count>:&l + + <structname>pg_backend_memory_contexts</structname> + + + pg_backend_memory_contexts + + + + The view pg_backend_memory_contexts displays all + the memory contexts of the server process attached to the current session. + + + pg_backend_memory_contexts contains one row + for each memory context. + + + + <structname>pg_backend_memory_contexts</structname> Columns + + + + + Column Type + + + Description + + + + + + + + name text + + + Name of the memory context + + + + + + ident text + + + Identification information of the memory context. 
This field is truncated at 1024 bytes + + + + + + parent text + + + Name of the parent of this memory context + + + + + + level int4 + + + Distance from TopMemoryContext in context tree + + + + + + total_bytes int8 + + + Total bytes allocated for this memory context + + + + + + total_nblocks int8 + + + Total number of blocks allocated for this memory context + + + + + + free_bytes int8 + + + Free space in bytes + + + + + + free_chunks int8 + + + Total number of free chunks + + + + + + used_bytes int8 + + + Used space in bytes + + + + +
+ +
+ <structname>pg_config</structname> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 8625cbeab6e4..ba5a23ac2524 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -554,6 +554,9 @@ CREATE VIEW pg_shmem_allocations AS REVOKE ALL ON pg_shmem_allocations FROM PUBLIC; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC; +CREATE VIEW pg_backend_memory_contexts AS + SELECT * FROM pg_get_backend_memory_contexts(); + -- Statistics views CREATE VIEW pg_stat_all_tables AS diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index abda22fa570a..d9bb2499db75 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -21,8 +21,10 @@ #include "postgres.h" +#include "funcapi.h" #include "mb/pg_wchar.h" #include "miscadmin.h" +#include "utils/builtins.h" #include "utils/memdebug.h" #include "utils/memutils.h" @@ -67,6 +69,12 @@ static void MemoryContextStatsPrint(MemoryContext context, void *passthru, #define AssertNotInCriticalSection(context) \ Assert(CritSectionCount == 0 || (context)->allowInCritSection) +/* ---------- + * The max bytes for showing identifiers of MemoryContext. + * ---------- + */ +#define MEMORY_CONTEXT_IDENT_DISPLAY_SIZE 1024 + /***************************************************************************** * EXPORTED ROUTINES * *****************************************************************************/ @@ -1220,3 +1228,133 @@ pchomp(const char *in) n--; return pnstrdup(in, n); } + +/* + * PutMemoryContextsStatsTupleStore + * One recursion level for pg_get_backend_memory_contexts. 
+ */ +static void +PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore, + TupleDesc tupdesc, MemoryContext context, + const char *parent, int level) +{ +#define PG_GET_BACKEND_MEMORY_CONTEXTS_COLS 9 + + Datum values[PG_GET_BACKEND_MEMORY_CONTEXTS_COLS]; + bool nulls[PG_GET_BACKEND_MEMORY_CONTEXTS_COLS]; + MemoryContextCounters stat; + MemoryContext child; + const char *name; + const char *ident; + + AssertArg(MemoryContextIsValid(context)); + + name = context->name; + ident = context->ident; + + /* + * To be consistent with logging output, we label dynahash contexts + * with just the hash table name as with MemoryContextStatsPrint(). + */ + if (ident && strcmp(name, "dynahash") == 0) + { + name = ident; + ident = NULL; + } + + /* Examine the context itself */ + memset(&stat, 0, sizeof(stat)); + (*context->methods->stats) (context, NULL, (void *) &level, &stat); + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + if (name) + values[0] = CStringGetTextDatum(name); + else + nulls[0] = true; + + if (ident) + { + int idlen = strlen(ident); + char clipped_ident[MEMORY_CONTEXT_IDENT_DISPLAY_SIZE]; + + /* + * Some identifiers such as SQL query string can be very long, + * truncate oversize identifiers. 
+ */ + if (idlen >= MEMORY_CONTEXT_IDENT_DISPLAY_SIZE) + idlen = pg_mbcliplen(ident, idlen, MEMORY_CONTEXT_IDENT_DISPLAY_SIZE - 1); + + memcpy(clipped_ident, ident, idlen); + clipped_ident[idlen] = '\0'; + values[1] = CStringGetTextDatum(clipped_ident); + } + else + nulls[1] = true; + + if (parent) + values[2] = CStringGetTextDatum(parent); + else + nulls[2] = true; + + values[3] = Int32GetDatum(level); + values[4] = Int64GetDatum(stat.totalspace); + values[5] = Int64GetDatum(stat.nblocks); + values[6] = Int64GetDatum(stat.freespace); + values[7] = Int64GetDatum(stat.freechunks); + values[8] = Int64GetDatum(stat.totalspace - stat.freespace); + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + + for (child = context->firstchild; child != NULL; child = child->nextchild) + { + PutMemoryContextsStatsTupleStore(tupstore, tupdesc, + child, name, level + 1); + } +} + +/* + * pg_get_backend_memory_contexts + * SQL SRF showing backend memory context. + */ +Datum +pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, 
work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + PutMemoryContextsStatsTupleStore(tupstore, tupdesc, + TopMemoryContext, NULL, 0); + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 928495112196..3e6779763000 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202007251 +#define CATALOG_VERSION_NO 202008191 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 082a11f2708c..27989971db74 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -7807,6 +7807,15 @@ proargnames => '{name,off,size,allocated_size}', prosrc => 'pg_get_shmem_allocations' }, +# memory context of local backend +{ oid => '2282', descr => 'information about all memory contexts of local backend', + proname => 'pg_get_backend_memory_contexts', prorows => '100', proretset => 't', + provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => '', + proallargtypes => '{text,text,text,int4,int8,int8,int8,int8,int8}', + proargmodes => '{o,o,o,o,o,o,o,o,o}', + proargnames => '{name, ident, parent, level, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes}', + prosrc => 'pg_get_backend_memory_contexts' }, + # non-persistent series generator { oid => '1066', descr => 'non-persistent series generator', proname => 'generate_series', prorows => '1000', diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 601734a6f1ec..2a18dc423e2b 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1324,6 +1324,16 @@ pg_available_extensions| SELECT e.name, e.comment FROM (pg_available_extensions() e(name, 
default_version, comment) LEFT JOIN pg_extension x ON ((e.name = x.extname))); +pg_backend_memory_contexts| SELECT pg_get_backend_memory_contexts.name, + pg_get_backend_memory_contexts.ident, + pg_get_backend_memory_contexts.parent, + pg_get_backend_memory_contexts.level, + pg_get_backend_memory_contexts.total_bytes, + pg_get_backend_memory_contexts.total_nblocks, + pg_get_backend_memory_contexts.free_bytes, + pg_get_backend_memory_contexts.free_chunks, + pg_get_backend_memory_contexts.used_bytes + FROM pg_get_backend_memory_contexts() pg_get_backend_memory_contexts(name, ident, parent, level, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes); pg_config| SELECT pg_config.name, pg_config.setting FROM pg_config() pg_config(name, setting); From 20729324078055a4d9654fc5af9570fe625786a5 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 19 Aug 2020 14:07:49 -0400 Subject: [PATCH 322/334] Suppress unnecessary RelabelType nodes in yet more cases. Commit a477bfc1d fixed eval_const_expressions() to ensure that it didn't generate unnecessary RelabelType nodes, but I failed to notice that some other places in the planner had the same issue. Really noplace in the planner should be using plain makeRelabelType(), for fear of generating expressions that should be equal() to semantically equivalent trees, but aren't. An example is that because canonicalize_ec_expression() failed to be careful about this, we could end up with an equivalence class containing both a plain Const, and a Const-with-RelabelType representing exactly the same value. So far as I can tell this led to no visible misbehavior, but we did waste a bunch of cycles generating and evaluating "Const = Const-with-RelabelType" to prove such entries are redundant. Hence, move the support function added by a477bfc1d to where it can be more generally useful, and use it in the places where planner code previously used makeRelabelType. Back-patch to v12, like the previous patch. 
While I have no concrete evidence of any real misbehavior here, it's certainly possible that I overlooked a case where equivalent expressions that aren't equal() could cause a user-visible problem. In any case carrying extra RelabelType nodes through planning to execution isn't very desirable. Discussion: https://postgr.es/m/1311836.1597781384@sss.pgh.pa.us --- src/backend/nodes/nodeFuncs.c | 75 ++++++++++++++++---- src/backend/optimizer/path/equivclass.c | 43 +++++------ src/backend/optimizer/prep/prepunion.c | 10 +-- src/backend/optimizer/util/clauses.c | 94 ++++++------------------- src/include/nodes/nodeFuncs.h | 3 + 5 files changed, 106 insertions(+), 119 deletions(-) diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index d85ca9f7c501..9ce8f43385ec 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -575,27 +575,76 @@ exprIsLengthCoercion(const Node *expr, int32 *coercedTypmod) return false; } +/* + * applyRelabelType + * Add a RelabelType node if needed to make the expression expose + * the specified type, typmod, and collation. + * + * This is primarily intended to be used during planning. Therefore, it must + * maintain the post-eval_const_expressions invariants that there are not + * adjacent RelabelTypes, and that the tree is fully const-folded (hence, + * we mustn't return a RelabelType atop a Const). If we do find a Const, + * we'll modify it in-place if "overwrite_ok" is true; that should only be + * passed as true if caller knows the Const is newly generated. + */ +Node * +applyRelabelType(Node *arg, Oid rtype, int32 rtypmod, Oid rcollid, + CoercionForm rformat, int rlocation, bool overwrite_ok) +{ + /* + * If we find stacked RelabelTypes (eg, from foo::int::oid) we can discard + * all but the top one, and must do so to ensure that semantically + * equivalent expressions are equal(). 
+ */ + while (arg && IsA(arg, RelabelType)) + arg = (Node *) ((RelabelType *) arg)->arg; + + if (arg && IsA(arg, Const)) + { + /* Modify the Const directly to preserve const-flatness. */ + Const *con = (Const *) arg; + + if (!overwrite_ok) + con = copyObject(con); + con->consttype = rtype; + con->consttypmod = rtypmod; + con->constcollid = rcollid; + /* We keep the Const's original location. */ + return (Node *) con; + } + else if (exprType(arg) == rtype && + exprTypmod(arg) == rtypmod && + exprCollation(arg) == rcollid) + { + /* Sometimes we find a nest of relabels that net out to nothing. */ + return arg; + } + else + { + /* Nope, gotta have a RelabelType. */ + RelabelType *newrelabel = makeNode(RelabelType); + + newrelabel->arg = (Expr *) arg; + newrelabel->resulttype = rtype; + newrelabel->resulttypmod = rtypmod; + newrelabel->resultcollid = rcollid; + newrelabel->relabelformat = rformat; + newrelabel->location = rlocation; + return (Node *) newrelabel; + } +} + /* * relabel_to_typmod * Add a RelabelType node that changes just the typmod of the expression. * - * This is primarily intended to be used during planning. Therefore, it - * strips any existing RelabelType nodes to maintain the planner's invariant - * that there are not adjacent RelabelTypes. + * Convenience function for a common usage of applyRelabelType. 
*/ Node * relabel_to_typmod(Node *expr, int32 typmod) { - Oid type = exprType(expr); - Oid coll = exprCollation(expr); - - /* Strip any existing RelabelType node(s) */ - while (expr && IsA(expr, RelabelType)) - expr = (Node *) ((RelabelType *) expr)->arg; - - /* Apply new typmod, preserving the previous exposed type and collation */ - return (Node *) makeRelabelType((Expr *) expr, type, typmod, coll, - COERCE_EXPLICIT_CAST); + return applyRelabelType(expr, exprType(expr), typmod, exprCollation(expr), + COERCE_EXPLICIT_CAST, -1, false); } /* diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index b99cec00cb7a..b68a5a0ec717 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -490,10 +490,6 @@ process_equivalence(PlannerInfo *root, * work to not label the collation at all in EC members, but this is risky * since some parts of the system expect exprCollation() to deliver the * right answer for a sort key.) - * - * Note this code assumes that the expression has already been through - * eval_const_expressions, so there are no CollateExprs and no redundant - * RelabelTypes. */ Expr * canonicalize_ec_expression(Expr *expr, Oid req_type, Oid req_collation) @@ -514,29 +510,24 @@ canonicalize_ec_expression(Expr *expr, Oid req_type, Oid req_collation) exprCollation((Node *) expr) != req_collation) { /* - * Strip any existing RelabelType, then add a new one if needed. This - * is to preserve the invariant of no redundant RelabelTypes. - * - * If we have to change the exposed type of the stripped expression, - * set typmod to -1 (since the new type may not have the same typmod - * interpretation). If we only have to change collation, preserve the - * exposed typmod. + * If we have to change the type of the expression, set typmod to -1, + * since the new type may not have the same typmod interpretation. + * When we only have to change collation, preserve the exposed typmod. 
+ */ + int32 req_typmod; + + if (expr_type != req_type) + req_typmod = -1; + else + req_typmod = exprTypmod((Node *) expr); + + /* + * Use applyRelabelType so that we preserve const-flatness. This is + * important since eval_const_expressions has already been applied. */ - while (expr && IsA(expr, RelabelType)) - expr = (Expr *) ((RelabelType *) expr)->arg; - - if (exprType((Node *) expr) != req_type) - expr = (Expr *) makeRelabelType(expr, - req_type, - -1, - req_collation, - COERCE_IMPLICIT_CAST); - else if (exprCollation((Node *) expr) != req_collation) - expr = (Expr *) makeRelabelType(expr, - req_type, - exprTypmod((Node *) expr), - req_collation, - COERCE_IMPLICIT_CAST); + expr = (Expr *) applyRelabelType((Node *) expr, + req_type, req_typmod, req_collation, + COERCE_IMPLICIT_CAST, -1, false); } return expr; diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 2ebd4ea33207..745f443e5c2d 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -1200,13 +1200,9 @@ generate_setop_tlist(List *colTypes, List *colCollations, * will reach the executor without any further processing. 
*/ if (exprCollation(expr) != colColl) - { - expr = (Node *) makeRelabelType((Expr *) expr, - exprType(expr), - exprTypmod(expr), - colColl, - COERCE_IMPLICIT_CAST); - } + expr = applyRelabelType(expr, + exprType(expr), exprTypmod(expr), colColl, + COERCE_IMPLICIT_CAST, -1, false); tle = makeTargetEntry((Expr *) expr, (AttrNumber) resno++, diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 7105d0a2db9a..750586fceb74 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -120,9 +120,6 @@ static Node *eval_const_expressions_mutator(Node *node, static bool contain_non_const_walker(Node *node, void *context); static bool ece_function_is_safe(Oid funcid, eval_const_expressions_context *context); -static Node *apply_const_relabel(Node *arg, Oid rtype, - int32 rtypmod, Oid rcollid, - CoercionForm rformat, int rlocation); static List *simplify_or_arguments(List *args, eval_const_expressions_context *context, bool *haveNull, bool *forceTrue); @@ -2819,12 +2816,13 @@ eval_const_expressions_mutator(Node *node, arg = eval_const_expressions_mutator((Node *) relabel->arg, context); /* ... and attach a new RelabelType node, if needed */ - return apply_const_relabel(arg, - relabel->resulttype, - relabel->resulttypmod, - relabel->resultcollid, - relabel->relabelformat, - relabel->location); + return applyRelabelType(arg, + relabel->resulttype, + relabel->resulttypmod, + relabel->resultcollid, + relabel->relabelformat, + relabel->location, + true); } case T_CoerceViaIO: { @@ -2971,12 +2969,13 @@ eval_const_expressions_mutator(Node *node, arg = eval_const_expressions_mutator((Node *) collate->arg, context); /* ... 
and attach a new RelabelType node, if needed */ - return apply_const_relabel(arg, - exprType(arg), - exprTypmod(arg), - collate->collOid, - COERCE_IMPLICIT_CAST, - collate->location); + return applyRelabelType(arg, + exprType(arg), + exprTypmod(arg), + collate->collOid, + COERCE_IMPLICIT_CAST, + collate->location, + true); } case T_CaseExpr: { @@ -3478,12 +3477,13 @@ eval_const_expressions_mutator(Node *node, cdomain->resulttype); /* Generate RelabelType to substitute for CoerceToDomain */ - return apply_const_relabel(arg, - cdomain->resulttype, - cdomain->resulttypmod, - cdomain->resultcollid, - cdomain->coercionformat, - cdomain->location); + return applyRelabelType(arg, + cdomain->resulttype, + cdomain->resulttypmod, + cdomain->resultcollid, + cdomain->coercionformat, + cdomain->location, + true); } newcdomain = makeNode(CoerceToDomain); @@ -3616,58 +3616,6 @@ ece_function_is_safe(Oid funcid, eval_const_expressions_context *context) return false; } -/* - * Subroutine for eval_const_expressions: apply RelabelType if needed - */ -static Node * -apply_const_relabel(Node *arg, Oid rtype, int32 rtypmod, Oid rcollid, - CoercionForm rformat, int rlocation) -{ - /* - * If we find stacked RelabelTypes (eg, from foo::int::oid) we can discard - * all but the top one, and must do so to ensure that semantically - * equivalent expressions are equal(). - */ - while (arg && IsA(arg, RelabelType)) - arg = (Node *) ((RelabelType *) arg)->arg; - - if (arg && IsA(arg, Const)) - { - /* - * If it's a Const, just modify it in-place; since this is part of - * eval_const_expressions, we want to end up with a simple Const not - * an expression tree. We assume the Const is newly generated and - * hence safe to modify. 
- */ - Const *con = (Const *) arg; - - con->consttype = rtype; - con->consttypmod = rtypmod; - con->constcollid = rcollid; - return (Node *) con; - } - else if (exprType(arg) == rtype && - exprTypmod(arg) == rtypmod && - exprCollation(arg) == rcollid) - { - /* Sometimes we find a nest of relabels that net out to nothing. */ - return arg; - } - else - { - /* Nope, gotta have a RelabelType. */ - RelabelType *newrelabel = makeNode(RelabelType); - - newrelabel->arg = (Expr *) arg; - newrelabel->resulttype = rtype; - newrelabel->resulttypmod = rtypmod; - newrelabel->resultcollid = rcollid; - newrelabel->relabelformat = rformat; - newrelabel->location = rlocation; - return (Node *) newrelabel; - } -} - /* * Subroutine for eval_const_expressions: process arguments of an OR clause * diff --git a/src/include/nodes/nodeFuncs.h b/src/include/nodes/nodeFuncs.h index 779906b9b77f..9cc56eecaa3a 100644 --- a/src/include/nodes/nodeFuncs.h +++ b/src/include/nodes/nodeFuncs.h @@ -36,6 +36,9 @@ typedef bool (*check_function_callback) (Oid func_id, void *context); extern Oid exprType(const Node *expr); extern int32 exprTypmod(const Node *expr); extern bool exprIsLengthCoercion(const Node *expr, int32 *coercedTypmod); +extern Node *applyRelabelType(Node *arg, Oid rtype, int32 rtypmod, Oid rcollid, + CoercionForm rformat, int rlocation, + bool overwrite_ok); extern Node *relabel_to_typmod(Node *expr, int32 typmod); extern Node *strip_implicit_coercions(Node *node); extern bool expression_returns_set(Node *clause); From 1fe1f42e3e85279e1cb8b004b3b076a04bde4cee Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 19 Aug 2020 18:19:52 -0700 Subject: [PATCH 323/334] Acquire ProcArrayLock exclusively in ProcArrayClearTransaction. This corrects an oversight by me in 20729324078, which made ProcArrayClearTransaction() increment xactCompletionCount. That requires an exclusive lock, obviously. 
There's other approaches that avoid the exclusive acquisition, but given that a 2PC commit is fairly heavyweight, it doesn't seem worth doing so. I've not been able to measure a performance difference, unsurprisingly. I did add a comment documenting that we could do so, should it ever become a bottleneck. Reported-By: Tom Lane Author: Andres Freund Discussion: https://postgr.es/m/1355915.1597794204@sss.pgh.pa.us --- src/backend/storage/ipc/procarray.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 51f8099cad2c..60b7a5db8e07 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -840,13 +840,20 @@ ProcArrayClearTransaction(PGPROC *proc) size_t pgxactoff; /* - * We can skip locking ProcArrayLock exclusively here, because this action - * does not actually change anyone's view of the set of running XIDs: our - * entry is duplicate with the gxact that has already been inserted into - * the ProcArray. But need it in shared mode for pgproc->pgxactoff to stay - * the same. + * Currently we need to lock ProcArrayLock exclusively here, as we + * increment xactCompletionCount below. We also need it at least in shared + * mode for pgproc->pgxactoff to stay the same below. + * + * We could however, as this action does not actually change anyone's view + * of the set of running XIDs (our entry is duplicate with the gxact that + * has already been inserted into the ProcArray), lower the lock level to + * shared if we were to make xactCompletionCount an atomic variable. But + * that doesn't seem worth it currently, as a 2PC commit is heavyweight + * enough for this not to be the bottleneck. 
If it ever becomes a + * bottleneck it may also be worth considering to combine this with the + * subsequent ProcArrayRemove() */ - LWLockAcquire(ProcArrayLock, LW_SHARED); + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); pgxactoff = proc->pgxactoff; From 0784c333728dd454b80c0bd0faec916782370810 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Thu, 20 Aug 2020 13:49:04 -0400 Subject: [PATCH 324/334] Revise REINDEX CONCURRENTLY recovery instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the leftover invalid index is "ccold", there's no need to re-run the command. Reword the instructions to make that explicit. Backpatch to 12, where REINDEX CONCURRENTLY appeared. Author: Álvaro Herrera Reviewed-by: Michael Paquier Reviewed-by: Julien Rouhaud Discussion: https://postgr.es/m/20200819211312.GA15497@alvherre.pgsql --- doc/src/sgml/ref/reindex.sgml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml index aac5d5be23f4..c16f223e4edb 100644 --- a/doc/src/sgml/ref/reindex.sgml +++ b/doc/src/sgml/ref/reindex.sgml @@ -307,7 +307,7 @@ REINDEX [ ( option [, ...] ) ] { IN - A new temporary index definition is added to the catalog + A new transient index definition is added to the catalog pg_index. This definition will be used to replace the old index. A SHARE UPDATE EXCLUSIVE lock at session level is taken on the indexes being reindexed as well as their @@ -383,13 +383,15 @@ Indexes: "idx_ccnew" btree (col) INVALID - The recommended recovery method in such cases is to drop the invalid index - and try again to perform REINDEX CONCURRENTLY. The - concurrent index created during the processing has a name ending in the - suffix ccnew, or ccold if it is an - old index definition which we failed to drop. Invalid indexes can be - dropped using DROP INDEX, including invalid toast - indexes. 
+ If the index marked INVALID is suffixed + ccnew, then it corresponds to the transient + index created during the concurrent operation, and the recommended + recovery method is to drop it using DROP INDEX, + then attempt REINDEX CONCURRENTLY again. + If the invalid index is instead suffixed ccold, + it corresponds to the original index which could not be dropped; + the recommended recovery method is to just drop said index, since the + rebuild proper has been successful. From c62a0a49f33a0d45a97aa1d3a5bc6ddc83f10d82 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 20 Aug 2020 12:59:00 -0700 Subject: [PATCH 325/334] Revert "Make vacuum a bit more verbose to debug BF failure." This reverts commit 49967da65aec970fcda123acc681f1df5d70bfc6. Enough time has passed that we can be confident that 07f32fcd23a resolved the issue. Therefore we can remove the temporary debugging aids. Author: Andres Freund Discussion: https://postgr.es/m/E1k7tGP-0005V0-5k@gemulon.postgresql.org --- src/backend/access/heap/heapam.c | 11 +---------- src/backend/access/heap/vacuumlazy.c | 7 ------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 8eb276e46449..9b5f417eac44 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -6048,16 +6048,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, TransactionIdIsInProgress(members[i].xid)) { /* running locker cannot possibly be older than the cutoff */ - if (TransactionIdPrecedes(members[i].xid, cutoff_xid)) - { - /* temporary on-bf debugging */ - elog(PANIC, "too old alive locker: multi: %u, member xid: %u, memb-current: %d, memb-progress: %d, cutoff: %u, cutoff-multi: %u, relfrozenxid: %u, relminmxid: %u", - multi, members[i].xid, - TransactionIdIsCurrentTransactionId(members[i].xid), - TransactionIdIsInProgress(members[i].xid), - cutoff_xid, cutoff_multi, - relfrozenxid, relminmxid); - } + 
Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid)); newmembers[nnewmembers++] = members[i]; has_lockers = true; } diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 03c8e1ff7ea9..44e2224dd557 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1350,14 +1350,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple) || params->index_cleanup == VACOPT_TERNARY_DISABLED) - { - /* temporary on-bf debugging */ - elog(LOG, "treating dead HOT tuple (updated %d, heap only: %d, index cleanup: %d) as alive", - HeapTupleIsHotUpdated(&tuple), HeapTupleIsHeapOnly(&tuple), - params->index_cleanup == VACOPT_TERNARY_DISABLED); - nkeep += 1; - } else tupgone = true; /* we can delete the tuple */ all_visible = false; From 8431d33079a2c552aaa223ebcfd470572d90146b Mon Sep 17 00:00:00 2001 From: David Rowley Date: Fri, 21 Aug 2020 09:33:56 +1200 Subject: [PATCH 326/334] Fix a few typos in JIT comments and README Reviewed-by: Abhijit Menon-Sen Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CAApHDvobgmCs6CohqhKTUf7D8vffoZXQTCBTERo9gbOeZmvLTw%40mail.gmail.com Backpatch-through: 11, where JIT was added --- src/backend/jit/README | 14 +++++++------- src/include/jit/llvmjit_emit.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/backend/jit/README b/src/backend/jit/README index e2fac8558e8e..5427bdf2153f 100644 --- a/src/backend/jit/README +++ b/src/backend/jit/README @@ -10,11 +10,11 @@ SQL expressions to evaluate an SQL predicate like WHERE a.col = 3, it is possible to generate a function than can be natively executed by the CPU that just handles that expression, yielding a speedup. -That this is done at query execution time, possibly even only in cases -where the relevant task is done a number of times, makes it JIT, -rather than ahead-of-time (AOT). 
Given the way JIT compilation is used -in PostgreSQL, the lines between interpretation, AOT and JIT are -somewhat blurry. +This is JIT, rather than ahead-of-time (AOT) compilation, because it +is done at query execution time, and perhaps only in cases where the +relevant task is repeated a number of times. Given the way JIT +compilation is used in PostgreSQL, the lines between interpretation, +AOT and JIT are somewhat blurry. Note that the interpreted program turned into a native program does not necessarily have to be a program in the classical sense. E.g. it @@ -99,7 +99,7 @@ Lifetimes of JITed functions are managed via JITContext. Exactly one such context should be created for work in which all created JITed function should have the same lifetime. E.g. there's exactly one JITContext for each query executed, in the query's EState. Only the -release of an JITContext is exposed to the provider independent +release of a JITContext is exposed to the provider independent facility, as the creation of one is done on-demand by the JIT implementations. @@ -231,7 +231,7 @@ needs to be referenced as an offset to one block of memory stored in an ExprState, rather than absolute pointers into memory. Once that is addressed, adding an LRU cache that's keyed by the -generated LLVM IR will allow to use optimized functions even for +generated LLVM IR will allow the usage of optimized functions even for faster queries. A longer term project is to move expression compilation to the planner diff --git a/src/include/jit/llvmjit_emit.h b/src/include/jit/llvmjit_emit.h index 1a7d6db7259e..3142df608b3c 100644 --- a/src/include/jit/llvmjit_emit.h +++ b/src/include/jit/llvmjit_emit.h @@ -1,6 +1,6 @@ /* * llvmjit_emit.h - * Helpers to make emitting LLVM IR a it more concise and pgindent proof. + * Helpers to make emitting LLVM IR a bit more concise and pgindent proof. 
* * Copyright (c) 2018-2020, PostgreSQL Global Development Group * From d259afa7365165760004c2fdbe2520a94ddf2600 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Fri, 21 Aug 2020 12:33:30 +0900 Subject: [PATCH 327/334] Fix typos in comments. Author: Masahiko Sawada Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/CA+fd4k4m9hFSrRLB3etPWO5_v5=MujVZWRtz63q+55hM0Dz25Q@mail.gmail.com --- src/backend/storage/ipc/procarray.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 60b7a5db8e07..45eab7e5a622 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -198,7 +198,7 @@ typedef struct ComputeXidHorizonsResult * be removed. * * This likely should only be needed to determine whether pg_subtrans can - * be truncated. It currently includes the effects of replications slots, + * be truncated. It currently includes the effects of replication slots, * for historical reasons. But that could likely be changed. */ TransactionId oldest_considered_running; @@ -207,7 +207,7 @@ typedef struct ComputeXidHorizonsResult * Oldest xid for which deleted tuples need to be retained in shared * tables. * - * This includes the effects of replications lots. If that's not desired, + * This includes the effects of replication slots. If that's not desired, * look at shared_oldest_nonremovable_raw; */ TransactionId shared_oldest_nonremovable; From 4f78b6a46503c8b28f38ec40503149d78ba47716 Mon Sep 17 00:00:00 2001 From: Viktor Kurilko Date: Mon, 4 May 2026 21:52:00 +0700 Subject: [PATCH 328/334] Resolve conflicts in the catversion.h (#2443) Commit 3e98c0bafb28de87ae095b341687dc082371af54 updates the catalog version, but we use a different format. 
--- src/include/catalog/catversion.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 972fd7982228..add1503e3859 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -55,12 +55,7 @@ * catalog versions from Greenplum. */ -<<<<<<< HEAD /* 3yyymmddN */ -#define CATALOG_VERSION_NO 302604171 -======= -/* yyyymmddN */ -#define CATALOG_VERSION_NO 202008191 ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 +#define CATALOG_VERSION_NO 302605041 #endif From 87d9e1568d497e466f2c7b3608d6e4695e2c44e7 Mon Sep 17 00:00:00 2001 From: Viktor Kurilko Date: Wed, 6 May 2026 10:56:47 +0700 Subject: [PATCH 329/334] Resolve conflicts in the snapshot.h (#2447) Commit 623a9ba79bbdd11c5eccb30b8bd5c446130e521c added a new field to the SnapshotData struct, but earlier commit 3b4cd7887fd16542339ae9cb13df252d7c58fc11 added a field nearby. --- src/include/utils/snapshot.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 93b7114dcae2..7e2772ab0a6b 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -212,18 +212,17 @@ typedef struct SnapshotData XLogRecPtr lsn; /* position in the WAL stream when taken */ /* -<<<<<<< HEAD * GP: Global information about which transactions are visible for a * distributed transaction, with cached local xids */ DistributedSnapshotWithLocalMapping distribSnapshotWithLocalMapping; -======= + + /* * The transaction completion count at the time GetSnapshotData() built * this snapshot. Allows to avoid re-computing static snapshots when no * transactions completed since the last GetSnapshotData(). 
*/ uint64 snapXactCompletionCount; ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 } SnapshotData; #endif /* SNAPSHOT_H */ From 4c3633beb77cd7d404186910c4be219590bcd2c0 Mon Sep 17 00:00:00 2001 From: Viktor Kurilko Date: Wed, 6 May 2026 11:27:00 +0700 Subject: [PATCH 330/334] Resolve conflicts src/backend/storage/lmgr/lwlocknames.txt (#2442) Commit 566372b3d6435639e4cc4476d79b8505a0297c87 added new locks to the lwlocknames.txt, while earlier commit 19cd1cf4b68faff2e29bc2fa884c480e4644cdb4 added gpdb specific locks to the same place. --- src/backend/storage/lmgr/lwlocknames.txt | 36 +++++++++++------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 8289c6993b7c..77fdf74695f7 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -50,26 +50,22 @@ MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 LogicalRepWorkerLock 43 XactTruncationLock 44 -<<<<<<< HEAD - -# Additional individual locks in GPDB -SharedSnapshotLock 45 -DistributedLogControlLock 46 -# 47 is available; was formerly AOSegFileLock -ResQueueLock 48 -ResGroupLock 49 -ErrorLogLock 50 -SessionStateLock 51 -RelfilenodeGenLock 52 -WorkFileManagerLock 53 -DistributedLogTruncateLock 54 -TwophaseCommitLock 55 -ShareInputScanLock 56 -FTSReplicationStatusLock 57 -GxidBumpLock 58 -ParallelCursorEndpointLock 59 -======= # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 + +# Additional individual locks in GPDB +SharedSnapshotLock 48 +DistributedLogControlLock 49 +ResQueueLock 50 +ResGroupLock 51 +ErrorLogLock 52 +SessionStateLock 53 +RelfilenodeGenLock 54 +WorkFileManagerLock 55 +DistributedLogTruncateLock 56 +TwophaseCommitLock 57 +ShareInputScanLock 58 +FTSReplicationStatusLock 59 +GxidBumpLock 60 +ParallelCursorEndpointLock 61 From 
bae42457e5125ed93bb1e181bcf2bf2a80b14fb4 Mon Sep 17 00:00:00 2001 From: Maxim Michkov Date: Wed, 6 May 2026 10:53:55 +0300 Subject: [PATCH 331/334] Resolve conflicts in src/backend/commands/tablecmds.c (#2460) Commit e3931d01f3afef14703827eda1dad0a3fb3b5d07 added new local variables while commit 07c2f0112adc8c682fcf330b6beef0197d210834 added another variable nearby. --- src/backend/commands/tablecmds.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b117cf3f0b85..de6f9993d85d 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -7579,12 +7579,9 @@ ATExecAddColumn(List **wqueue, AlteredTableInfo *tab, Relation rel, AlterTableCmd *childcmd; AclResult aclresult; ObjectAddress address; -<<<<<<< HEAD List* enc; -======= TupleDesc tupdesc; FormData_pg_attribute *aattr[] = {&attribute}; ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 /* At top level, permission check was done in ATPrepCmd, else do it */ if (recursing) From a732b1e754b3bdbb74b230de5516337cdb9f9802 Mon Sep 17 00:00:00 2001 From: Maxim Michkov Date: Wed, 6 May 2026 10:54:15 +0300 Subject: [PATCH 332/334] Resolve conflicts in src/backend/commands/opclasscmds.c (#2459) Commit 9f9682783bea74bf8d93cac4f7dd65fa677f5dc7 removed header opfam_internal.h, however there was GPDB-specific include nearby. 
--- src/backend/commands/opclasscmds.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index 5c92cad7d7cf..97e4a0fbe7e7 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -27,11 +27,6 @@ #include "catalog/dependency.h" #include "catalog/indexing.h" #include "catalog/objectaccess.h" -<<<<<<< HEAD -#include "catalog/oid_dispatch.h" -#include "catalog/opfam_internal.h" -======= ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 #include "catalog/pg_am.h" #include "catalog/pg_amop.h" #include "catalog/pg_amproc.h" @@ -54,6 +49,7 @@ #include "utils/rel.h" #include "utils/syscache.h" +#include "catalog/oid_dispatch.h" #include "cdb/cdbvars.h" #include "cdb/cdbdisp_query.h" From 33bce61d133eba8edb7fdafbdd47c28015a722ee Mon Sep 17 00:00:00 2001 From: Georgy Shelkovy Date: Wed, 6 May 2026 13:30:27 +0500 Subject: [PATCH 333/334] Resolve conflicts in src/include/access/nbtree.h (#2448) Commit 9f96827 added a definition of the new function btadjustmembers to src/include/access/nbtree.h, while earlier commit 38d8815 had already added a definition of the function btree_or_bitmap_validate to the same location. 
--- src/include/access/nbtree.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 22df078571c0..ec6ba6072fe0 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1141,14 +1141,11 @@ extern bool _bt_allequalimage(Relation rel, bool debugmessage); * prototypes for functions in nbtvalidate.c */ extern bool btvalidate(Oid opclassoid); -<<<<<<< HEAD extern bool btree_or_bitmap_validate(Oid opclassoid, const char *amname); -======= extern void btadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions); ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 /* * prototypes for functions in nbtsort.c From 02a1c00366db8589ce76c88cbc0a5bb536f487db Mon Sep 17 00:00:00 2001 From: Georgy Shelkovy Date: Wed, 6 May 2026 13:31:07 +0500 Subject: [PATCH 334/334] Resolve conflicts in configure.ac (#2440) Commit 25244b8 renamed configure.in to configure.ac, although earlier GPDB-specific commits had already added the dnl prefix in the same location. --- configure.ac | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/configure.ac b/configure.ac index 5102af0ae73d..c545328648ac 100644 --- a/configure.ac +++ b/configure.ac @@ -24,17 +24,10 @@ AC_INIT([Greenplum Database], [8.0.0-alpha.0], [support@greenplum.org], [], [htt [PG_PACKAGE_VERSION=14alpha0] AC_SUBST(PG_PACKAGE_VERSION) -<<<<<<< HEAD:configure.in dnl m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required. dnl Untested combinations of 'autoconf' and PostgreSQL versions are not -dnl recommended. You can remove the check from 'configure.in' but it is then +dnl recommended. You can remove the check from 'configure.ac' but it is then dnl your responsibility whether the result works or not.])]) -======= -m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required. 
-Untested combinations of 'autoconf' and PostgreSQL versions are not -recommended. You can remove the check from 'configure.ac' but it is then -your responsibility whether the result works or not.])]) ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600:configure.ac AC_COPYRIGHT([Copyright (c) 1996-2020, PostgreSQL Global Development Group]) AC_CONFIG_SRCDIR([src/backend/access/common/heaptuple.c]) AC_CONFIG_AUX_DIR(config)